## Setup the environment

In [None]:
import numpy as np 
import pandas as pd 

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as  plt
# static images of your plot embedded in the notebook
%matplotlib inline  


from pandas import read_csv
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



## Loading data



In [None]:
# Read the competition files; the first CSV column becomes the row index.
X = pd.read_csv("train.csv", index_col=0)
X_test = pd.read_csv("test.csv", index_col=0)

# Quick sanity check: (rows, columns) of each frame.
print(f'Training data shape: {X.shape}')
print(f'Testing data shape: {X_test.shape}')

In [None]:
# Preview the first rows of the training frame.
X.head()

In [None]:
# Preview the first rows of the test frame.
X_test.head()

In [None]:
def rstr(X, pred):
    """Build a per-column summary of ``X`` and print its dtype distribution.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to profile. Non-numeric columns are allowed; their skewness,
        kurtosis and correlation entries are NaN.
    pred : str or None
        Name of a numeric column of ``X``. When given, an extra column
        ``'corr ' + pred`` holds each numeric column's correlation with it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the dtype, non-null count, null
        count, missing percentage, number of unique values, the unique values
        themselves, skewness and kurtosis (plus the correlation column when
        ``pred`` is not None).
    """
    rows = X.shape[0]
    types = X.dtypes
    counts = X.count()
    # Build the Series explicitly: DataFrame.apply would hand back a DataFrame
    # (not a Series of arrays) whenever every column happens to have the same
    # number of unique values, which breaks the column renaming below.
    uniques = pd.Series(
        [column.unique() for _, column in X.items()],
        index=X.columns,
        dtype=object,
    )
    # dropna=False so NaN counts as one value, exactly like unique() does.
    uniques_count = X.nunique(dropna=False)
    nulls = X.isnull().sum()
    missing_rate = (nulls / rows) * 100
    # numeric_only=True: string columns (e.g. a label column) would otherwise
    # raise on pandas >= 2.0 instead of being skipped.
    skewness = X.skew(numeric_only=True)
    kurtosis = X.kurt(numeric_only=True)

    parts = [types, counts, nulls, missing_rate, uniques_count, uniques, skewness, kurtosis]
    if pred is None:
        cols = ['types', 'counts', 'nulls', 'missing rate', 'unique value count', 'unique value', 'skewness', 'kurtosis']
    else:
        # NOTE(review): this branch names the column 'missing_rate' while the
        # branch above uses 'missing rate'; kept as-is because the names are
        # part of the returned frame.
        cols = ['types', 'counts', 'nulls', 'missing_rate', 'unique value count', 'unique value', 'skewness', 'kurtosis', 'corr '  + pred]
        # Correlate numeric columns only, for the same reason as skew/kurt.
        parts.append(X.select_dtypes(include='number').corr()[pred])

    values = pd.concat(parts, axis=1, sort=False)
    values.columns = cols
    print('___________________________\nData types:\n',  values['types'].value_counts())
    print('___________________________')
    return values

# Profile the training frame (no correlation column) and list the columns
# with the largest share of missing values first.
details = rstr(X, pred=None)
display(details.sort_values('missing rate', ascending=False))

In [None]:
# Skewness of every numeric column. numeric_only=True skips non-numeric
# columns (which raise on pandas >= 2.0), and the x labels are taken from the
# result itself so they always line up with the plotted values.
feature_skew = X.skew(numeric_only=True)
plt.figure(figsize=(16, 8))
plt.xticks(rotation=90)  # column names are long; rotate them to stay readable
sns.scatterplot(x=feature_skew.index, y=feature_skew.values)

### Skewness between -1 and 1 is considered normal; the features fall outside that range, so all of them are highly skewed

In [None]:
# Kurtosis of every numeric column. numeric_only=True skips non-numeric
# columns (which raise on pandas >= 2.0), and the x labels are taken from the
# result itself so they always line up with the plotted values.
feature_kurtosis = X.kurtosis(numeric_only=True)
plt.figure(figsize=(16, 8))
plt.xticks(rotation=90)  # column names are long; rotate them to stay readable
sns.scatterplot(x=feature_kurtosis.index, y=feature_kurtosis.values)

### Kurtosis below 3 is considered normal; the features exceed that, so we need to normalize the data

In [None]:
# Same per-column summary for the test frame (no correlation column).
details = rstr(X_test, None)
display(details)

## Minimal data processing before building baseline models

### Splitting the training data into training and validation sets
Since this is a classification problem, let's make sure each class has a reasonable number of samples in both the training and validation sets by using a stratified train-test split.


In [None]:
# Number of rows per value of the 'target' column.
X.groupby(by='target').size()

In [None]:
# Separate the label from the features. pop() removes 'target' from X in
# place, so X holds features only from here on.
y = X.pop('target')
# Hold out 33% of the rows for validation; stratify=y keeps the class
# proportions the same in both parts and random_state fixes the shuffle.
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_train.shape, X_validate.shape, y_train.shape, y_validate.shape)

In [None]:
# Check the number of samples per class in the training split to make sure the split is reasonable.
temp = pd.concat([X_train, y_train], axis=1)
temp.groupby(by='target').size()

### Encode the target

A simple label encoding for the train data.

In [None]:
# Use the last character of each class label as its encoding.
# NOTE(review): presumably the labels look like 'Class_1'..'Class_9' -- confirm;
# with ten or more classes the last character alone would no longer be unique.
y_train_enc = y_train.apply(lambda x: str(x)[-1])
print(y_train)
print(y_train_enc)

In [None]:
# One-hot encode the validation target, because the competition's evaluation metric requires it.
y_validate_enc = pd.get_dummies(y_validate)
print(y_validate_enc)

### Define scoring function based on competition documentation

$$\text{log loss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\,\log(p_{ij})$$


In [None]:
# log loss=−1N∑i=1N∑j=1Myijlog(pij)
# each row is divided by the row sum
# In order to avoid the extremes of the log function, predicted probabilities are replaced with max(min(p,1−10−15),10−15)
def get_logloss(prediction):
    prediction = pd.concat([pd.DataFrame(X_validate.index), pd.DataFrame(prediction)], axis=1).set_index('id')
    prediction.columns = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']

    prediction_prelog = prediction.applymap(lambda x: max(min(x, 1-10**-15),10**-15))
    prediction_scaled = prediction_prelog/prediction_prelog.sum(axis=1)[:,None]
    prediction_log = prediction_scaled.applymap(lambda x: np.log(x))

    logloss = -prediction_log.multiply(y_validate_enc).values.sum()/result.shape[0]


    return logloss

## Select three models to build a baseline

*Decision Tree*

*Naive Bayes*

*Gradient Boost*



### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree; min_samples_leaf=600 requires at least 600 training
# samples in every leaf, random_state=0 makes the fit reproducible.
model = DecisionTreeClassifier(random_state=0, min_samples_leaf=600)
model.fit(X_train, y_train_enc)
# Predicted class probabilities for every validation row.
dt_prediction = model.predict_proba(X_validate)
print(dt_prediction)

In [None]:
# Score the decision-tree probabilities with the competition log-loss helper.
get_logloss(dt_prediction)

# 10.43762174509599 
# First result with first model - Decision Tree(sample leaf set to 10)

# 1.9677494343476694
# Changed sample leaf to 100

# 1.8276959878207062
# Changed sample leaf to 200

# 1.8107869385833477
# Changed sample leaf to 300

# 1.8011824957234837
# Changed sample leaf to 400

# 1.7999674006491755
# sample leaf 500

# 1.798511069434078
# sample leaf 600

# 1.79997749164544
# sample leaf 700      <------------- improvement stopped  

### Naive Bayes


In [None]:
# Train a Gaussian naive Bayes classifier and keep its validation-set class
# probabilities. The fitted estimator stays available as `gnb` (previously
# `gnb` was left unfitted and a second, throwaway instance did the work).
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train_enc)
gnb_prediction = gnb.predict_proba(X_validate)
print(gnb_prediction)

In [None]:
# Score the GaussianNB probabilities with the competition log-loss helper.
get_logloss(gnb_prediction)

# 13.232232141881067
# first score for GaussianNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes model (fit() returns the estimator itself)
# and keep its validation-set class probabilities.
clf = MultinomialNB().fit(X_train, y_train_enc)
mnb_prediction = clf.predict_proba(X_validate)
print(mnb_prediction)

In [None]:
# Score the MultinomialNB probabilities with the competition log-loss helper.
get_logloss(mnb_prediction)

# 4.75101374812709
# First score for MultinomialNB

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees baseline; random_state=0 keeps the fit reproducible.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=10,
    random_state=0,
)
clf.fit(X_train, y_train_enc)
# Predicted class probabilities for every validation row.
gb_prediction = clf.predict_proba(X_validate)


In [None]:
# Score the gradient-boosting probabilities with the competition log-loss helper.
get_logloss(gb_prediction)

# 4.131055844907932
# first score with n_estimators=100, learning_rate=1.0, max_depth=10

=================================================================================================================================================

## Data Engineering

### Skewness and Kurtosis
According to our overview after loading the data, all features are highly skewed with high kurtosis. Let's address these first.


In [None]:
# One 30-bin histogram per column of X, without grid lines, to eyeball the distributions.
X.hist(grid=False, figsize=(30, 20), bins=30)

In [None]:
# Skewness and kurtosis of every column, transposed so each column of X becomes a row.
X.agg(['skew', 'kurtosis']).transpose()

In [None]:
from scipy.stats import boxcox

# Box-Cox transformation of every column of X, to inspect the effect visually.
# Shift every value by +1 first, since neither log nor Box-Cox works with 0
# (assumes the raw values are non-negative -- TODO confirm).
X_normalization_temp = X + 1
for col in X_normalization_temp.columns:
    # boxcox() returns (transformed values, fitted lambda); keep the values only.
    X_normalization_temp[col] = boxcox(X_normalization_temp[col])[0]

X_normalization_temp.hist(grid=False, figsize=(30, 20), bins=30)

In [None]:
# Box-Cox transform the training features, one column at a time.
# The +1 shift keeps zeros out of the transform
# (assumes the raw values are non-negative -- TODO confirm).
X_train_normalization = X_train + 1
for col in X_train_normalization.columns:
    # boxcox() fits a lambda for this column and returns (values, lambda);
    # only the transformed values are kept.
    X_train_normalization[col] = boxcox(X_train_normalization[col])[0]

X_train_normalization.hist(grid=False, figsize=(30, 20), bins=30)

In [None]:
# Box-Cox transform the validation features with the lambda fitted on the
# TRAINING split, so validation rows go through the same transform as the
# data the models are trained on. Fitting a separate lambda on the validation
# split would put the two splits on different scales.
X_validate_normalization = X_validate + 1
for col in X_validate_normalization.columns:
    # Fit on the shifted training column only; element [1] is the fitted lambda.
    train_lambda = boxcox(X_train[col] + 1)[1]
    # With lmbda supplied, boxcox() returns just the transformed array.
    X_validate_normalization[col] = boxcox(X_validate_normalization[col], lmbda=train_lambda)

X_validate_normalization.hist(grid=False, figsize=(30, 20), bins=30)

In [None]:
# Box-Cox transform the test features with the lambda fitted on the TRAINING
# split, so test rows go through the same transform as the data the models
# are trained on. Fitting a separate lambda on the test set would put it on a
# different scale.
X_test_normalization = X_test + 1
for col in X_test_normalization.columns:
    # Fit on the shifted training column only; element [1] is the fitted lambda.
    train_lambda = boxcox(X_train[col] + 1)[1]
    # With lmbda supplied, boxcox() returns just the transformed array.
    X_test_normalization[col] = boxcox(X_test_normalization[col], lmbda=train_lambda)

X_test_normalization.hist(grid=False, figsize=(30, 20), bins=30)

In [None]:
# Decision tree re-trained on the Box-Cox transformed features.
dt_model = DecisionTreeClassifier(random_state=0, min_samples_leaf=600)
dt_model.fit(X_train_normalization, y_train_enc)
# Predict with the tree fitted just above. The earlier `model` was trained on
# the untransformed features and must not be used on transformed inputs.
dt_prediction = dt_model.predict_proba(X_validate_normalization)
print(dt_prediction)

In [None]:
# Score the decision-tree probabilities with the competition log-loss helper.
get_logloss(dt_prediction)

# 1.813381650885666
# This is not a better score compare to pre-normalization which makes sense since decision tree models don't require data normalization

In [None]:
# Gaussian naive Bayes re-trained on the Box-Cox transformed features. The
# fitted estimator stays available as `gnb` (previously `gnb` was left
# unfitted and a second, throwaway instance did the work).
gnb = GaussianNB()
gnb.fit(X_train_normalization, y_train_enc)
gnb_prediction = gnb.predict_proba(X_validate_normalization)
print(gnb_prediction)

In [None]:
# Score the GaussianNB probabilities with the competition log-loss helper.
get_logloss(gnb_prediction)

# 5.2309914125829176
# The GaussianNB model has a big improvement (compare to 13.232232141881067)

In [None]:
# Multinomial naive Bayes re-trained on the Box-Cox transformed features
# (fit() returns the estimator itself).
clf = MultinomialNB().fit(X_train_normalization, y_train_enc)
mnb_prediction = clf.predict_proba(X_validate_normalization)
print(mnb_prediction)

In [None]:
# Score the MultinomialNB probabilities with the competition log-loss helper.
get_logloss(mnb_prediction)

# 1.784880528011906
# the multinomialNB also has big improvement after normalization (compare to 4.75101374812709)

This is added for source control testing
