## Data Pre-processing

In [None]:
# Import Modules

import math
import data_loader
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import sklearn.linear_model as linear_model
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [None]:
# Data Preparation
#
# Loads the three splits, tags each row with the split it came from, stacks
# them into one frame (`combined`) so cleaning and encoding see the same
# categories, converts the label to 0/1 and drops rows with missing values.

# Column names assigned to the combined frame (label column is 'salary')
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'salary',
]

# Load and split data by data_loader
train, validation = data_loader.load_train_data('data/adult.data')
test = data_loader.load_test_data('data/adult.test')

# Combined data to consolidate features; the 'train'/'test' flag columns
# record the origin of each row so the splits can be recovered later.
train['train'], train['test'] = 1, 0
validation['train'], validation['test'] = 0, 0
test['train'], test['test'] = 0, 1
combined = pd.concat([train, validation, test])
combined.columns = column_names + ['train', 'test']

# Convert label to numerical binary data. Both spellings of each label
# (with and without a trailing '.') map to the same class.
combined['salary'] = combined['salary'].replace({
    ' <=50K.': 0,
    ' <=50K': 0,
    ' >50K.': 1,
    ' >50K': 1,
})

# Clean garbage: treat ' ?' as missing and drop every row containing one.
# The explicit int cast does not rely on `replace` silently downcasting the
# object column (deprecated in recent pandas); without it the labels can stay
# object-dtype, which scikit-learn classifiers reject.
combined = (
    combined
    .replace(' ?', np.nan)
    .dropna(how='any')
    .assign(salary=lambda frame: frame['salary'].astype(int))
)

print(combined.shape)
combined.sample(n=3)

In [None]:
# Split X: every column except the 'salary' label (split flags are kept for now)
X_combined = combined.drop(columns='salary')
print(X_combined.shape)

# Split Y: the label plus the two split flags
Y_combined = combined.loc[:, ['salary', 'train', 'test']]
print(Y_combined.shape)

# Row masks built from the (train, test) flag pair written during data preparation
train_is_1 = Y_combined['train'].eq(1)
train_is_0 = Y_combined['train'].eq(0)
test_is_1 = Y_combined['test'].eq(1)
test_is_0 = Y_combined['test'].eq(0)

Y_train = Y_combined[train_is_1 & test_is_0].copy()
Y_valid = Y_combined[train_is_0 & test_is_0].copy()
Y_test = Y_combined[train_is_0 & test_is_1].copy()

In [None]:
# Plot feature distribution

def plot_feature_distribution(df):
    """Draw one distribution subplot per feature column of ``df``.

    Object-dtype columns are drawn as value-count bar charts, every other
    column as a histogram. The 'salary', 'train' and 'test' columns are
    skipped. The subplot grid is five columns wide and sized from the number
    of columns actually plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    skipped = ('salary', 'train', 'test')
    feature_columns = [column for column in df.columns if column not in skipped]

    fig = plt.figure(figsize=(20, 15))
    cols = 5
    # Size the grid from the plotted columns only, so skipped columns neither
    # add an empty row nor leave holes in the grid.
    rows = max(1, math.ceil(len(feature_columns) / cols))

    for position, column in enumerate(feature_columns, start=1):
        ax = fig.add_subplot(rows, cols, position)
        ax.set_title(column)
        # `np.object` was removed in NumPy 1.24; the builtin `object` is the
        # same dtype and works on every NumPy version.
        if df.dtypes[column] == object:
            # pandas takes the target axes through `ax=`, not `axes=`
            df[column].value_counts().plot(kind="bar", ax=ax)
        else:
            df[column].hist(ax=ax)
            ax.tick_params(axis="x", labelrotation=90)
        ax.grid(True)

    fig.subplots_adjust(hspace=0.7, wspace=0.2)
    plt.show()

# Plot
plot_feature_distribution(X_combined)

In [None]:
# Feature Modification (optional experiment; the flag below disables it)
FEAT_MOD = False

if FEAT_MOD:
    X_combined_m = X_combined.copy()

    # Collapse 'native-country' into a binary flag:
    # 1 where the value is ' United-States', 0 for everything else.
    from_us = X_combined_m['native-country'] == ' United-States'
    X_combined_m['native-country'] = from_us.astype(int)

    # Remove the 'education-num' column from the modified copy
    X_combined_m = X_combined_m.drop(columns='education-num')

    print(X_combined_m.shape)
    X_combined_m.head()

In [None]:
# Encoding

def number_encode_features(df):
    """Return a copy of ``df`` with every object-dtype column label-encoded.

    Each object-dtype column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column. The 'train' and 'test'
    columns and all non-object columns are left untouched. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df`` with the same columns and index.
    """
    result = df.copy()
    for column in result.columns:
        if column in ('train', 'test'):
            continue
        # `np.object` was removed in NumPy 1.24; compare against the builtin
        # `object`, which is the same dtype on every NumPy version.
        if result.dtypes[column] == object:
            # Fit label encoder and transform labels to normalized encoding
            result[column] = LabelEncoder().fit_transform(result[column])
    return result

# Numerical encoding (feature labeling): each object column becomes integer codes
X_factorized = number_encode_features(df=X_combined)

# One-hot encoding (feature spanning): each object column becomes indicator columns
X_encoded = pd.get_dummies(data=X_combined)

In [None]:
# Normalizing Data

# First check ranges of each feature
# def summerize_data(df):
#     for column in df.columns:
#         print(column)
#         if df.dtypes[column] == np.object: # Categorical data
#             print(df[column].value_counts())
#         else:
#             print(df[column].describe() )
#         print('\n')
# summerize_data(X_factorized)


# scaler = StandardScaler()

# Normalized factorized data scales
# factorized_norm = pd.DataFrame(
#     scaler.fit_transform(factorized.astype(float)), 
#     columns=factorized.columns
# )
# print(factorized_norm.shape)
# Normalized all data scales
# X_factorized_norm = X_factorized.copy()
# col_names = X_factorized_norm.columns[:-2]
# features = X_factorized_norm[col_names]
# features = scaler.fit_transform(features.values.astype(float))
# X_factorized_norm[col_names] = features


# Normalized encoded data scales
# encoded_norm = pd.DataFrame(
#     scaler.fit_transform(encoded.astype(float)), 
#     columns=encoded.columns
# )
# print(encoded_norm.shape)

In [None]:
# Split data into X and Y with Train, Validation and Test
#
# Suffix `_n` = label-encoded features (X_factorized),
# suffix `_o` = one-hot encoded features (X_encoded).

FLAG_COLUMNS = ["train", "test"]


def _select_split(frame, train_flag, test_flag):
    """Return the rows of ``frame`` whose 'train'/'test' flags match, without the flag columns."""
    mask = (frame['train'] == train_flag) & (frame['test'] == test_flag)
    return frame[mask].drop(columns=FLAG_COLUMNS)


X_train_n = _select_split(X_factorized, 1, 0)
X_valid_n = _select_split(X_factorized, 0, 0)
X_test_n = _select_split(X_factorized, 0, 1)
X_train_o = _select_split(X_encoded, 1, 0)
X_valid_o = _select_split(X_encoded, 0, 0)
X_test_o = _select_split(X_encoded, 0, 1)

X_list = [
    X_train_n, X_valid_n, X_test_n,
    X_train_o, X_valid_o, X_test_o,
]

# The Y frames were created in an earlier cell. Dropping the flags into new
# frames with errors='ignore' keeps this cell re-runnable: the previous
# in-place drop raised KeyError the second time the cell was executed.
Y_train = Y_train.drop(columns=FLAG_COLUMNS, errors='ignore')
Y_valid = Y_valid.drop(columns=FLAG_COLUMNS, errors='ignore')
Y_test = Y_test.drop(columns=FLAG_COLUMNS, errors='ignore')
Y_list = [Y_train, Y_valid, Y_test]

# Report the shape of every split on one line per group
for x in X_list:
    print(x.shape, end=', ')
print('')

for y in Y_list:
    print(y.shape, end=', ')
print('')

In [None]:
# Plot Correlation and Importance of Data Based on Models

# Left panel: logistic-regression coefficients; right panel: gradient-boosting
# feature importances. Both models are fitted on the label-encoded features.
fig, (ax_coef, ax_importance) = plt.subplots(1, 2, figsize=(18, 6))

lr = LogisticRegression(solver='lbfgs', max_iter=300)
lr.fit(X_train_n, Y_train.iloc[:, 0])
coefs1 = pd.Series(lr.coef_[0], index=X_train_n.columns)
coefs1.sort_values().plot(kind="bar", ax=ax_coef)

gb = GradientBoostingClassifier()
gb.fit(X_train_n, Y_train.iloc[:, 0])
importance = gb.feature_importances_
coefs2 = pd.Series(importance, index=X_train_n.columns)
coefs2.sort_values().plot(kind="bar", ax=ax_importance)
plt.show()

# Plot using data encoded with one-hot: only the 20 largest coefficients are drawn
lr = LogisticRegression(solver='lbfgs', max_iter=300)
lr.fit(X_train_o, Y_train.iloc[:, 0])
fig, ax_top = plt.subplots(figsize=(20, 6))
coefs1 = pd.Series(lr.coef_[0], index=X_train_o.columns)
coefs1.sort_values().tail(20).plot(kind="bar", ax=ax_top)
plt.show()

----

### Model basic

See how each model performs with its default settings and no parameter tuning. No unimportant features have been removed yet.

In [None]:
# Display names of the four models; the baseline cells index into this list.
model_names = [
    'LR',
    'GaussianNB',
    'Gradient Boosting',
    'Neural Network',
]

In [None]:
# Logistic Regression (baseline, one-hot encoded features)
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

# Fit once on the training split. The same fitted model scores both the
# validation and the test split; the earlier second fit before the test
# predictions only repeated the work.
lr.fit(X_train_o, Y_train.iloc[:,0])

# Predict validation data
v_lr_pred = lr.predict(X_valid_o)
v_lr_probs = lr.predict_proba(X_valid_o)[:,1]  # column 1 = second entry of lr.classes_
v_lr_acc_score = accuracy_score(Y_valid.iloc[:,0], v_lr_pred)
v_lr_auc_score = roc_auc_score(Y_valid.iloc[:,0], v_lr_probs)
print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(model_names[0], v_lr_acc_score, v_lr_auc_score))

# Predict test data
t_lr_pred = lr.predict(X_test_o)
t_lr_probs = lr.predict_proba(X_test_o)[:,1]
t_lr_acc_score = accuracy_score(Y_test.iloc[:,0], t_lr_pred)
t_lr_auc_score = roc_auc_score(Y_test.iloc[:,0], t_lr_probs)
print('Test - Model: {}, Accuracy: {}, AUC:{}\n'.format(model_names[0], t_lr_acc_score, t_lr_auc_score))

In [None]:
# Gaussian Naive Bayes (baseline, one-hot encoded features)

nb = GaussianNB()

# Fit once on the training split; the same fitted model scores validation
# and test (the earlier second fit before the test predictions was redundant).
nb.fit(X_train_o, Y_train.iloc[:,0])

# Predict validation data
v_nb_pred = nb.predict(X_valid_o)
v_nb_probs = nb.predict_proba(X_valid_o)[:,1]  # column 1 = second entry of nb.classes_
v_nb_acc_score = accuracy_score(Y_valid.iloc[:,0], v_nb_pred)
v_nb_auc_score = roc_auc_score(Y_valid.iloc[:,0], v_nb_probs)
print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(model_names[1], v_nb_acc_score, v_nb_auc_score))

# Predict test data
t_nb_pred = nb.predict(X_test_o)
t_nb_probs = nb.predict_proba(X_test_o)[:,1]
t_nb_acc_score = accuracy_score(Y_test.iloc[:,0], t_nb_pred)
t_nb_auc_score = roc_auc_score(Y_test.iloc[:,0], t_nb_probs)
print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(model_names[1], t_nb_acc_score, t_nb_auc_score))

In [None]:
# Gradient Boosting - no tuning

gb = GradientBoostingClassifier()

# Fit once on the training split so the validation and test scores describe
# the same fitted model. Previously the estimator was refit before the test
# predictions, which doubled the training time and, with no random_state set,
# did not guarantee the same model.
gb.fit(X_train_o, Y_train.iloc[:,0])

# Predict validation data
v_gb_pred = gb.predict(X_valid_o)
v_gb_probs = gb.predict_proba(X_valid_o)[:,1]  # column 1 = second entry of gb.classes_
v_gb_acc_score = accuracy_score(Y_valid.iloc[:,0], v_gb_pred)
v_gb_auc_score = roc_auc_score(Y_valid.iloc[:,0], v_gb_probs)
print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(model_names[2], v_gb_acc_score, v_gb_auc_score))

# Predict test data
t_gb_pred = gb.predict(X_test_o)
t_gb_probs = gb.predict_proba(X_test_o)[:,1]
t_gb_acc_score = accuracy_score(Y_test.iloc[:,0], t_gb_pred)
t_gb_auc_score = roc_auc_score(Y_test.iloc[:,0], t_gb_probs)
print('Test - Model: {}, Accuracy: {}, AUC: {}'.format(model_names[2], t_gb_acc_score, t_gb_auc_score))

In [None]:
# Neural Network - no tuning

mlp = MLPClassifier(max_iter=5000) # adjust max_iter to 5000 for convergence

# Fit once on the training split. No random_state is set, so every fit can
# produce a different network; refitting before the test predictions (as
# before) meant validation and test scores came from different models.
mlp.fit(X_train_o, Y_train.iloc[:,0])

# Predict validation data
v_mlp_pred = mlp.predict(X_valid_o)
v_mlp_probs = mlp.predict_proba(X_valid_o)[:,1]  # column 1 = second entry of mlp.classes_
v_mlp_acc_score = accuracy_score(Y_valid.iloc[:,0], v_mlp_pred)
v_mlp_auc_score = roc_auc_score(Y_valid.iloc[:,0], v_mlp_probs)
print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(model_names[3], v_mlp_acc_score, v_mlp_auc_score))

# Predict test data
t_mlp_pred = mlp.predict(X_test_o)
t_mlp_probs = mlp.predict_proba(X_test_o)[:,1]
t_mlp_acc_score = accuracy_score(Y_test.iloc[:,0], t_mlp_pred)
t_mlp_auc_score = roc_auc_score(Y_test.iloc[:,0], t_mlp_probs)
print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(model_names[3], t_mlp_acc_score, t_mlp_auc_score))

In [None]:
# Accuracy and AUC score comparison before unimportant feature remove
#
# Each row pairs a report template with the accuracy and AUC it is filled with.
base_model_rows = [
    ('Logistic Regression with no tuning (Validation) - Accuracy: {:2.5f}, AUC: {:2.5f}', v_lr_acc_score, v_lr_auc_score),
    ('Gaussian Naive Bayes with no tuning (Validation) - Accuracy: {:2.5f}, AUC: {:2.5f}', v_nb_acc_score, v_nb_auc_score),
    ('Logistic Regression with no tuning (Test) - Accuracy: {:2.5f}, AUC: {:2.5f}', t_lr_acc_score, t_lr_auc_score),
    ('Gaussian Naive Bayes with no tuning (Test) - Accuracy: {:2.5f}, AUC: {:2.5f}', t_nb_acc_score, t_nb_auc_score),
]
advanced_model_rows = [
    ('Gradient Boosting with no tuning (Validation) - Accuracy: {:2.5f}, AUC: {:2.5f}', v_gb_acc_score, v_gb_auc_score),
    ('Neural Network with no tuning (Validation) - Accuracy: {:2.5f}, AUC: {:2.5f}', v_mlp_acc_score, v_mlp_auc_score),
    ('Gradient Boosting with no tuning (Test) - Accuracy: {:2.5f}, AUC: {:2.5f}', t_gb_acc_score, t_gb_auc_score),
    ('Neural Network with no tuning (Test) - Accuracy: {:2.5f}, AUC: {:2.5f}', t_mlp_acc_score, t_mlp_auc_score),
]

print('Accuracy and AUC score comparison before unimportant feature remove\n')
print('Base models:\n')
for template, accuracy, auc in base_model_rows:
    print(template.format(accuracy, auc))

print('\n---------------------\n')

print('Advanced models:\n')
for template, accuracy, auc in advanced_model_rows:
    print(template.format(accuracy, auc))


---

### Looking into the parameters

In [None]:
# Display names of the four models; the sweep cells below index into this list.
names = [
    'LR',
    'GaussianNB',
    'Gradient Boosting',
    'Neural Network',
]

In [None]:
# Logistic Regression: sweep the inverse regularization strength C (L1 penalty)
c_val = [0.01, 0.1, 1, 10, 100]
lr_val_accu = []
lr_test_accu = []
lr_val_auc = []
lr_test_auc = []

for c_value in c_val:
    lr = LogisticRegression(solver='liblinear', max_iter=1000, C=c_value, penalty='l1')
    print('C value: {}'.format(c_value))

    # Fit once per C value; validation and test are scored with the same
    # fitted model (the earlier refit before the test predictions only
    # repeated the training).
    lr.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_lr_prediction = lr.predict(X_valid_o)
    val_lr_probs = lr.predict_proba(X_valid_o)[:,1]
    val_lr_acc_score = accuracy_score(Y_valid.iloc[:,0], val_lr_prediction)
    val_lr_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_lr_probs)
    lr_val_accu.append(val_lr_acc_score)
    lr_val_auc.append(val_lr_auc_score)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[0], val_lr_acc_score, val_lr_auc_score))

    # Predict test data
    test_lr_prediction = lr.predict(X_test_o)
    test_lr_probs = lr.predict_proba(X_test_o)[:,1]
    test_lr_acc_score = accuracy_score(Y_test.iloc[:,0], test_lr_prediction)
    test_lr_auc_score = roc_auc_score(Y_test.iloc[:,0], test_lr_probs)
    lr_test_accu.append(test_lr_acc_score)
    lr_test_auc.append(test_lr_auc_score)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[0], test_lr_acc_score, test_lr_auc_score))
    print('------------------')

In [None]:
# Logistic Regression: c value accuracy and AUC plot

fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
lr_panels = [
    (ax_accuracy, 'Logistic Regression accuracy', 'Accuracy', lr_val_accu, lr_test_accu),
    (ax_auc, 'Logistic Regression AUC score', 'AUC score', lr_val_auc, lr_test_auc),
]
for panel, title, y_label, validation_scores, test_scores in lr_panels:
    panel.set_title(title)
    panel.set_xlabel('C value')
    panel.set_ylabel(y_label)
    panel.plot(c_val, validation_scores, 'b-', label='Validation')
    panel.plot(c_val, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

In [None]:
# Gaussian Naive Bayes: sweep var_smoothing
var_smooth = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
nb_val_accu = []
nb_test_accu = []
nb_val_auc = []
nb_test_auc = []

for smoothing in var_smooth:
    nb = GaussianNB(var_smoothing=smoothing)
    print('Step value: {}'.format(smoothing))

    # Fit once per setting; validation and test are scored with the same
    # fitted model (the earlier refit before the test predictions was redundant).
    nb.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_nb_prediction = nb.predict(X_valid_o)
    val_nb_probs = nb.predict_proba(X_valid_o)[:,1]
    val_nb_acc_score = accuracy_score(Y_valid.iloc[:,0], val_nb_prediction)
    val_nb_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_nb_probs)
    nb_val_accu.append(val_nb_acc_score)
    nb_val_auc.append(val_nb_auc_score)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[1], val_nb_acc_score, val_nb_auc_score))

    # Predict test data
    test_nb_prediction = nb.predict(X_test_o)
    test_nb_probs = nb.predict_proba(X_test_o)[:,1]
    test_nb_acc_score = accuracy_score(Y_test.iloc[:,0], test_nb_prediction)
    test_nb_auc_score = roc_auc_score(Y_test.iloc[:,0], test_nb_probs)
    nb_test_accu.append(test_nb_acc_score)
    nb_test_auc.append(test_nb_auc_score)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[1], test_nb_acc_score, test_nb_auc_score))
    print('------------------')

In [None]:
# Gaussian Naive Bayes: var_smoothing accuracy and AUC plot
#
# The x-axis must be the swept var_smoothing values (`var_smooth`). The
# previous version plotted against `c_val`, the logistic-regression C grid,
# so the axis values did not match the 'Var_smoothing value' label.
fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
nb_panels = [
    (ax_accuracy, 'Gaussian Naive Bayes accuracy', 'Accuracy', nb_val_accu, nb_test_accu),
    (ax_auc, 'Gaussian Naive Bayes AUC', 'AUC score', nb_val_auc, nb_test_auc),
]
for panel, title, y_label, validation_scores, test_scores in nb_panels:
    panel.set_title(title)
    panel.set_xlabel('Var_smoothing value')
    panel.set_ylabel(y_label)
    panel.plot(var_smooth, validation_scores, 'b-', label='Validation')
    panel.plot(var_smooth, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

In [None]:
# Gradient Boosting - n_estimators
stages = [50, 100, 200, 400, 800, 1600]
gb_val_accu = []
gb_test_accu = []
gb_val_auc = []
gb_test_auc = []

for n_stages in stages:
    gb = GradientBoostingClassifier(n_estimators=n_stages)
    print('N_estimators: {}'.format(n_stages))

    # Fit once per setting so validation and test describe the same model.
    # The earlier refit before the test predictions doubled the training
    # time of the most expensive sweep in the notebook.
    gb.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_gb_prediction = gb.predict(X_valid_o)
    val_gb_probs = gb.predict_proba(X_valid_o)[:,1]
    val_gb_acc_score = accuracy_score(Y_valid.iloc[:,0], val_gb_prediction)
    val_gb_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_gb_probs)
    gb_val_accu.append(val_gb_acc_score)
    gb_val_auc.append(val_gb_auc_score)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[2], val_gb_acc_score, val_gb_auc_score))

    # Predict test data
    test_gb_prediction = gb.predict(X_test_o)
    test_gb_probs = gb.predict_proba(X_test_o)[:,1]
    test_gb_acc_score = accuracy_score(Y_test.iloc[:,0], test_gb_prediction)
    test_gb_auc_score = roc_auc_score(Y_test.iloc[:,0], test_gb_probs)
    gb_test_accu.append(test_gb_acc_score)
    gb_test_auc.append(test_gb_auc_score)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[2], test_gb_acc_score, test_gb_auc_score))
    print('------------------')

In [None]:
# Gradient Boosting: n_estimator accuracy and AUC plot
fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
gb_panels = [
    (ax_accuracy, 'Gradient Boosting accuracy', 'Accuracy', gb_val_accu, gb_test_accu),
    (ax_auc, 'Gradient Boosting AUC', 'AUC score', gb_val_auc, gb_test_auc),
]
for panel, title, y_label, validation_scores, test_scores in gb_panels:
    panel.set_title(title)
    panel.set_xlabel('N_estimators')
    panel.set_ylabel(y_label)
    panel.plot(stages, validation_scores, 'b-', label='Validation')
    panel.plot(stages, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

In [None]:
# Gradient Boosting - learning rate
learn = [0.1, 0.2, 0.3, 0.4, 0.5]
gb_val_accu4 = []
gb_test_accu4 = []
gb_val_auc4 = []
gb_test_auc4 = []

for rate in learn:
    gb4 = GradientBoostingClassifier(learning_rate=rate)
    print('Learning rate: {}'.format(rate))

    # Fit once per setting so validation and test describe the same model
    # (the earlier refit before the test predictions was redundant).
    gb4.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_gb_prediction4 = gb4.predict(X_valid_o)
    val_gb_probs4 = gb4.predict_proba(X_valid_o)[:,1]
    val_gb_acc_score4 = accuracy_score(Y_valid.iloc[:,0], val_gb_prediction4)
    val_gb_auc_score4 = roc_auc_score(Y_valid.iloc[:,0], val_gb_probs4)
    gb_val_accu4.append(val_gb_acc_score4)
    gb_val_auc4.append(val_gb_auc_score4)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[2], val_gb_acc_score4, val_gb_auc_score4))

    # Predict test data
    test_gb_prediction4 = gb4.predict(X_test_o)
    test_gb_probs4 = gb4.predict_proba(X_test_o)[:,1]
    test_gb_acc_score4 = accuracy_score(Y_test.iloc[:,0], test_gb_prediction4)
    test_gb_auc_score4 = roc_auc_score(Y_test.iloc[:,0], test_gb_probs4)
    gb_test_accu4.append(test_gb_acc_score4)
    gb_test_auc4.append(test_gb_auc_score4)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[2], test_gb_acc_score4, test_gb_auc_score4))
    print('------------------')

In [None]:
# Gradient Boosting: learning rate accuracy and AUC plot
fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
gb_rate_panels = [
    (ax_accuracy, 'Gradient Boosting accuracy', 'Accuracy', gb_val_accu4, gb_test_accu4),
    (ax_auc, 'Gradient Boosting AUC', 'AUC score', gb_val_auc4, gb_test_auc4),
]
for panel, title, y_label, validation_scores, test_scores in gb_rate_panels:
    panel.set_title(title)
    panel.set_xlabel('Learning rate')
    panel.set_ylabel(y_label)
    panel.plot(learn, validation_scores, 'b-', label='Validation')
    panel.plot(learn, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

In [None]:
# Display names of the four models; the neural-network cells below use names[3].
names = [
    'LR',
    'GaussianNB',
    'Gradient Boosting',
    'Neural Network',
]

In [None]:
# Neural Network - tolerance

tol = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
mlp_val_accu3 = []
mlp_test_accu3 = []
mlp_val_auc3 = []
mlp_test_auc3 = []

for tolerance in tol:
    mlp3 = MLPClassifier(tol=tolerance, max_iter=5000) # max_iter=5000 for convergence purpose
    print('Tolerance: {}'.format(tolerance))

    # Fit once per setting. No random_state is set, so each fit can yield a
    # different network; refitting before the test predictions (as before)
    # made the validation and test scores describe two different models.
    mlp3.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_mlp_prediction3 = mlp3.predict(X_valid_o)
    val_mlp_probs3 = mlp3.predict_proba(X_valid_o)[:,1]
    val_mlp_acc_score3 = accuracy_score(Y_valid.iloc[:,0], val_mlp_prediction3)
    val_mlp_auc_score3 = roc_auc_score(Y_valid.iloc[:,0], val_mlp_probs3)
    mlp_val_accu3.append(val_mlp_acc_score3)
    mlp_val_auc3.append(val_mlp_auc_score3)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[3], val_mlp_acc_score3, val_mlp_auc_score3))

    # Predict test data
    test_mlp_prediction3 = mlp3.predict(X_test_o)
    test_mlp_probs3 = mlp3.predict_proba(X_test_o)[:,1]
    test_mlp_acc_score3 = accuracy_score(Y_test.iloc[:,0], test_mlp_prediction3)
    test_mlp_auc_score3 = roc_auc_score(Y_test.iloc[:,0], test_mlp_probs3)
    mlp_test_accu3.append(test_mlp_acc_score3)
    mlp_test_auc3.append(test_mlp_auc_score3)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[3], test_mlp_acc_score3, test_mlp_auc_score3))

In [None]:
# Neural Network: tolerance accuracy and AUC plot
fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
mlp_tol_panels = [
    (ax_accuracy, 'MLPClassifer accuracy', 'Accuracy', mlp_val_accu3, mlp_test_accu3),
    (ax_auc, 'MLPClassifer AUC', 'AUC score', mlp_val_auc3, mlp_test_auc3),
]
for panel, title, y_label, validation_scores, test_scores in mlp_tol_panels:
    panel.set_title(title)
    panel.set_xlabel('Tolerance')
    panel.set_ylabel(y_label)
    panel.plot(tol, validation_scores, 'b-', label='Validation')
    panel.plot(tol, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

In [None]:
# Neural Network - max_iter value

max_iter = [5000, 6000, 7000, 8000, 10000]
mlp_val_accu4 = []
mlp_test_accu4 = []
mlp_val_auc4 = []
mlp_test_auc4 = []

for iteration_cap in max_iter:
    mlp4 = MLPClassifier(max_iter=iteration_cap)
    print('max_iter: {}'.format(iteration_cap))

    # Fit once per setting. No random_state is set, so each fit can yield a
    # different network; refitting before the test predictions (as before)
    # made the validation and test scores describe two different models.
    mlp4.fit(X_train_o, Y_train.iloc[:,0])

    # Predict validation data
    val_mlp_prediction4 = mlp4.predict(X_valid_o)
    val_mlp_probs4 = mlp4.predict_proba(X_valid_o)[:,1]
    val_mlp_acc_score4 = accuracy_score(Y_valid.iloc[:,0], val_mlp_prediction4)
    val_mlp_auc_score4 = roc_auc_score(Y_valid.iloc[:,0], val_mlp_probs4)
    mlp_val_accu4.append(val_mlp_acc_score4)
    mlp_val_auc4.append(val_mlp_auc_score4)
    print('Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[3], val_mlp_acc_score4, val_mlp_auc_score4))

    # Predict test data
    test_mlp_prediction4 = mlp4.predict(X_test_o)
    test_mlp_probs4 = mlp4.predict_proba(X_test_o)[:,1]
    test_mlp_acc_score4 = accuracy_score(Y_test.iloc[:,0], test_mlp_prediction4)
    test_mlp_auc_score4 = roc_auc_score(Y_test.iloc[:,0], test_mlp_probs4)
    mlp_test_accu4.append(test_mlp_acc_score4)
    mlp_test_auc4.append(test_mlp_auc_score4)
    print('Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[3], test_mlp_acc_score4, test_mlp_auc_score4))

In [None]:
# Neural Network: max_iter accuracy and AUC plot
fig, (ax_accuracy, ax_auc) = plt.subplots(1, 2, figsize=(16, 5))

# One entry per panel: (axes, title, y label, validation scores, test scores)
mlp_iter_panels = [
    (ax_accuracy, 'MLPClassifer accuracy', 'Accuracy', mlp_val_accu4, mlp_test_accu4),
    (ax_auc, 'MLPClassifer AUC', 'AUC score', mlp_val_auc4, mlp_test_auc4),
]
for panel, title, y_label, validation_scores, test_scores in mlp_iter_panels:
    panel.set_title(title)
    panel.set_xlabel('Max iterations')
    panel.set_ylabel(y_label)
    panel.plot(max_iter, validation_scores, 'b-', label='Validation')
    panel.plot(max_iter, test_scores, 'y-', label='Test')
    panel.legend()
plt.show()

---

### Model tuning

Tune different parameters for each model with GridSearchCV and see which parameter values give the best accuracy and AUC score.

In [None]:
# Display names of the four models; the grid-search report cells index into this list.
names = [
    'LR',
    'GaussianNB',
    'Gradient Boosting',
    'Neural Network',
]

In [None]:
# Logistic Regression parameter tuning
lr = LogisticRegression(solver='liblinear', max_iter=1000, penalty='l1')

lr_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
}

scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

# Grid Search for Logistic Regression: 5-fold CV on the training split.
# refit='Accuracy' selects the accuracy metric for the final refit, and that
# refitted estimator is what predict / predict_proba use below.
gd_lr = GridSearchCV(estimator=lr, param_grid=lr_grid, scoring=scoring, cv=5, n_jobs=-1, refit='Accuracy')
lr_res = gd_lr.fit(X_train_o, Y_train.iloc[:,0])

# Validation metrics
val_lr_prediction = gd_lr.predict(X_valid_o)
val_lr_probs = gd_lr.predict_proba(X_valid_o)[:,1]
val_lr_acc_score = accuracy_score(Y_valid.iloc[:,0], val_lr_prediction)
val_lr_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_lr_probs)

# Test metrics: scored on the held-out test split. The previous version
# reused X_valid_o / Y_valid here, so the reported "test" numbers were a
# copy of the validation numbers.
test_lr_prediction = gd_lr.predict(X_test_o)
test_lr_probs = gd_lr.predict_proba(X_test_o)[:,1]
test_lr_acc_score = accuracy_score(Y_test.iloc[:,0], test_lr_prediction)
test_lr_auc_score = roc_auc_score(Y_test.iloc[:,0], test_lr_probs)

In [None]:
# Print Logistic Regression results: validation line, test line, best parameters
lr_report_lines = [
    'Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[0], val_lr_acc_score, val_lr_auc_score),
    'Test - Model: {}, Accuracy: {}, AUC: {}'.format(names[0], test_lr_acc_score, test_lr_auc_score),
    "Best parameter(s) for %s: %s" % (names[0], lr_res.best_params_),
]
for report_line in lr_report_lines:
    print(report_line)

In [None]:
# Gaussian Naive Bayes parameter tuning
nb = GaussianNB()

nb_grid = {
    'var_smoothing': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
}

scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

# Grid Search for Naive Bayes: 5-fold CV on the training split.
# refit='Accuracy' selects the accuracy metric for the final refit, and that
# refitted estimator is what predict / predict_proba use below.
gd_nb = GridSearchCV(estimator=nb, param_grid=nb_grid, scoring=scoring, cv=5, n_jobs=-1, refit='Accuracy')
nb_res = gd_nb.fit(X_train_o, Y_train.iloc[:,0])

# Validation metrics
val_nb_prediction = gd_nb.predict(X_valid_o)
val_nb_probs = gd_nb.predict_proba(X_valid_o)[:,1]
val_nb_acc_score = accuracy_score(Y_valid.iloc[:,0], val_nb_prediction)
val_nb_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_nb_probs)

# Test metrics: scored on the held-out test split. The previous version
# reused X_valid_o / Y_valid here, so the reported "test" numbers were a
# copy of the validation numbers.
test_nb_prediction = gd_nb.predict(X_test_o)
test_nb_probs = gd_nb.predict_proba(X_test_o)[:,1]
test_nb_acc_score = accuracy_score(Y_test.iloc[:,0], test_nb_prediction)
test_nb_auc_score = roc_auc_score(Y_test.iloc[:,0], test_nb_probs)

In [None]:
# Print Gaussian Naive Bayes results: validation line, test line, best parameters
nb_report_lines = [
    'Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[1], val_nb_acc_score, val_nb_auc_score),
    'Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[1], test_nb_acc_score, test_nb_auc_score),
    "Best parameter(s) for %s: %s" % (names[1], nb_res.best_params_),
]
for report_line in nb_report_lines:
    print(report_line)

In [None]:
# Gradient Boosting parameter tuning
gb = GradientBoostingClassifier()

gb_grid = {
    'n_estimators': [50, 100, 200, 400, 800, 1600],
    'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5]
}

scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

# Grid Search for Gradient Boosting: 5-fold CV over every combination in the
# grid. refit='Accuracy' selects the accuracy metric for the final refit, and
# that refitted estimator is what predict / predict_proba use below.
gd_gb = GridSearchCV(estimator=gb, param_grid=gb_grid, scoring=scoring, cv=5, n_jobs=-1, refit='Accuracy')
gb_res = gd_gb.fit(X_train_o, Y_train.iloc[:,0])

# Validation metrics
val_gb_prediction = gd_gb.predict(X_valid_o)
val_gb_probs = gd_gb.predict_proba(X_valid_o)[:,1]
val_gb_acc_score = accuracy_score(Y_valid.iloc[:,0], val_gb_prediction)
val_gb_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_gb_probs)

# Test metrics: scored on the held-out test split. The previous version
# reused X_valid_o / Y_valid here, so the reported "test" numbers were a
# copy of the validation numbers.
test_gb_prediction = gd_gb.predict(X_test_o)
test_gb_probs = gd_gb.predict_proba(X_test_o)[:,1]
test_gb_acc_score = accuracy_score(Y_test.iloc[:,0], test_gb_prediction)
test_gb_auc_score = roc_auc_score(Y_test.iloc[:,0], test_gb_probs)

In [None]:
# Print Gradient Boosting results: validation line, test line, best parameters
gb_report_lines = [
    'Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[2], val_gb_acc_score, val_gb_auc_score),
    'Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[2], test_gb_acc_score, test_gb_auc_score),
    "Best parameter(s) for %s: %s" % (names[2], gb_res.best_params_),
]
for report_line in gb_report_lines:
    print(report_line)

In [None]:
# Neural Network parameter tuning
mlp = MLPClassifier()

mlp_grid = {
    'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'max_iter': [5000, 6000, 7000, 8000, 10000]
}

scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}

# Grid Search for Neural Network: 5-fold CV over every combination in the
# grid. refit='Accuracy' selects the accuracy metric for the final refit, and
# that refitted estimator is what predict / predict_proba use below.
gd_mlp = GridSearchCV(estimator=mlp, param_grid=mlp_grid, scoring=scoring, cv=5, n_jobs=-1, refit='Accuracy')
mlp_res = gd_mlp.fit(X_train_o, Y_train.iloc[:,0])

# Validation metrics
val_mlp_prediction = gd_mlp.predict(X_valid_o)
val_mlp_probs = gd_mlp.predict_proba(X_valid_o)[:,1]
val_mlp_acc_score = accuracy_score(Y_valid.iloc[:,0], val_mlp_prediction)
val_mlp_auc_score = roc_auc_score(Y_valid.iloc[:,0], val_mlp_probs)

# Test metrics: scored on the held-out test split. The previous version
# reused X_valid_o / Y_valid here, so the reported "test" numbers were a
# copy of the validation numbers.
test_mlp_prediction = gd_mlp.predict(X_test_o)
test_mlp_probs = gd_mlp.predict_proba(X_test_o)[:,1]
test_mlp_acc_score = accuracy_score(Y_test.iloc[:,0], test_mlp_prediction)
test_mlp_auc_score = roc_auc_score(Y_test.iloc[:,0], test_mlp_probs)

In [None]:
# Print Neural Network results: validation line, test line, best parameters
mlp_report_lines = [
    'Validation - Model: {}, Accuracy: {}, AUC: {}'.format(names[3], val_mlp_acc_score, val_mlp_auc_score),
    'Test - Model: {}, Accuracy: {}, AUC: {}\n'.format(names[3], test_mlp_acc_score, test_mlp_auc_score),
    "Best parameter(s) for %s: %s" % (names[3], mlp_res.best_params_),
]
for report_line in mlp_report_lines:
    print(report_line)