# Build a model 
## create training, validation and test samples.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the features from the pre-processed data file.
# A forward slash resolves on Windows and POSIX alike; the previous
# 'data\d...' literal depended on the invalid "\d" escape sequence.
df = pd.read_csv('data/df_data_features.csv')

The first thing I like to do is to shuffle the samples in case there was some order (e.g. all positive samples on top). Here n is the number of data points. random_state is a seed for the random number generator. This allows you to obtain reproducible results when sampling from the DataFrame.

In [None]:
# Shuffle every row and rebuild a clean 0..n-1 index.
df = df.sample(n=len(df), random_state=42).reset_index(drop=True)

# Hold out 30% of the rows, then split the hold-out in half:
# one half becomes the test set, the remainder the validation set.
df_holdout = df.sample(frac=0.30, random_state=42)
df_train = df.drop(df_holdout.index)
df_test = df_holdout.sample(frac=0.5, random_state=42)
df_valid = df_holdout.drop(df_test.index)

# Report each split as a fraction of the full data set.
n_all = len(df)
print('test data size: %.3f'%(len(df_test)/n_all))
print('validation data size: %.3f'%(len(df_valid)/n_all))
print('training data size: %.3f'%(len(df_train)/n_all))

In [None]:
# Sanity check: the three splits together must account for every row.
n_total = len(df)
n_split = len(df_test) + len(df_valid) + len(df_train)
print('sample count (n = %d)'%n_total)
assert n_total == n_split,'Not all samples used.'

In [None]:
def calc_prevalence(y_actual):
    """Return the sum of ``y_actual`` divided by its length.

    For 0/1 labels this is the fraction of positive samples.
    """
    n_positive = sum(y_actual)
    n_samples = len(y_actual)
    return n_positive / n_samples

In [None]:
# Positive-label rate of each split.
test_prevalence = calc_prevalence(df_test.OUTPUT_LABEL.values)
valid_prevalence = calc_prevalence(df_valid.OUTPUT_LABEL.values)
train_prevalence = calc_prevalence(df_train.OUTPUT_LABEL.values)

print('Test prevalence(n = %d):%.3f'%(len(df_test), test_prevalence))
print('Valid prevalence(n = %d):%.3f'%(len(df_valid), valid_prevalence))
print('Train all prevalence(n = %d):%.3f'%(len(df_train), train_prevalence))

Is the data ready to be dropped into a predictive model?

The dataset is imbalanced, with more negatives than positives, so a model might simply assign every sample to the negative class.

Let me create a balanced training data set by sub-sampling. There may be other approaches to create a balanced training data set.

In [None]:

# Partition the training rows by label.
rows_pos = df_train.OUTPUT_LABEL == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

# Sub-sample as many negatives as there are positives, then stack both classes.
n_pos = len(df_train_pos)
df_train_neg_sub = df_train_neg.sample(n=n_pos, random_state=42)
df_train_balanced = pd.concat([df_train_pos, df_train_neg_sub], axis=0)

# Shuffle so the two classes are interleaved, then reset the index.
df_train_balanced = df_train_balanced.sample(n=len(df_train_balanced), random_state=42)
df_train_balanced = df_train_balanced.reset_index(drop=True)

balanced_prevalence = calc_prevalence(df_train_balanced.OUTPUT_LABEL.values)
print('Balanced training data prevalence(n = %d):%.3f'%(len(df_train_balanced), balanced_prevalence))


In [None]:
# Persist every split without the index column.
# Forward slashes work on Windows and POSIX; the previous 'data\d...'
# literals depended on the invalid "\d" escape sequence.
df_train_balanced.to_csv('data/df_train_balanced.csv', index=False)
df_train.to_csv('data/df_train.csv', index=False)
df_valid.to_csv('data/df_valid.csv', index=False)
df_test.to_csv('data/df_test.csv', index=False)

In [None]:
# Feature columns = every column except the label and the 'Unnamed: 0' column.
col2use = df_train.columns.tolist()
for non_feature_col in ('OUTPUT_LABEL', 'Unnamed: 0'):
    # list.remove raises ValueError if the column is missing
    col2use.remove(non_feature_col)

In [None]:
# Feature matrices and label vectors for model training.
# The full training split is kept for fitting the scaler; the balanced
# subset is what the models are trained on.
X_train_all = df_train[col2use].values

X_train = df_train_balanced[col2use].values
y_train = df_train_balanced['OUTPUT_LABEL'].values

X_valid = df_valid[col2use].values
y_valid = df_valid['OUTPUT_LABEL'].values

print('Training All shapes:', X_train_all.shape)
print('Training shapes:', X_train.shape, y_train.shape)
print('Validation shapes:', X_valid.shape, y_valid.shape)

The features in this dataset are of different scales. Before feeding the data to a machine learning model, this data should be normalized.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full training split only, then apply the same
# transformation to the balanced training set and the validation set.
scaler = StandardScaler().fit(X_train_all)

X_train_tf, X_valid_tf = scaler.transform(X_train), scaler.transform(X_valid)

In [None]:
# Save the fitted scaler so the test data can be transformed identically later.
import pickle

# The context manager closes the file even if pickling fails (the bare
# open(...) call never closed its handle). Forward slash keeps the path
# portable and avoids the invalid "\s" escape of the previous literal.
with open('data/scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### utility functions

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh=0.5):
    """Return the fraction of actual negatives scored below ``thresh``.

    ``y_actual`` and ``y_pred`` must support element-wise comparison
    (e.g. NumPy arrays); negatives are the entries where ``y_actual == 0``.
    """
    is_negative = (y_actual == 0)
    true_negative = (y_pred < thresh) & is_negative
    return sum(true_negative) / sum(is_negative)

def print_report(y_actual, y_pred, thresh=0.5):
    """Print classification metrics and return them as a tuple.

    AUC is computed from the raw scores in ``y_pred``; accuracy, recall and
    precision use the predictions binarized with ``y_pred > thresh``.

    Returns:
        (auc, accuracy, recall, precision, specificity)
    """
    y_hat = (y_pred > thresh)

    auc = roc_auc_score(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, y_hat)
    recall = recall_score(y_actual, y_hat)
    precision = precision_score(y_actual, y_hat)
    specificity = calc_specificity(y_actual, y_pred, thresh)

    print('AUC:%.3f'%auc)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('specificity:%.3f'%specificity)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')

    return auc, accuracy, recall, precision, specificity

# Model selection

## KNN

In [None]:
# k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors = 100: each prediction looks at the 100 nearest training samples.
knn=KNeighborsClassifier(n_neighbors = 100)
# Train on the scaled, balanced training set.
knn.fit(X_train_tf, y_train)

In [None]:
# Hard class predictions, used for the confusion matrix and classification report.
Y_knn = knn.predict(X_valid_tf)

# Probability of the second class (column 1) for threshold-based metrics.
y_train_preds = knn.predict_proba(X_train_tf)[:, 1]
y_valid_preds_knn = knn.predict_proba(X_valid_tf)[:, 1]

# Decision threshold shared by the model-comparison cells below.
thresh = 0.5

print('KNN')
print('Training:')
(knn_train_auc, knn_train_accuracy, knn_train_recall,
 knn_train_precision, knn_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(knn_valid_auc, knn_valid_accuracy, knn_valid_recall,
 knn_valid_precision, knn_valid_specificity) = print_report(y_valid, y_valid_preds_knn, thresh)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,roc_curve, auc,roc_auc_score

# ROC curve of the KNN model on the validation set.
roc_auc = roc_auc_score(y_valid, y_valid_preds_knn)
fp_rate, tp_rate, thresholds = roc_curve(y_valid, y_valid_preds_knn)

fig, ax = plt.subplots()
ax.plot(fp_rate, tp_rate, label='KNN (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC - KNN')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()


In [None]:
import seaborn as sns

# Plot the confusion matrix of the KNN hard predictions on the validation set.
# fmt="d" prints the integer cell counts as integers (".3f" rendered them
# with three meaningless decimals).
plt.figure(figsize=(9,9))
sns.heatmap(confusion_matrix(y_valid, Y_knn), annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Greens_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
title = 'Accuracy Score: {0}'.format(knn.score(X_valid_tf , y_valid))
plt.title(title, size = 12);

In [None]:
# Summary of the KNN hard predictions on the validation set.
# target_names labels the classes 'NO' and 'YES' (assumes the labels are 0 and 1).
from sklearn.metrics import classification_report
print(classification_report(y_valid, Y_knn, target_names = ['NO', 'YES']))

## Stochastic Gradient Descent

In [None]:
# stochastic gradient descent
from sklearn.linear_model import SGDClassifier
# loss='log_loss' fits a logistic-regression model, so predict_proba is
# available for the AUC calculations below; random_state fixes the shuffling.
sgdc = SGDClassifier(loss='log_loss', random_state=42)
sgdc.fit(X_train_tf, y_train)

In [None]:
# Hard class predictions on the validation set (used for the confusion matrix).
Y_sgdc=sgdc.predict(X_valid_tf)

In [None]:
# Probability of the second class (column 1) on both splits.
y_train_preds = sgdc.predict_proba(X_train_tf)[:, 1]
y_valid_preds_sgdc = sgdc.predict_proba(X_valid_tf)[:, 1]

print('Stochastic Gradient Descend')
print('Training:')
(sgdc_train_auc, sgdc_train_accuracy, sgdc_train_recall,
 sgdc_train_precision, sgdc_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall,
 sgdc_valid_precision, sgdc_valid_specificity) = print_report(y_valid, y_valid_preds_sgdc, thresh)

In [None]:
# Confusion matrix of the SGD classifier on the validation set.
# fmt="d" prints the integer cell counts as integers instead of "N.000".
plt.figure(figsize=(9,9))
sns.heatmap(confusion_matrix(y_valid, Y_sgdc), annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Greens_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(sgdc.score(X_valid_tf , y_valid))
plt.title(all_sample_title, size = 15);

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree (max_depth = 10); random_state fixed for reproducibility.
tree = DecisionTreeClassifier(max_depth = 10, random_state = 42)
tree.fit(X_train_tf, y_train)


In [None]:
# Hard class predictions on the validation set (used for the confusion matrix).
Y_tree=tree.predict(X_valid_tf)

In [None]:
# Probability of the second class (column 1) on both splits.
y_train_preds = tree.predict_proba(X_train_tf)[:, 1]
y_valid_preds_tree = tree.predict_proba(X_valid_tf)[:, 1]

print('Decision Tree')
print('Training:')
(tree_train_auc, tree_train_accuracy, tree_train_recall,
 tree_train_precision, tree_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(tree_valid_auc, tree_valid_accuracy, tree_valid_recall,
 tree_valid_precision, tree_valid_specificity) = print_report(y_valid, y_valid_preds_tree, thresh)

In [None]:
from sklearn.metrics import confusion_matrix,roc_curve, auc,roc_auc_score

# ROC curve of the decision tree on the validation set.
roc_auc = roc_auc_score(y_valid, y_valid_preds_tree)
fp_rate, tp_rate, thresholds = roc_curve(y_valid, y_valid_preds_tree)

fig, ax = plt.subplots()
ax.plot(fp_rate, tp_rate, label='Decision Tree (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC -  Decision Trees')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix of the decision tree on the validation set.
# fmt="d" prints the integer cell counts as integers instead of "N.000".
plt.figure(figsize=(9,9))
sns.heatmap(confusion_matrix(y_valid, Y_tree), annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Greens_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(tree.score(X_valid_tf , y_valid))
plt.title(all_sample_title, size = 15);

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with trees limited to depth 6; this is the model the
# hyperparameter search later starts from.
rf=RandomForestClassifier(max_depth = 6, random_state = 42)
rf.fit(X_train_tf, y_train)


In [None]:
# Hard class predictions on the validation set (used for the confusion matrix).
Y_rf=rf.predict(X_valid_tf)

In [None]:
# Probability of the second class (column 1) on both splits.
y_train_preds = rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds_rf = rf.predict_proba(X_valid_tf)[:, 1]

print('Random Forest')
print('Training:')
(rf_train_auc, rf_train_accuracy, rf_train_recall,
 rf_train_precision, rf_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(rf_valid_auc, rf_valid_accuracy, rf_valid_recall,
 rf_valid_precision, rf_valid_specificity) = print_report(y_valid, y_valid_preds_rf, thresh)

In [None]:
from sklearn.metrics import confusion_matrix,roc_curve, auc,roc_auc_score

# ROC curve of the random forest on the validation set.
roc_auc = roc_auc_score(y_valid, y_valid_preds_rf)
fp_rate, tp_rate, thresholds = roc_curve(y_valid, y_valid_preds_rf)
plt.figure()
# Legend label now has its opening parenthesis, matching the KNN and
# decision-tree ROC plots.
plt.plot(fp_rate, tp_rate, label='Random Forest (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC -  Random Forest')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix of the random forest on the validation set.
# fmt="d" prints the integer cell counts as integers instead of "N.000".
plt.figure(figsize=(9,9))
sns.heatmap(confusion_matrix(y_valid, Y_rf), annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(rf.score(X_valid_tf , y_valid))
plt.title(all_sample_title, size = 15);

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Baseline gradient boosting model: 100 depth-3 trees with learning_rate=1.0.
# The hyperparameter search later explores smaller learning rates.
gbc =GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     max_depth=3, random_state=42)
gbc.fit(X_train_tf, y_train)


In [None]:
# Hard class predictions on the validation set (used for the confusion matrix).
Y_gbc=gbc.predict(X_valid_tf)

In [None]:
# Probability of the second class (column 1) on both splits.
y_train_preds = gbc.predict_proba(X_train_tf)[:, 1]
y_valid_preds_gbc = gbc.predict_proba(X_valid_tf)[:, 1]

print('Gradient Boosting Classifier')
print('Training:')
(gbc_train_auc, gbc_train_accuracy, gbc_train_recall,
 gbc_train_precision, gbc_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(gbc_valid_auc, gbc_valid_accuracy, gbc_valid_recall,
 gbc_valid_precision, gbc_valid_specificity) = print_report(y_valid, y_valid_preds_gbc, thresh)

In [None]:
# Confusion matrix of the gradient boosting classifier on the validation set.
# fmt="d" prints the integer cell counts as integers instead of "N.000".
plt.figure(figsize=(9,9))
sns.heatmap(confusion_matrix(y_valid, Y_gbc), annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(gbc.score(X_valid_tf , y_valid))
plt.title(all_sample_title, size = 15);

In [None]:
# Switch to seaborn's "darkgrid" style for the figures drawn after this cell.
sns.set(style="darkgrid")

In [None]:
# One row per (classifier, data set) pair, holding the metrics returned by
# the print_report calls for each model.
metric_rows = [
    ('KNN', 'train', knn_train_auc, knn_train_accuracy, knn_train_recall, knn_train_precision, knn_train_specificity),
    ('KNN', 'valid', knn_valid_auc, knn_valid_accuracy, knn_valid_recall, knn_valid_precision, knn_valid_specificity),
    ('SGDC', 'train', sgdc_train_auc, sgdc_train_accuracy, sgdc_train_recall, sgdc_train_precision, sgdc_train_specificity),
    ('SGDC', 'valid', sgdc_valid_auc, sgdc_valid_accuracy, sgdc_valid_recall, sgdc_valid_precision, sgdc_valid_specificity),
    ('DT', 'train', tree_train_auc, tree_train_accuracy, tree_train_recall, tree_train_precision, tree_train_specificity),
    ('DT', 'valid', tree_valid_auc, tree_valid_accuracy, tree_valid_recall, tree_valid_precision, tree_valid_specificity),
    ('RF', 'train', rf_train_auc, rf_train_accuracy, rf_train_recall, rf_train_precision, rf_train_specificity),
    ('RF', 'valid', rf_valid_auc, rf_valid_accuracy, rf_valid_recall, rf_valid_precision, rf_valid_specificity),
    ('GBC', 'train', gbc_train_auc, gbc_train_accuracy, gbc_train_recall, gbc_train_precision, gbc_train_specificity),
    ('GBC', 'valid', gbc_valid_auc, gbc_valid_accuracy, gbc_valid_recall, gbc_valid_precision, gbc_valid_specificity),
]
df_results = pd.DataFrame(
    metric_rows,
    columns=['classifier', 'data_set', 'auc', 'accuracy', 'recall', 'precision', 'specificity'],
)

In [None]:
# AUC per classifier, training vs. validation.
ax = sns.barplot(x="classifier", y="auc", hue="data_set", data=df_results)
# Font size reused by the metric bar charts in the following cells.
fontsize=12
ax.set_xlabel('Classifier',fontsize = fontsize)
ax.set_ylabel('AUC', fontsize = fontsize)
ax.tick_params(labelsize=fontsize)

# Put the legend out of the figure
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., fontsize = fontsize)
# Enable the grid explicitly: a bare plt.grid() toggles the current state, so
# under the "darkgrid" style it could switch the existing grid lines off.
plt.grid(True)
plt.show()

In [None]:
# Recall per classifier, training vs. validation.
ax = sns.barplot(data=df_results, x="classifier", y="recall", hue="data_set")
ax.set_xlabel('Classifier', fontsize=fontsize)
ax.set_ylabel('Recall', fontsize=fontsize)
ax.tick_params(labelsize=fontsize)

# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., fontsize=fontsize)
plt.show()

In [None]:
# Precision per classifier, training vs. validation.
ax = sns.barplot(data=df_results, x="classifier", y="precision", hue="data_set")
ax.set_xlabel('Classifier', fontsize=fontsize)
ax.set_ylabel('Precision', fontsize=fontsize)
ax.tick_params(labelsize=fontsize)

# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., fontsize=fontsize)
plt.show()

In [None]:
# Specificity per classifier, training vs. validation.
ax = sns.barplot(data=df_results, x="classifier", y="specificity", hue="data_set")
ax.set_xlabel('Classifier', fontsize=fontsize)
ax.set_ylabel('Specificity', fontsize=fontsize)
ax.tick_params(labelsize=fontsize)

# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., fontsize=fontsize)
plt.show()

In [None]:
# Grouped bar chart: five metrics per model, measured on the training data.
plt.figure(figsize=(14, 5))
ax = plt.subplot(111)

models = ['KNN', 'Stochastic Gradient Descent Classifier', 'Decision Tree',
          'Random Forest', 'Gradient Boosting Classifier']
model = np.arange(len(models))
bar_width = 0.15

# One entry per metric: (legend label, bar colour, values in `models` order).
train_metrics = [
    ('AUC', 'red',
     [knn_train_auc, sgdc_train_auc, tree_train_auc, rf_train_auc, gbc_train_auc]),
    ('accuracy', 'blue',
     [knn_train_accuracy, sgdc_train_accuracy, tree_train_accuracy, rf_train_accuracy, gbc_train_accuracy]),
    ('recall', 'green',
     [knn_train_recall, sgdc_train_recall, tree_train_recall, rf_train_recall, gbc_train_recall]),
    ('precision', 'orange',
     [knn_train_precision, sgdc_train_precision, tree_train_precision, rf_train_precision, gbc_train_precision]),
    ('specificity', 'black',
     [knn_train_specificity, sgdc_train_specificity, tree_train_specificity, rf_train_specificity, gbc_train_specificity]),
]

# Draw the metrics side by side inside each model's group.
for position, (label, color, values) in enumerate(train_metrics):
    plt.bar(model + position * bar_width, values, align='center',
            width=bar_width, alpha=0.7, color=color, label=label)

# Label every group once, centred under its bars; the long model names are
# rotated so they do not overlap.
plt.xticks(model + bar_width * (len(train_metrics) - 1) / 2, models, rotation=90)

plt.ylabel('Metrics')
plt.title('Performance of different models on training data')
# removing the axis on the top and right of the plot window
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()

plt.show()

In [None]:
# Grouped bar chart: five metrics per model, measured on the validation data.
plt.figure(figsize=(14, 5))
ax = plt.subplot(111)

models = ['KNN', 'Stochastic Gradient Descent Classifier', 'Decision Tree',
          'Random Forest', 'Gradient Boosting Classifier']
model = np.arange(len(models))
bar_width = 0.15

# One entry per metric: (legend label, bar colour, values in `models` order).
# Recall uses the *validation* values here; the earlier version plotted the
# training recall by mistake.
valid_metrics = [
    ('AUC', 'red',
     [knn_valid_auc, sgdc_valid_auc, tree_valid_auc, rf_valid_auc, gbc_valid_auc]),
    ('accuracy', 'blue',
     [knn_valid_accuracy, sgdc_valid_accuracy, tree_valid_accuracy, rf_valid_accuracy, gbc_valid_accuracy]),
    ('recall', 'green',
     [knn_valid_recall, sgdc_valid_recall, tree_valid_recall, rf_valid_recall, gbc_valid_recall]),
    ('precision', 'orange',
     [knn_valid_precision, sgdc_valid_precision, tree_valid_precision, rf_valid_precision, gbc_valid_precision]),
    ('specificity', 'black',
     [knn_valid_specificity, sgdc_valid_specificity, tree_valid_specificity, rf_valid_specificity, gbc_valid_specificity]),
]

# Draw the metrics side by side inside each model's group.
for position, (label, color, values) in enumerate(valid_metrics):
    plt.bar(model + position * bar_width, values, align='center',
            width=bar_width, alpha=0.7, color=color, label=label)

# Label every group once, centred under its bars; the long model names are
# rotated so they do not overlap.
plt.xticks(model + bar_width * (len(valid_metrics) - 1) / 2, models, rotation=90)

plt.title('Performance of different models on validation data')
plt.ylabel('Metric')

# removing the axis on the top and right of the plot window
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.legend()

plt.show()

In [None]:
# Validation-set ROC curve points and AUC for every model, grouped per model.

# k-nearest neighbors
fpr_knn, tpr_knn, thresholds = roc_curve(y_valid, y_valid_preds_knn)
roc_auc_knn = roc_auc_score(y_valid, y_valid_preds_knn)

# stochastic gradient descent classifier
fpr_sgdc, tpr_sgdc, thresholds = roc_curve(y_valid, y_valid_preds_sgdc)
roc_auc_sgdc = roc_auc_score(y_valid, y_valid_preds_sgdc)

# decision tree
fpr_dt, tpr_dt, thresholds = roc_curve(y_valid, y_valid_preds_tree)
roc_auc_tree = roc_auc_score(y_valid, y_valid_preds_tree)

# random forest
fpr_rf, tpr_rf, thresholds = roc_curve(y_valid, y_valid_preds_rf)
roc_auc_rf = roc_auc_score(y_valid, y_valid_preds_rf)

# gradient boosting classifier
fpr_gbc, tpr_gbc, thresholds = roc_curve(y_valid, y_valid_preds_gbc)
roc_auc_gbc = roc_auc_score(y_valid, y_valid_preds_gbc)

In [None]:
# Draw the validation ROC curves of the different models on one figure.
plt.figure(figsize=(8,8))

# Legend labels now have balanced parentheses around the AUC value.
plt.plot(fpr_knn, tpr_knn, label='KNN (area = %0.2f)' % roc_auc_knn)
plt.plot(fpr_sgdc, tpr_sgdc, label='SGDC (area = %0.2f)' % roc_auc_sgdc)
plt.plot(fpr_dt, tpr_dt, label='Decision Tree (area = %0.2f)' % roc_auc_tree)
plt.plot(fpr_rf, tpr_rf, label='Random Forest (area = %0.2f)' % roc_auc_rf)
plt.plot(fpr_gbc, tpr_gbc, label='GBC (area = %0.2f)' % roc_auc_gbc)

# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',label='random', alpha=.8)
plt.xticks(np.arange(0,1.1,0.1))
plt.yticks(np.arange(0,1.1,0.1))
# Enable the grid explicitly: a bare plt.grid() toggles the current state, so
# under the "darkgrid" style it could switch the existing grid lines off.
plt.grid(True)
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - all models (validation data)')
plt.show()

In [None]:
from sklearn.model_selection import ShuffleSplit
from learn_curve import plot_learning_curve

In [None]:
title = "Learning Curves (Random Forest)"
# Cross validation with 5 random shuffles to get smoother mean test and train
# score curves, each time with 15% of the data (test_size=0.15) held out as a
# validation set.
cv = ShuffleSplit(n_splits=5, test_size=0.15, random_state=42)
# Same settings as the baseline random forest fitted earlier in the notebook.
estimator = RandomForestClassifier(max_depth = 6, random_state = 42)
plot_learning_curve(estimator, title, X_train_tf, y_train, ylim=(0.2, 1.01), cv=cv, n_jobs=4)

plt.show()

In [None]:
title = "Learning Curves (Stochastic Gradient Descent)"
# Cross validation with 5 random shuffles to get smoother mean test and train
# score curves, each time with 15% of the data (test_size=0.15) held out as a
# validation set.
cv = ShuffleSplit(n_splits=5, test_size=0.15, random_state=42)
# Same settings as the baseline SGD classifier fitted earlier in the notebook.
estimator = SGDClassifier(loss='log_loss',random_state = 42)
plot_learning_curve(estimator, title, X_train_tf, y_train, ylim=(0.2, 1.01), cv=cv, n_jobs=4)

plt.show()

In [None]:
# Random forest feature importances, one row per feature, highest first.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=col2use)
    .sort_values('importance', ascending=False)
)

In [None]:
# Show the ten features with the highest random forest importance.
feature_importances.head(10)

In [None]:
# Plot at most the top 50 features. Capping at the number of available
# features keeps the bar positions and values the same length, so the plot
# also works for data sets with fewer than 50 features.
num = min(50, len(feature_importances))
ylocs = np.arange(num)
# get the feature importance for top num and sort in reverse order
# (so the most important feature ends up at the top of the horizontal chart)
values_to_plot = feature_importances.iloc[:num].values.ravel()[::-1]
feature_labels = list(feature_importances.iloc[:num].index)[::-1]

plt.figure(num=None, figsize=(8, 15), dpi=80, facecolor='w', edgecolor='k');
plt.barh(ylocs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
plt.title('Feature Importance Score - Random Forest')
plt.yticks(ylocs, feature_labels)
plt.show()

In [None]:
# Decision tree feature importances, one row per feature, highest first.
# This reuses (overwrites) the name that held the random forest importances.
feature_importances = (
    pd.DataFrame({'importance': tree.feature_importances_}, index=col2use)
    .sort_values('importance', ascending=False)
)

In [None]:
# Show the ten features with the highest decision tree importance.
feature_importances.head(10)

In [None]:
# Plot at most the top 50 features. Capping at the number of available
# features keeps the bar positions and values the same length.
num = min(50, len(feature_importances))
ylocs = np.arange(num)
# get the feature importance for top num and sort in reverse order
# (so the most important feature ends up at the top of the horizontal chart)
values_to_plot = feature_importances.iloc[:num].values.ravel()[::-1]
feature_labels = list(feature_importances.iloc[:num].index)[::-1]

plt.figure(num=None, figsize=(8, 15), dpi=80, facecolor='w', edgecolor='k');
plt.barh(ylocs, values_to_plot, align = 'center')
plt.ylabel('Features')
plt.xlabel('Importance Score')
# These importances come from the decision tree, so the title says so
# (it previously repeated the random forest title).
plt.title('Feature Importance Score - Decision Tree')
plt.yticks(ylocs, feature_labels)
plt.show()

It looks like the most important variables for both the random forest and the decision tree are continuous variables. This may be because a continuous variable can be split many times, whereas a categorical variable offers only a few possible splits.

It may be worth reducing the number of variables to top N positive and negative features or use PCA.

## Hyperparameter tuning
Hyperparameter tuning is about optimizing the parameters used in the models. For example, what is the maximum depth for your random forest? 

A grid search tests all possible combinations over a grid of values, which is very computationally intensive. To keep the run time manageable, I will use a randomized search (`RandomizedSearchCV`), which evaluates a random sample of combinations from the grid. Most of this section is based on this Medium blog post (https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74) by William Koehrsen.

I will optimize the hyper parameters for stochastic gradient descent, random forest and gradient boosting classifier. 

In [None]:
# Display the parameters of the baseline random forest.
rf.get_params()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# number of trees: 200, 400, 600, 800
n_estimators = range(200,1000,200)
# maximum depth of the tree: 1 through 9
max_depth = range(1,10,1)
# minimum number of samples to split a node: 2, 4, 6, 8
min_samples_split = range(2,10,2)
# criterion for evaluating a split
criterion = ['gini','entropy']

# candidate values the randomized search draws parameter combinations from

random_grid = {'n_estimators':n_estimators,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'criterion':criterion}

print(random_grid)

I will use auc to evaluate hyperparameters using RandomizedSearchCV.

In [None]:
from sklearn.metrics import make_scorer, roc_auc_score
# Use scikit-learn's built-in 'roc_auc' scorer, which ranks samples by the
# estimator's continuous scores (predict_proba / decision_function).
# make_scorer(roc_auc_score) with default arguments would call predict() and
# compute the AUC from hard 0/1 labels, which collapses the ROC curve to a
# single operating point and disagrees with the probability-based AUC used
# everywhere else in this notebook.
auc_scoring = 'roc_auc'

In [None]:
# create the randomized search cross-validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 20, cv = 5, scoring=auc_scoring,
                               verbose = 1, random_state = 42)

* scoring = evaluation metric used to pick the best model
* n_iter = number of different combinations
* cv = number of cross-validation splits

Increasing n_iter and cv will increase run-time and decrease chance of overfitting. Note that the number of variables and grid size also influences the runtime. Cross-validation is a technique for splitting the data multiple times to get a better estimate of the performance metric. 

In [None]:
import time
# fit the random search model (this will take a few minutes)
t1 = time.time()
rf_random.fit(X_train_tf, y_train)
t2 = time.time()
# elapsed wall-clock time of the search, in seconds
print(t2-t1)

In [None]:
# Parameter combination that achieved the best cross-validated score.
rf_random.best_params_

 Let's analyze the performance of the best model compared to the baseline model.

In [None]:
# Baseline: the max_depth=6 random forest fitted earlier.
y_train_preds = rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds = rf.predict_proba(X_valid_tf)[:, 1]

print('Baseline Random Forest')
rf_train_auc_base = roc_auc_score(y_train, y_train_preds)
rf_valid_auc_base = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f'%(rf_train_auc_base))
print('Validation AUC:%.3f'%(rf_valid_auc_base))

# Optimized: the best estimator found by the randomized search.
print('Optimized Random Forest')
best_rf = rf_random.best_estimator_
y_train_preds_random = best_rf.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random = best_rf.predict_proba(X_valid_tf)[:, 1]

# These overwrite the baseline rf_train_auc / rf_valid_auc computed earlier.
rf_train_auc = roc_auc_score(y_train, y_train_preds_random)
rf_valid_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f'%(rf_train_auc))
print('Validation AUC:%.3f'%(rf_valid_auc))

Optimize stochastic gradient descent

In [None]:
# Regularisation options. None (the Python object, not the string 'none')
# is the documented way to disable the penalty; current scikit-learn
# releases reject the string, which would make every sampled 'none'
# candidate fail to fit.
penalty = [None,'l2','l1']
# maximum number of passes over the training data: 100, 200, 300, 400
max_iter = range(100,500,100)
# regularisation strength
alpha = [0.001,0.003,0.01,0.03,0.1,0.3]
random_grid_sgdc = {'penalty':penalty,
              'max_iter':max_iter,
              'alpha':alpha}
# create the randomized search cross-validation
sgdc_random = RandomizedSearchCV(estimator = sgdc, param_distributions = random_grid_sgdc, 
                                 n_iter = 20, cv = 5, scoring=auc_scoring,verbose = 0, 
                                 random_state = 42)

# fit the search and report the elapsed wall-clock time in seconds
t1 = time.time()
sgdc_random.fit(X_train_tf, y_train)
t2 = time.time()
print(t2-t1)

In [None]:
# Parameter combination that achieved the best cross-validated score.
sgdc_random.best_params_

In [None]:
# Baseline: the SGD classifier fitted earlier with default hyperparameters.
y_train_preds = sgdc.predict_proba(X_train_tf)[:, 1]
y_valid_preds = sgdc.predict_proba(X_valid_tf)[:, 1]

print('Baseline sgdc')
sgdc_train_auc_base = roc_auc_score(y_train, y_train_preds)
sgdc_valid_auc_base = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f'%(sgdc_train_auc_base))
print('Validation AUC:%.3f'%(sgdc_valid_auc_base))

# Optimized: the best estimator found by the randomized search.
print('Optimized sgdc')
best_sgdc = sgdc_random.best_estimator_
y_train_preds_random = best_sgdc.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random = best_sgdc.predict_proba(X_valid_tf)[:, 1]

# These overwrite the baseline sgdc_train_auc / sgdc_valid_auc computed earlier.
sgdc_train_auc = roc_auc_score(y_train, y_train_preds_random)
sgdc_valid_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f'%(sgdc_train_auc))
print('Validation AUC:%.3f'%(sgdc_valid_auc))

 ## Optimize gradient boosting classifier

In [None]:
# Candidate values for the gradient boosting search.
n_estimators = range(100, 500, 100)    # number of trees: 100, 200, 300, 400
max_depth = range(1, 5, 1)             # maximum depth of each tree: 1 through 4
learning_rate = [0.001, 0.01, 0.1]     # learning rate

random_grid_gbc = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
}

# Randomized search: 20 sampled combinations, 2-fold cross-validation each.
gbc_random = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=random_grid_gbc,
    n_iter=20,
    cv=2,
    scoring=auc_scoring,
    verbose=0,
    random_state=42,
)

# Fit the search and report the elapsed wall-clock time in seconds.
t1 = time.time()
gbc_random.fit(X_train_tf, y_train)
t2 = time.time()
elapsed_seconds = t2 - t1
print(elapsed_seconds)

In [None]:
# Parameter combination that achieved the best cross-validated score.
gbc_random.best_params_

In [None]:
# Baseline: the gradient boosting classifier fitted earlier.
y_train_preds = gbc.predict_proba(X_train_tf)[:, 1]
y_valid_preds = gbc.predict_proba(X_valid_tf)[:, 1]

print('Baseline gbc')
gbc_train_auc_base = roc_auc_score(y_train, y_train_preds)
gbc_valid_auc_base = roc_auc_score(y_valid, y_valid_preds)
print('Training AUC:%.3f'%(gbc_train_auc_base))
print('Validation AUC:%.3f'%(gbc_valid_auc_base))

# Optimized: the best estimator found by the randomized search.
print('Optimized gbc')
best_gbc = gbc_random.best_estimator_
y_train_preds_random = best_gbc.predict_proba(X_train_tf)[:, 1]
y_valid_preds_random = best_gbc.predict_proba(X_valid_tf)[:, 1]

# These overwrite the baseline gbc_train_auc / gbc_valid_auc computed earlier.
gbc_train_auc = roc_auc_score(y_train, y_train_preds_random)
gbc_valid_auc = roc_auc_score(y_valid, y_valid_preds_random)
print('Training AUC:%.3f'%(gbc_train_auc))
print('Validation AUC:%.3f'%(gbc_valid_auc))

In [None]:

# Validation AUC of each baseline model next to its tuned counterpart.
# This replaces the earlier df_results table of per-split metrics.
tuning_rows = [
    ('SGD', 'base', sgdc_valid_auc_base),
    ('SGD', 'optimized', sgdc_valid_auc),
    ('RF', 'base', rf_valid_auc_base),
    ('RF', 'optimized', rf_valid_auc),
    ('GB', 'base', gbc_valid_auc_base),
    ('GB', 'optimized', gbc_valid_auc),
]
df_results = pd.DataFrame(tuning_rows, columns=['classifier', 'model', 'auc'])

In [None]:

# Display the baseline-vs-optimized validation AUC table.
df_results

In [None]:
# Validation AUC per classifier, baseline vs. optimized.
label_size = 15
ax = sns.barplot(data=df_results, x="classifier", y="auc", hue="model")
ax.set_xlabel('Classifier', fontsize=label_size)
ax.set_ylabel('AUC', fontsize=label_size)
ax.tick_params(labelsize=label_size)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., fontsize=label_size)

plt.show()


Slight improvement on validation dataset. It looks like the best among the three is gradient boosting classifier. 

In [None]:
# save the model to use it on new data.
import pickle
pickle.dump(gbc_random.best_estimator_, open('model\gradient_boost_classifier_trained.pkl', 'wb'),protocol = 4)

## Model Evaluation
Evaluate the performance of the trained model on the test dataset.

In [None]:
# Feature matrix and labels of the held-out test split.
X_test = df_test[col2use].values
y_test = df_test['OUTPUT_LABEL'].values

# Reload the scaler fitted on the training data and apply it to the test set.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook. The context manager closes the file handle, and the forward
# slash avoids the invalid "\s" escape of the previous literal.
with open('data/scaler.sav', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
X_test_tf = scaler.transform(X_test)

In [None]:
# Reload the tuned gradient boosting model saved above.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook. The context manager closes the file handle, and the forward
# slash avoids the invalid "\g" escape of the previous literal.
with open('model/gradient_boost_classifier_trained.pkl', 'rb') as model_file:
    trained_model = pickle.load(model_file)

In [None]:
# Probability of the second class (column 1) from the reloaded model on the
# training, validation and test splits.
y_train_preds = trained_model.predict_proba(X_train_tf)[:,1]
y_valid_preds = trained_model.predict_proba(X_valid_tf)[:,1]
y_test_preds = trained_model.predict_proba(X_test_tf)[:,1]

In [None]:
# Decision threshold for turning probabilities into class predictions.
thresh = 0.5

# Report the final model's metrics on each split.
print('Training:')
(train_auc, train_accuracy, train_recall,
 train_precision, train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(valid_auc, valid_accuracy, valid_recall,
 valid_precision, valid_specificity) = print_report(y_valid, y_valid_preds, thresh)
print('Test:')
(test_auc, test_accuracy, test_recall,
 test_precision, test_specificity) = print_report(y_test, y_test_preds, thresh)

In [None]:
from sklearn.metrics import roc_curve

# ROC curve points for each split of the final model.
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_valid, y_valid_preds)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, y_test_preds)

# Matching AUC values, shown in the legend.
auc_train = roc_auc_score(y_train, y_train_preds)
auc_valid = roc_auc_score(y_valid, y_valid_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

# One curve per split: (false positive rate, true positive rate, line style, legend text).
roc_curves = (
    (fpr_train, tpr_train, 'r-', 'Train AUC:%.3f'%auc_train),
    (fpr_valid, tpr_valid, 'b-', 'Valid AUC:%.3f'%auc_valid),
    (fpr_test, tpr_test, 'g-', 'Test AUC:%.3f'%auc_test),
)
for curve_fpr, curve_tpr, line_style, legend_text in roc_curves:
    plt.plot(curve_fpr, curve_tpr, line_style, label=legend_text)

# Diagonal = performance of a random classifier.
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()