# Build a model 
## create training, validation and test samples.

In [None]:
import pandas as pd

In [None]:
# Load the feature table written by the preprocessing step.
# A forward slash is used as the separator: it resolves on Windows, Linux and
# macOS alike, whereas a backslash only works on Windows and depends on "\d"
# being an unrecognised (deprecated) string escape.
df = pd.read_csv('data/df_data_features.csv')

The first thing I like to do is to shuffle the samples in case there was some order (e.g. all positive samples on top). Here n is the number of data points. random_state is a seed for the random number generator. This allows you to obtain reproducible results when sampling from the DataFrame.

In [None]:
# Shuffle every row once (n is the full row count) and rebuild a clean 0..n-1 index.
df = df.sample(n=len(df), random_state=42).reset_index(drop=True)

# Hold out 30% of the rows; whatever is left is the training set.
df_holdout = df.sample(frac=0.30, random_state=42)
df_train = df.drop(df_holdout.index)

# Split the hold-out rows in half: one half for testing, the rest for validation.
df_test = df_holdout.sample(frac=0.5, random_state=42)
df_valid = df_holdout.drop(df_test.index)

# Report each split as a fraction of the whole data set.
n_total = len(df)
print('test data size: %.3f'%(len(df_test)/n_total))
print('validation data size: %.3f'%(len(df_valid)/n_total))
print('training data size: %.3f'%(len(df_train)/n_total))

In [None]:
# Sanity check: the three splits together must account for every row of df.
print('sample count (n = %d)' % len(df))
n_assigned = sum(len(part) for part in (df_test, df_valid, df_train))
assert len(df) == n_assigned, 'Not all samples used.'

In [None]:
def calc_prevalence(y_actual):
    """Return the sum of ``y_actual`` divided by its length.

    For 0/1 labels this is the fraction of positive samples.
    """
    n_positive = sum(y_actual)
    n_samples = len(y_actual)
    return n_positive / n_samples

In [None]:
# Print the size and positive-label fraction of each split.
for _template, _split in (
    ('Test prevalence(n = %d):%.3f', df_test),
    ('Valid prevalence(n = %d):%.3f', df_valid),
    ('Train all prevalence(n = %d):%.3f', df_train),
):
    print(_template % (len(_split), calc_prevalence(_split.OUTPUT_LABEL.values)))

Is the data ready to be dropped into a predictive model?

The dataset is imbalanced, with more negatives than positives, so a model trained on it might simply assign every sample to the negative class.

Let me create a balanced training data set by sub-sampling. There may be other approaches to create a balanced training data set.

In [None]:

# Boolean mask of the positive rows; its complement selects the negatives.
rows_pos = df_train.OUTPUT_LABEL == 1
df_train_pos = df_train[rows_pos]
df_train_neg = df_train[~rows_pos]

# Down-sample the negatives to the number of positives, then stack both groups.
n_pos = len(df_train_pos)
neg_subsample = df_train_neg.sample(n=n_pos, random_state=42)
df_train_balanced = pd.concat([df_train_pos, neg_subsample], axis=0)

# Shuffle so the two classes are interleaved, and rebuild the index.
df_train_balanced = (
    df_train_balanced
    .sample(n=len(df_train_balanced), random_state=42)
    .reset_index(drop=True)
)

print('Balanced training data prevalence(n = %d):%.3f'%(len(df_train_balanced), calc_prevalence(df_train_balanced.OUTPUT_LABEL.values)))


In [None]:
# Persist every split so later notebooks can reload exactly the same rows.
# Forward slashes keep the paths portable: with a backslash, non-Windows systems
# would write a file literally named "data\df_train.csv" in the working directory.
df_train_balanced.to_csv('data/df_train_balanced.csv',index=False)
df_train.to_csv('data/df_train.csv',index=False)
df_valid.to_csv('data/df_valid.csv',index=False)
df_test.to_csv('data/df_test.csv',index=False)

In [None]:
# Feature columns = every column except the label and the 'Unnamed: 0' column.
# list.remove raises ValueError if either name is missing.
col2use = list(df_train.columns)
for _excluded in ('OUTPUT_LABEL', 'Unnamed: 0'):
    col2use.remove(_excluded)

In [None]:
# Feature matrix of the full (imbalanced) training split; no labels are extracted for it here.
X_train_all = df_train[col2use].values

# Features and labels actually used for fitting (balanced) and for validation.
X_train, y_train = df_train_balanced[col2use].values, df_train_balanced.OUTPUT_LABEL.values
X_valid, y_valid = df_valid[col2use].values, df_valid.OUTPUT_LABEL.values

print('Training All shapes:',X_train_all.shape)
print('Training shapes:',X_train.shape, y_train.shape)
print('Validation shapes:',X_valid.shape, y_valid.shape)

The features in this dataset are on different scales. Before feeding the data to a machine learning model, the features should be brought to a common scale; here they are standardized with `StandardScaler`.

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the full (imbalanced) training features ...
scaler = StandardScaler().fit(X_train_all)

# ... and apply that same fitted scaler to the balanced training set and to validation.
X_train_tf, X_valid_tf = scaler.transform(X_train), scaler.transform(X_valid)

In [None]:
#Save the scaler to use it with test data
import pickle

# The context manager flushes and closes the file as soon as the dump finishes;
# a bare open(...) left the handle to be closed whenever it was garbage-collected.
# Forward slash keeps the path portable across operating systems.
with open('data/scaler.sav', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### utility functions

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
def calc_specificity(y_actual, y_pred, thresh=0.5):
    """Return the specificity (true-negative rate) at a score threshold.

    A sample counts as predicted negative when its score is at or below
    ``thresh``. This is the exact complement of the ``y_pred > thresh`` rule
    that ``print_report`` uses for the positive class, so a score equal to the
    threshold (e.g. a 50/100 neighbour vote) is counted as a negative here too.

    Parameters
    ----------
    y_actual : array of 0/1 labels supporting elementwise ``==`` (e.g. a NumPy array).
    y_pred : array of predicted scores, same length as ``y_actual``.
    thresh : float, decision threshold (default 0.5).

    Returns
    -------
    Number of true negatives divided by the number of actual negatives.
    """
    is_negative = y_actual == 0
    predicted_negative = y_pred <= thresh
    return sum(predicted_negative & is_negative) / sum(is_negative)

def print_report(y_actual, y_pred, thresh=0.5):
    """Print and return AUC, accuracy, recall, precision and specificity.

    AUC is computed from the raw scores in ``y_pred``; accuracy, recall and
    precision are computed on the hard labels ``y_pred > thresh``; specificity
    is delegated to ``calc_specificity``. The prevalence of ``y_actual`` is
    printed but not returned.

    Returns
    -------
    tuple ``(auc, accuracy, recall, precision, specificity)``
    """
    # Hard labels: a score strictly above the threshold is a positive call.
    y_hat = y_pred > thresh

    metrics = (
        roc_auc_score(y_actual, y_pred),
        accuracy_score(y_actual, y_hat),
        recall_score(y_actual, y_hat),
        precision_score(y_actual, y_hat),
        calc_specificity(y_actual, y_pred, thresh),
    )
    line_formats = ('AUC:%.3f', 'accuracy:%.3f', 'recall:%.3f', 'precision:%.3f', 'specificity:%.3f')
    for line_format, value in zip(line_formats, metrics):
        print(line_format % value)
    print('prevalence:%.3f'%calc_prevalence(y_actual))
    print(' ')
    return metrics

# Model selection

## KNN

In [None]:
# k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# 100-neighbour KNN fitted on the scaled, balanced training set.
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(X=X_train_tf, y=y_train)

In [None]:
# Decision threshold applied to the predicted scores in print_report.
thresh = 0.5

# Hard class labels on the validation set (used by the confusion-matrix and report cells).
Y_knn = knn.predict(X_valid_tf)

# Column 1 of predict_proba: the score of the second class in knn.classes_
# (label 1 when the targets are 0/1).
y_train_preds = knn.predict_proba(X_train_tf)[:,1]
y_valid_preds_knn = knn.predict_proba(X_valid_tf)[:,1]

print('KNN')
print('Training:')
(knn_train_auc, knn_train_accuracy, knn_train_recall,
 knn_train_precision, knn_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(knn_valid_auc, knn_valid_accuracy, knn_valid_recall,
 knn_valid_precision, knn_valid_specificity) = print_report(y_valid, y_valid_preds_knn, thresh)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,roc_curve, auc,roc_auc_score
# ROC curve of the KNN model on the validation set.
# roc_auc is only used for the legend label below.
roc_auc = roc_auc_score(y_valid, y_valid_preds_knn)
# thresholds is returned by roc_curve but not used in this cell.
fp_rate, tp_rate, thresholds = roc_curve(y_valid, y_valid_preds_knn)
plt.figure()
plt.plot(fp_rate, tp_rate, label='KNN (area = %0.2f)' % roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
# Upper y-limit slightly above 1 so the curve is not drawn on the frame.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC - KNN')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()


In [None]:
#plot confusion matrix
import seaborn as sns
# Heatmap of the validation confusion matrix for the KNN hard predictions (Y_knn).
# Trailing semicolons suppress the repr of the returned matplotlib objects.
plt.figure(figsize=(9,9))
# NOTE(review): fmt=".3f" prints the cell values with three decimals; if these are
# plain counts, fmt="d" may read better - confirm before changing.
sns.heatmap(confusion_matrix(y_valid, Y_knn), annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Greens_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# knn.score re-predicts on X_valid_tf to obtain the score shown in the title.
title = 'Accuracy Score: {0}'.format(knn.score(X_valid_tf , y_valid))
plt.title(title, size = 12);

In [None]:
# Checking the summary of classification
from sklearn.metrics import classification_report
# Text report for the KNN validation predictions, with the two classes labelled NO / YES.
knn_report = classification_report(y_valid, Y_knn, target_names=['NO', 'YES'])
print(knn_report)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with a fixed seed, fitted on the scaled, balanced training set.
lr = LogisticRegression(random_state=42)
lr.fit(X=X_train_tf, y=y_train)

In [None]:
# Hard class labels on the validation set (used by the confusion-matrix cell).
Y_lr = lr.predict(X_valid_tf)

# Column 1 of predict_proba: the score of the second class in lr.classes_
# (label 1 when the targets are 0/1).
y_train_preds = lr.predict_proba(X_train_tf)[:,1]
y_valid_preds_lr = lr.predict_proba(X_valid_tf)[:,1]

# thresh is the value set in the KNN evaluation cell.
print('Logistic Regression')
print('Training:')
(lr_train_auc, lr_train_accuracy, lr_train_recall,
 lr_train_precision, lr_train_specificity) = print_report(y_train, y_train_preds, thresh)
print('Validation:')
(lr_valid_auc, lr_valid_accuracy, lr_valid_recall,
 lr_valid_precision, lr_valid_specificity) = print_report(y_valid, y_valid_preds_lr, thresh)

In [None]:
from sklearn.metrics import confusion_matrix,roc_curve, auc,roc_auc_score
# ROC curve of the logistic-regression model on the validation set.
# roc_auc is only used for the legend label below.
roc_auc = roc_auc_score(y_valid, y_valid_preds_lr)
# thresholds is returned by roc_curve but not used in this cell.
fp_rate, tp_rate, thresholds = roc_curve(y_valid, y_valid_preds_lr)
plt.figure()
plt.plot(fp_rate, tp_rate, label='Logistic Regression (area = %0.2f)' % roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
# Upper y-limit slightly above 1 so the curve is not drawn on the frame.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC -Logistic Regression ')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Heatmap of the validation confusion matrix for the logistic-regression hard predictions (Y_lr).
# Relies on sns and confusion_matrix imported in earlier cells.
plt.figure(figsize=(9,9))
# NOTE(review): fmt=".3f" prints the cell values with three decimals; if these are
# plain counts, fmt="d" may read better - confirm before changing.
sns.heatmap(confusion_matrix(y_valid, Y_lr), annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Greens_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# lr.score re-predicts on X_valid_tf to obtain the score shown in the title.
all_sample_title = 'Accuracy Score: {0}'.format(lr.score(X_valid_tf , y_valid))
plt.title(all_sample_title, size = 15);