In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier

from sklearn.preprocessing import RobustScaler

import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_pickle('final_df.pkl')
X = df.drop(columns='label')
y = df['label']

In [3]:
# split into training and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77, stratify=y)

In [4]:
# let's upsample the human training data
# we'll use random oversampling for now
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train_res, y_train_res = ros.fit_sample(X_train,y_train)

In [5]:
# let's impute the missing default_profile_image data for our training and test sets
# we need to scale before using KNN imputation 
# we need to perform scaling and imputation on the training set, then pull the parameters for the test
# we'll use RobustScaler since we have a lot of outliers -- our means are rather useless

# fit the scaler to our resampled X_train and transform X_train and X_test
scaler = RobustScaler().fit(X_train_res)
X_train_res_sca = pd.DataFrame(scaler.transform(X_train_res), columns=X_train.columns)
X_test_sca = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

In [None]:
X_test_sca

In [6]:
# This will need to be undone and placed in a pipeline when we do validation
from sklearn.impute import KNNImputer

# fit the imputer to our scaled X_train, use it to impute train and test and cast to DataFrames
imputer = KNNImputer(n_neighbors=3).fit(X_train_res_sca)
X_train_final = pd.DataFrame(imputer.transform(X_train_res_sca), columns=X.columns)
X_test_final = pd.DataFrame(imputer.transform(X_test_sca), columns=X.columns)

In [None]:
y_train.value_counts()

In [None]:
# round the imputed to 0 and 1, +1 to return to 0-1 scale
X_train_final['default_profile'] = X_train_final['default_profile'].round() + 1
X_test_final['default_profile'] = X_test_final['default_profile'].round() + 1

In [None]:
from collections import Counter
Counter(X_train_final['default_profile'])

#### Model Pipelining

In [None]:
# Logit with C=1, 10k iterations
logit = LogisticRegression(C=1, max_iter=10000)
logit.fit(X_train_final, y_train_res)
log_pred = logit.predict(X_test_final)
log_F1 = f1_score(y_test, log_pred)
log_recall = recall_score(y_test, log_pred)
print("The score for logistic regression is")
print("Training acc: {:6.2f}%".format(100*logit.score(X_train_final, y_train_res)))
print("Test acc: {:6.2f}%".format(100*logit.score(X_test_final, y_test)))
print("Test recall: {:6.2f}%".format(100*log_recall))
print("Test F1: {:6.2f}%".format(100*log_F1))

In [7]:
# K=5 KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_final, y_train_res)
knn_pred = knn.predict(X_test_final)
knn_F1 = f1_score(y_test, knn_pred)
knn_recall = recall_score(y_test, knn_pred)
print("The score for kNN is")
print("Training acc: {:6.2f}%".format(100*knn.score(X_train_final, y_train_res)))
print("Test set acc: {:6.2f}%".format(100*knn.score(X_test_final, y_test)))
print("Test recall: {:6.2f}%".format(100*knn_recall))
print("Test F1: {:6.2f}%".format(100*knn_F1))

The score for kNN is
Training acc:  96.18%
Test set acc:  93.63%
Test recall:  93.64%
Test F1:  95.00%


In [None]:
# NaiveBayes
nb = GaussianNB()
nb.fit(X_train_final, y_train_res)
nb_pred = nb.predict(X_test_final)
nb_F1 = f1_score(y_test, nb_pred)
nb_recall = recall_score(y_test, nb_pred)
print("The score for Naive Bayes is")
print("Training acc: {:6.2f}%".format(100*nb.score(X_train_final, y_train_res)))
print("Test set acc: {:6.2f}%".format(100*nb.score(X_test_final, y_test)))
print("Test recall: {:6.2f}%".format(100*nb_recall))
print("Test F1: {:6.2f}%".format(100*nb_F1))

In [None]:
tree = DecisionTreeClassifier()
tree.fit(X_train_final, y_train_res)
tree_pred = tree.predict(X_test_final)
tree_F1 = f1_score(y_test, tree_pred)
tree_recall = recall_score(y_test, tree_pred)
print("The score for Decision Tree is")
print("Training acc: {:6.2f}%".format(100*tree.score(X_train_final, y_train_res)))
print("Test set acc: {:6.2f}%".format(100*tree.score(X_test_final, y_test)))
print("Test recall: {:6.2f}%".format(100*tree_recall))
print("Test F1: {:6.2f}%".format(100*tree_F1))

In [None]:
forest = RandomForestClassifier()
forest.fit(X_train_final, y_train_res)
forest_pred = forest.predict(X_test_final)
forest_F1 = f1_score(y_test, forest_pred)
forest_recall = recall_score(y_test,forest_pred)
print("The score for Random Forest is")
print("Training acc: {:6.2f}%".format(100*forest.score(X_train_final, y_train_res)))
print("Test set acc: {:6.2f}%".format(100*forest.score(X_test_final, y_test)))
print("Test recall: {:6.2f}%".format(100*forest_recall))
print("Test F1: {:6.2f}%".format(100*forest_F1))

In [None]:
confusion_matrix(y_test, forest_pred)

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr())

In [None]:
from yellowbrick.model_selection import FeatureImportances
forest2 = RandomForestClassifier()
viz = FeatureImportances(forest2)
viz.fit(X_train_final, y_train_res)
viz.show()

In [10]:
from sklearn.pipeline import Pipeline
knn_pipe = Pipeline([('scaler', RobustScaler()), ('imputer', KNNImputer(n_neighbors=3)), ('knn', KNeighborsClassifier(n_neighbors=5))])
knn_pipe.fit(X_train_res, y_train_res)
knn_pipe.score(X_test, y_test)

0.936270248913473

In [13]:
rf_pipe = Pipeline([('scaler', RobustScaler()), ('imputer', KNNImputer(n_neighbors=3)), ('rf', RandomForestClassifier())])
rf_pipe.fit(X_train_res, y_train_res)
rf_pipe.score(X_test, y_test)

0.9540497826945871

In [18]:
lr_pipe = Pipeline([('scaler', RobustScaler()), ('imputer', KNNImputer(n_neighbors=3)), ('lr', LogisticRegression())])
lr_pipe.fit(X_train_res, y_train_res)
lr_pipe.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8596207032793363

In [15]:
knn_f1 = f1_score(y_test, knn_pipe.predict(X_test))

0.949961222273926

In [17]:
rf_f1 = f1_score(y_test, rf_pipe.predict(X_test))

0.9640461248338331

In [19]:
lr_f1 = f1_score(y_test, lr_pipe.predict(X_test))

0.89187790998448

In [27]:
lr_params = {'lr__max_iter':[2000],
              'lr__penalty':['l1','l2'], 
              'lr__C':np.logspace(-4,4,20)}

clf_lr = GridSearchCV(lr_pipe, param_grid = lr_params, cv = 5, verbose = 100, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 45.2min


KeyboardInterrupt: 

In [None]:
clf_lr = GridSearchCV(lr_pipe)