In [1]:
# import dependencies

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# read data
x_train = pd.read_csv("data/processed_data/x_train.csv")
x_val = pd.read_csv("data/processed_data/x_val.csv")
y_train = pd.read_csv("data/processed_data/y_train.csv")
y_val = pd.read_csv("data/processed_data/y_val.csv")
x_test = pd.read_csv("data/processed_data/x_test.csv")

In [None]:
y_train.value_counts()

Logistic regression train dataset

In [None]:
# Logistic regression on unbalanced dataset

# Normalize x data 

scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

# Model development

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

params = {'penalty':['l1','l2'],
          'class_weight':['balanced', None],
          'C':np.linspace(0.01,10,100)}
model = LogisticRegression()
lr_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
lr_model.fit(x_train_scaled, 
             y_train.values.ravel())

In [None]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = lr_model.predict_proba(x_train_scaled)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = lr_model.predict_proba(x_val_scaled)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:",roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

In [3]:
# RandomForest classifier

# model development

params = {'n_estimators':[100, 500],
          'min_samples_split':[2, 5, None],
          'min_samples_leaf':[1,10,100]}

model = RandomForestClassifier()
rf_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
rf_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

RandomizedSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'min_samples_leaf': [1, 100, 100],
                                        'min_samples_split': [2, 5, None],
                                        'n_estimators': [100, 500]},
                   scoring='roc_auc', verbose=20)

In [4]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = rf_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = rf_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7276970537179381
confusion_matrix: 
 [[9073 6854]
 [ 855 3116]]
classification_report: 
               precision    recall  f1-score   support

           0       0.91      0.57      0.70     15927
           1       0.31      0.78      0.45      3971

    accuracy                           0.61     19898
   macro avg       0.61      0.68      0.57     19898
weighted avg       0.79      0.61      0.65     19898



In [5]:
# GradientBoostingClassifier classifier

# model development

params = {'n_estimators':[100, 500],
        'learning_rate':[0.1, 0.01],
        'min_samples_leaf':[1,10,100],
        'min_samples_split': [2, 5],
         'tol': [0.001, 0.0001]}

model = GradientBoostingClassifier()
gb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
gb_model.fit(x_train,
             y_train.values.ravel())

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  4

RandomizedSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.01],
                                        'min_samples_leaf': [1, 100, 100],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [100, 500],
                                        'tol': [0.001, 0.0001]},
                   scoring='roc_auc', verbose=20)

In [6]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = gb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = gb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.7352767522470984
confusion_matrix: 
 [[9271 6656]
 [ 899 3072]]
classification_report: 
               precision    recall  f1-score   support

           0       0.91      0.58      0.71     15927
           1       0.32      0.77      0.45      3971

    accuracy                           0.62     19898
   macro avg       0.61      0.68      0.58     19898
weighted avg       0.79      0.62      0.66     19898



In [11]:
from sklearn.naive_bayes import GaussianNB

# model development

params = {'var_smoothing':[1e-08, 1e-09]}

model = GaussianNB()
nb_model = RandomizedSearchCV(model, 
                              param_distributions = params, 
                              n_iter = 10, 
                              scoring = 'roc_auc', 
                              cv = 10, 
                              n_jobs = -1, 
                              verbose = 20)
nb_model.fit(x_train,
             y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    7.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:    8.2s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  20 

RandomizedSearchCV(cv=10, estimator=GaussianNB(), n_jobs=-1,
                   param_distributions={'var_smoothing': [1e-08, 1e-09]},
                   scoring='roc_auc', verbose=20)

In [12]:
# selecting the cut-off for determining hardclasses

# prediction on training dataset
predicted_train = nb_model.predict_proba(x_train)[:,1]
actual = y_train.values.ravel()

# Kolmogorov-Smirnov statistic. 
# The higher the ks_stat, the more efficient is the model at capturing the target (Ones)

cutoffs = np.linspace(0.01,0.99,99)
KS_all = []
for cutoff in cutoffs:
    
    predicted = (predicted_train > cutoff).astype(int)
    len(predicted)
    TP = ((predicted == 1) & (actual == 1)).sum()
    TN = ((predicted == 0) & (actual == 0)).sum()
    FP = ((predicted == 1) & (actual == 0)).sum()
    FN = ((predicted == 0) & (actual == 1)).sum()

    P = TP+FN
    N = TN+FP

    KS = (TP/P)-(FP/N)
    KS_all.append(KS)

# list(zip(cutoffs,KS_all))

cutoff_optimum = cutoffs[KS_all == max(KS_all)][0]

predicted_val = nb_model.predict_proba(x_val)[:,1]
val_classes = (predicted_val>cutoff_optimum).astype(int)

cm = confusion_matrix(y_val, 
                     val_classes)
class_report = classification_report(y_val, 
                                    val_classes)
print("roc_auc_score:", roc_auc_score(y_val, predicted_val)) # AUROC represents the likelihood of the model distinguishing observations from two classes.
print("confusion_matrix: \n",cm)
print("classification_report: \n",class_report)

roc_auc_score: 0.5900805657365495
confusion_matrix: 
 [[8229 7698]
 [1355 2616]]
classification_report: 
               precision    recall  f1-score   support

           0       0.86      0.52      0.65     15927
           1       0.25      0.66      0.37      3971

    accuracy                           0.55     19898
   macro avg       0.56      0.59      0.51     19898
weighted avg       0.74      0.55      0.59     19898



In [10]:
submissions=pd.DataFrame({'Junk':gb_model.predict_proba(x_test)[:,1]})
submissions.to_csv('output/submission_4.csv',index=False)