# Model Selection Notebook

## Importing and installing libraries

In [1]:
# installing missingpy for missforest and impyute for mice
!pip install impyute



You should consider upgrading via the 'C:\Users\Joe\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
!pip install missingpy



You should consider upgrading via the 'C:\Users\Joe\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
# import the necessary libraries
# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# imputation and resampling
from impute_functions import *
from imblearn.over_sampling import SMOTE, SMOTENC

# models
from catboost import CatBoostClassifier as catboost
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier as xgb
from sklearn.ensemble import AdaBoostClassifier as adaboost
from lightgbm import LGBMClassifier as lightgbm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import accuracy_score

# misc.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
import pickle
from sklearn.model_selection import GridSearchCV
from feature_engineer import *



In [3]:
# Silence sklearn's DataConversionWarning for the remainder of the notebook.
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings('ignore', category=DataConversionWarning)

## Data entry

In [4]:
# Column names applied to the raw feature file; the file's own header row is skipped.
names = [
    'ID', 'Expense', 'Income', 'Loan type', 'Occupation type',
    'Age', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5',
]

# Features and labels, both indexed by the ID column.
X = pd.read_csv('dataset/train_x.csv', names=names, skiprows=1, index_col='ID')
y = pd.read_csv('dataset/train_y.csv', index_col='ID')

In [None]:
# Encode some categorical columns as labels with the project helper ManualEncoder
# (presumably provided by one of the star-imported project modules; which columns
# it touches is not visible in this notebook).
X = ManualEncoder(X)
# Cache the encoded frame on disk so later cells can reload it without re-encoding.
X.to_csv('Encoded_X.csv')

In [None]:
# Checkpoint: reload the cached encoded features together with the labels so the
# notebook can be resumed from here without re-running the encoding cell.
X = pd.read_csv('Encoded_X.csv', index_col = 'ID')
y = pd.read_csv('dataset/train_y.csv', index_col = 'ID')

In [None]:
# display the encoded feature table
X

In [None]:
# Random-forest imputation of the missing values. [2, 3, 4] are the positional
# indices of the categorical features (Loan type, Occupation type, Age).
X_rf = rf_imputer(X, [2, 3, 4])

# Wrap the imputer's result back into a DataFrame with the original column names.
X_rf = pd.DataFrame(X_rf, columns=X.columns)
# Rebuild the 1-based ID index from the actual number of rows instead of a
# hard-coded 80000, so the cell keeps working if the input size changes.
X_rf.index = np.arange(1, len(X_rf) + 1)
X_rf.index.name = 'ID'

# Cache the imputed data so the slow imputation does not have to be repeated.
X_rf.to_csv('RF_imputed_X.csv')

In [5]:
# Checkpoint: reload the cached random-forest-imputed features and the labels,
# then display the feature table.
X = pd.read_csv('RF_imputed_X.csv', index_col = 'ID')
y = pd.read_csv('dataset/train_y.csv', index_col = 'ID')
X

Unnamed: 0_level_0,Expense,Income,Loan type,Occupation type,Age,Score1,Score2,Score3,Score4,Score5
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1830.943788,14767.28013,1.0,1.0,1.0,0.016885,205.196182,22.521523,600.911200,3464.613291
2,1645.302546,15272.26775,1.0,1.0,0.0,0.240375,194.266317,5.349117,600.888816,3374.921455
3,1555.026392,17482.49734,0.0,1.0,0.0,0.213921,183.529871,-1.054954,598.596944,3331.304886
4,1681.233164,16257.66493,0.0,1.0,0.0,0.303909,191.228965,6.971750,602.447203,3392.275849
5,1777.648916,16316.29914,1.0,0.0,1.0,0.300104,224.074728,11.218489,605.947340,3438.864083
...,...,...,...,...,...,...,...,...,...,...
79996,1470.317116,16659.49663,0.0,2.0,0.0,0.208757,172.526308,-5.332184,596.648261,3290.377932
79997,1923.617480,14910.36890,1.0,1.0,0.0,0.201837,186.252458,15.425841,600.855069,3509.388221
79998,1711.147154,15962.25945,0.0,1.0,1.0,0.226396,196.098150,8.161353,601.360722,3406.734018
79999,1673.822523,15525.27413,1.0,2.0,0.0,0.101090,186.390184,3.381393,596.750750,3388.700770


In [6]:
# basic_clean (from impute_functions.py) deletes the rows whose y is NaN and
# scales the columns named in the list, i.e. every non-categorical feature.
non_cat_cols_X = ['Expense', 'Income'] + ['Score%d' % i for i in range(1, 6)]

X, y = basic_clean(X, y, non_cat_cols_X)

In [7]:
# label the index as 'ID' again after cleaning
X.index.name = 'ID'

# Model evaluation prior to feature engineering

## Splitting

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# column dtypes and non-null counts of the training split
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60877 entries, 44382 to 15795
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Expense          60877 non-null  float64
 1   Income           60877 non-null  float64
 2   Loan type        60877 non-null  float64
 3   Occupation type  60877 non-null  float64
 4   Age              60877 non-null  float64
 5   Score1           60877 non-null  float64
 6   Score2           60877 non-null  float64
 7   Score3           60877 non-null  float64
 8   Score4           60877 non-null  float64
 9   Score5           60877 non-null  float64
dtypes: float64(10)
memory usage: 5.1 MB


The data types are all float. The features Loan type, Occupation type and Age are in reality categorical, but since Loan type and Age are already binary (0/1) encoded and only Occupation type has more than two categories, we may treat them as integers/floats for now.

In [10]:
# class balance of the training labels (count of rows per 'Label' value)
y_train['Label'].value_counts()

0.0    56812
1.0     4065
Name: Label, dtype: int64

We must correct this huge imbalance

## Oversampling

In [11]:
# SMOTE-NC oversampling, applied to the training split only.
# Columns 2, 3 and 4 are flagged as categorical; sampling_strategy=2/3 asks for
# a minority/majority ratio of 2:3 after resampling.
oversampler = SMOTENC(
    categorical_features=[2, 3, 4],
    sampling_strategy=2/3,
    random_state=42,
)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [12]:
# restore the 'ID' index name, which is lost during oversampling
X_train.index.name = 'ID'

In [13]:
# dummy_creator (project helper) creates dummy columns for 'Occupation type',
# the one categorical feature with three categories. It is applied to the train
# and test splits separately and returns the (X, y) pair.
X_train, y_train = dummy_creator(X_train, y_train)
X_test, y_test = dummy_creator(X_test, y_test)

In [151]:
# check the columns of the training split after oversampling and dummy creation
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94686 entries, 0 to 94685
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Expense    94686 non-null  float64
 1   Income     94686 non-null  float64
 2   Loan type  94686 non-null  float64
 3   Age        94686 non-null  float64
 4   Score1     94686 non-null  float64
 5   Score2     94686 non-null  float64
 6   Score3     94686 non-null  float64
 7   Score4     94686 non-null  float64
 8   Score5     94686 non-null  float64
 9   Y          94686 non-null  uint8  
 10  Z          94686 non-null  uint8  
dtypes: float64(9), uint8(2)
memory usage: 6.7 MB



## Model Evaluation (pre-feature engineering)

In [114]:
# Empty score table; every evaluation cell below adds one row per model.
results = pd.DataFrame(
    columns=[
        'Model name',
        'Feature_Engineered_data(Y/N)',
        'F1-score(weighted)',
        'Accuracy',
    ]
)

In [115]:
# We try the train data on a variety of models; catboost, extra-trees, xgboost, adaboost

In [116]:
# display the training features that the models below are fitted on
X_train

Unnamed: 0_level_0,Expense,Income,Loan type,Age,Score1,Score2,Score3,Score4,Score5,Y,Z
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-2.247941,-1.359844,0.0,0.0,-0.144554,-0.903721,-1.115572,-1.056263,-2.247964,0,1
1,0.029614,0.924177,1.0,1.0,0.643451,0.909074,-0.114357,0.950666,0.029617,0,0
2,-0.548192,-1.079564,0.0,0.0,-0.354195,-0.646945,0.414402,-0.492169,-0.548195,1,0
3,-1.053660,0.110537,1.0,1.0,0.687367,0.226460,-0.807300,0.262248,-1.053669,1,0
4,1.063227,-1.374961,0.0,1.0,-1.598246,0.109126,2.085999,0.062605,1.063222,0,0
...,...,...,...,...,...,...,...,...,...,...,...
94681,-0.299023,-0.979383,1.0,0.0,1.310431,-0.206143,-1.718584,-0.342753,-0.299024,0,1
94682,0.615423,1.336259,1.0,1.0,-1.896311,1.201401,0.108299,-0.501115,0.615429,1,0
94683,0.318102,1.619585,1.0,1.0,-2.093309,1.039731,0.027950,-0.715338,0.318108,1,0
94684,0.166797,1.198599,0.0,0.0,-0.123428,-0.261709,-0.282120,-0.375612,0.166801,1,0


In [117]:
# Extra Trees classifier on the data before feature engineering
model = ExtraTreesClassifier()

# fit on the oversampled training split and score on the hold-out split
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Extra Trees',
                         'Feature_Engineered_data(Y/N)': 'N',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     14252
         1.0       0.87      0.86      0.86       968

    accuracy                           0.98     15220
   macro avg       0.93      0.92      0.93     15220
weighted avg       0.98      0.98      0.98     15220



In [118]:
# CatBoost classifier on the data before feature engineering
cat_params = {'depth': 10}
model = catboost(verbose=False, **cat_params)

# fit on the oversampled training split and score on the hold-out split
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Catboost',
                         'Feature_Engineered_data(Y/N)': 'N',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     14252
         1.0       0.86      0.88      0.87       968

    accuracy                           0.98     15220
   macro avg       0.93      0.94      0.93     15220
weighted avg       0.98      0.98      0.98     15220



While the 'class 0' scores are nearly the same, the extra trees classifier improves the precision for 'class 1' at the cost of lowered recall

In [119]:
# Re-fit Extra Trees without 'Score5', the feature that the feature engineering
# report concluded could be removed.
model = ExtraTreesClassifier()

model.fit(X_train.drop(columns='Score5'), y_train)
preds = model.predict(X_test.drop(columns='Score5'))
print(classification_report(y_test, preds))

# The lack of change in the report verifies that 'Score5' can be dropped, as per
# the feature engineering result, without any change in results.

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     14252
         1.0       0.88      0.86      0.87       968

    accuracy                           0.98     15220
   macro avg       0.93      0.92      0.93     15220
weighted avg       0.98      0.98      0.98     15220



In [120]:
# XGBoost classifier on the data before feature engineering
xgb_params = {'max_depth': 17, 'min_child_weight': 1}
model = xgb(**xgb_params)

# fit on the oversampled training split and score on the hold-out split
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'XGBoost',
                         'Feature_Engineered_data(Y/N)': 'N',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     14252
         1.0       0.85      0.87      0.86       968

    accuracy                           0.98     15220
   macro avg       0.92      0.93      0.92     15220
weighted avg       0.98      0.98      0.98     15220



In [121]:
# AdaBoost classifier on the data before feature engineering
model = adaboost()

# fit on the oversampled training split and score on the hold-out split
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Adaboost Classifier:')
print(classification_report(y_test, preds))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Adaboost',
                         'Feature_Engineered_data(Y/N)': 'N',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Adaboost Classifier:
              precision    recall  f1-score   support

         0.0       0.99      0.92      0.95     14252
         1.0       0.42      0.85      0.56       968

    accuracy                           0.92     15220
   macro avg       0.71      0.88      0.76     15220
weighted avg       0.95      0.92      0.93     15220



The Catboost classifier gives the best results in terms of macro and weighted F1-scores

## Model Evaluation (after feature Engineering)

In [14]:
# Start again from the cleaned data: the same 80:20 split as before ...
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ... followed by the same SMOTE-NC oversampling of the training split
# (columns 2, 3 and 4 are the categorical features).
oversampler = SMOTENC(
    categorical_features=[2, 3, 4],
    sampling_strategy=2/3,
    random_state=42,
)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# SMOTE drops the index name, so put it back
X_train.index.name = 'ID'

In [15]:
# Apply feature_engineer (from feature_engineer.py) to the train and test data.
# It is called on each split separately and returns two objects per call; the
# second one (*_bins) is not used in the cells shown here.
# NOTE(review): the meaning of the `False` flag is not visible in this notebook,
# and if the helper derives bins from the data it receives, the train and test
# splits may be binned differently. Confirm in feature_engineer.py.
X_train, X_train_bins = feature_engineer(X_train.reset_index(), False)
X_test, X_test_bins = feature_engineer(X_test.reset_index(), False)

In [16]:
# dummy_creator (project helper) creates dummy columns for 'Occupation type',
# the one categorical feature with three categories. It is applied to the train
# and test splits separately and returns the (X, y) pair.
X_train, y_train = dummy_creator(X_train, y_train)
X_test, y_test = dummy_creator(X_test, y_test)

In [17]:
# Cast the two-category features to int in both splits. The intent is that
# catboost may consider them categorical, as they are already 0/1 encoded.
# NOTE(review): CatBoost normally needs the columns listed in `cat_features` to
# treat them as categorical; an int dtype alone may not be enough - confirm.
for frame in (X_train, X_test):
    for col in ('Loan type', 'Age', 'Age X Loan type'):
        frame[col] = frame[col].astype(int)

##### Trying out various Classifiers:

In [126]:
# Extra Trees classifier on the feature-engineered data
model = ExtraTreesClassifier()

# fit the model and predict on the hold-out split
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Extra Trees Classifier:')

# show the classification report
print(classification_report(y_test, preds, digits= 3))

# Add the scores to the results dataframe. DataFrame.append was removed in
# pandas 2.0, so the row is added with pd.concat, which works on every version.
new_row = pd.DataFrame([{'Model name': 'Extra Trees',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Extra Trees Classifier:
              precision    recall  f1-score   support

         0.0      0.990     0.991     0.990     14252
         1.0      0.867     0.847     0.857       968

    accuracy                          0.982     15220
   macro avg      0.928     0.919     0.924     15220
weighted avg      0.982     0.982     0.982     15220



In [127]:
# CatBoost classifier on the feature-engineered data.
# cat_params is reused by the ensemble cells further down.
cat_params = {'verbose': False, 'depth': 10}
model = catboost(**cat_params)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Catboost Classifier:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Catboost',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Catboost Classifier:
              precision    recall  f1-score   support

         0.0      0.992     0.990     0.991     14252
         1.0      0.860     0.879     0.869       968

    accuracy                          0.983     15220
   macro avg      0.926     0.935     0.930     15220
weighted avg      0.983     0.983     0.983     15220



In [128]:
# XGBoost classifier on the feature-engineered data.
# xgb_params is reused by the ensemble cells further down.
xgb_params = {'max_depth': 17, 'min_child_weight': 1}
model = xgb(**xgb_params)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('XGBoost Classifier:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'XGBoost',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

XGBoost Classifier:
              precision    recall  f1-score   support

         0.0      0.991     0.989     0.990     14252
         1.0      0.849     0.873     0.861       968

    accuracy                          0.982     15220
   macro avg      0.920     0.931     0.926     15220
weighted avg      0.982     0.982     0.982     15220



In [129]:
# AdaBoost classifier on the feature-engineered data
model = adaboost()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Adaboost Classifier:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Adaboost',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Adaboost Classifier:
              precision    recall  f1-score   support

         0.0      0.989     0.930     0.959     14252
         1.0      0.451     0.843     0.588       968

    accuracy                          0.925     15220
   macro avg      0.720     0.887     0.773     15220
weighted avg      0.954     0.925     0.935     15220



In [130]:
# LightGBM classifier (default settings) on the feature-engineered data
model = lightgbm()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('LightGBM Classifier:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'LightGBM',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

LightGBM Classifier:
              precision    recall  f1-score   support

         0.0      0.992     0.980     0.986     14252
         1.0      0.751     0.886     0.813       968

    accuracy                          0.974     15220
   macro avg      0.871     0.933     0.899     15220
weighted avg      0.977     0.974     0.975     15220



##### Ensembles of classifiers:

In [131]:
# A soft voting ensemble of Extra Trees and CatBoost, the previously
# top-performing algorithms. cat_params comes from the CatBoost cell above.
model1 = ExtraTreesClassifier()
model2 = catboost(**cat_params)
model = VotingClassifier(estimators=[('et', model1), ('ctboost', model2)],
                         voting='soft', verbose=False)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Soft Voting Classifier with ET and catboost:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Soft Voting Ensemble: ET & Catboost',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Soft Voting Classifier with ET and catboost:
              precision    recall  f1-score   support

         0.0      0.992     0.991     0.991     14252
         1.0      0.872     0.875     0.874       968

    accuracy                          0.984     15220
   macro avg      0.932     0.933     0.933     15220
weighted avg      0.984     0.984     0.984     15220



In [132]:
# Soft voting ensemble of Extra Trees, CatBoost and XGBoost.
# cat_params and xgb_params come from the single-model cells above.
model1 = ExtraTreesClassifier()
model2 = catboost(**cat_params)
model3 = xgb(**xgb_params)
model = VotingClassifier(estimators=[('et', model1), ('ctboost', model2), ('xgb', model3)],
                         voting='soft', verbose=0)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Soft Voting Classifier with ET, catboost and xgboost:')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Soft Voting Ensemble: ET, Catboost & XGB',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Soft Voting Classifier with ET, catboost and xgboost:
              precision    recall  f1-score   support

         0.0      0.991     0.991     0.991     14252
         1.0      0.864     0.874     0.869       968

    accuracy                          0.983     15220
   macro avg      0.928     0.932     0.930     15220
weighted avg      0.983     0.983     0.983     15220



In [133]:
# Hard (majority) voting ensemble of Extra Trees, CatBoost and XGBoost.
# cat_params and xgb_params come from the single-model cells above.
model1 = ExtraTreesClassifier()
model2 = catboost(**cat_params)
model3 = xgb(**xgb_params)
model = VotingClassifier(estimators=[('et', model1), ('ctboost', model2), ('xgb', model3)],
                         voting='hard', verbose=0)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Hard Voting Classifier with ET, catboost and XGBoost')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Hard Voting Ensemble: ET, Catboost and XGB',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Hard Voting Classifier with ET, catboost and XGBoost
              precision    recall  f1-score   support

         0.0      0.991     0.991     0.991     14252
         1.0      0.867     0.874     0.870       968

    accuracy                          0.983     15220
   macro avg      0.929     0.932     0.931     15220
weighted avg      0.984     0.983     0.983     15220



In [134]:
# Stacking ensemble of Extra Trees and CatBoost, with a logistic regression as
# the final estimator. cat_params comes from the CatBoost cell above.
model1 = ExtraTreesClassifier()
model2 = catboost(**cat_params)
model = StackingClassifier(estimators=[('et', model1), ('ctboost', model2)],
                           verbose=0, final_estimator=LogisticRegression())
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Stacking Classifier with ET and catboost')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Stacking Ensemble: ET & catboost',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Stacking Classifier with ET and catboost
              precision    recall  f1-score   support

         0.0      0.991     0.991     0.991     14252
         1.0      0.869     0.872     0.871       968

    accuracy                          0.984     15220
   macro avg      0.930     0.931     0.931     15220
weighted avg      0.984     0.984     0.984     15220



In [135]:
# Stacking ensemble of Extra Trees, CatBoost and XGBoost, with a logistic
# regression as the final estimator; n_jobs=-1 runs the fits in parallel.
# cat_params and xgb_params come from the single-model cells above.
model1 = ExtraTreesClassifier(n_jobs = -1)
model2 = catboost(**cat_params)
model3 = xgb(**xgb_params)
model = StackingClassifier(estimators = [('et', model1),
                                         ('ctboost',model2),
                                         ('xgb', model3)],
                           verbose=0, final_estimator=LogisticRegression(), n_jobs = -1)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('Stacking Classifier with ET, catboost and XGBoost')
print(classification_report(y_test, preds, digits= 3))

# Store the hold-out scores. DataFrame.append was removed in pandas 2.0, so the
# row is added with pd.concat, which works on every pandas version.
new_row = pd.DataFrame([{'Model name': 'Stacking Ensemble: ET, catboost & XGB',
                         'Feature_Engineered_data(Y/N)': 'Y',
                         'F1-score(weighted)': f1_score(y_test, preds, average='weighted'),
                         'Accuracy': accuracy_score(y_test, preds)}])
results = pd.concat([results, new_row], ignore_index=True)

Stacking Classifier with ET, catboost and XGBoost
              precision    recall  f1-score   support

         0.0      0.991     0.992     0.992     14252
         1.0      0.877     0.872     0.875       968

    accuracy                          0.984     15220
   macro avg      0.934     0.932     0.933     15220
weighted avg      0.984     0.984     0.984     15220



In [136]:
# The summarised results of the various classifiers and ensembles
# (one row per evaluation cell above)
results

Unnamed: 0,Model name,Feature_Engineered_data(Y/N),F1-score(weighted),Accuracy
0,Extra Trees,N,0.982834,0.982917
1,Catboost,N,0.983484,0.983377
2,XGBoost,N,0.981908,0.9818
3,Adaboost,N,0.929381,0.917017
4,Extra Trees,Y,0.981901,0.981997
5,Catboost,Y,0.983268,0.98318
6,XGBoost,Y,0.982178,0.982063
7,Adaboost,Y,0.935067,0.924836
8,LightGBM,Y,0.975043,0.974047
9,Soft Voting Ensemble: ET & Catboost,Y,0.983914,0.983903


In terms of f1-score and accuracy the 'Stacking Ensemble: ET, catboost & XGB' is leading

### GRID SEARCH: <br>The following optimizations for model creation have been used for comparison and not for evaluation.


In [138]:
# Grid search on XGBoost models over max_depth and min_child_weight.
# The grid was narrowed in stages. Only the last grid is actually searched;
# the earlier ones are kept here as a record instead of as dead assignments:
#   1) max_depth 3, 5, 7, 9     min_child_weight 1, 3, 5
#   2) max_depth 9, 11, 13      min_child_weight 1, 3, 5
model = xgb()
grid = {'max_depth': range(13, 20, 2),
        'min_child_weight': range(1, 6, 2)}

# 3-fold cross-validated search scored by weighted F1
gsc = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=3
)

grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# mean cross-validation score of every parameter combination
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))

Best: 0.982260 using {'max_depth': 19, 'min_child_weight': 1}
Test : 0.981326 with: {'max_depth': 13, 'min_child_weight': 1}
Test : 0.979642 with: {'max_depth': 13, 'min_child_weight': 3}
Test : 0.978532 with: {'max_depth': 13, 'min_child_weight': 5}
Test : 0.981700 with: {'max_depth': 15, 'min_child_weight': 1}
Test : 0.980090 with: {'max_depth': 15, 'min_child_weight': 3}
Test : 0.979061 with: {'max_depth': 15, 'min_child_weight': 5}
Test : 0.982249 with: {'max_depth': 17, 'min_child_weight': 1}
Test : 0.980841 with: {'max_depth': 17, 'min_child_weight': 3}
Test : 0.979615 with: {'max_depth': 17, 'min_child_weight': 5}
Test : 0.982260 with: {'max_depth': 19, 'min_child_weight': 1}
Test : 0.981159 with: {'max_depth': 19, 'min_child_weight': 3}
Test : 0.980027 with: {'max_depth': 19, 'min_child_weight': 5}


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=19,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

We chose 'max_depth' of xgb() to be 17, because going from 17 to 19 increases the score only negligibly (0.982249 vs 0.982260). 'min_child_weight' can be set to 1.

In [50]:
# Grid search on CatBoost models (GridSearchCV is imported in the imports cell).
# The grid was narrowed in stages. Only the last grid is actually searched;
# the earlier ones are kept here as a record instead of as dead assignments:
#   1) depth 6, 8, 10    learning_rate 0.01, 0.05, 0.1   iterations 30, 50, 100
#   2) depth 10, 20, 30  learning_rate 0.1, 0.15, 0.2    iterations 100, 150, 200
#   3) depth 10, 12      learning_rate 0.2, 0.25, 0.3    iterations 200, 275, 350
#   4) depth 12          learning_rate 0.25              iterations 350, 400, 500
model = catboost(verbose = False)

# final grid: learning_rate is left out so that catboost sets it automatically
grid = {'depth': [12],
        'iterations': [500, 1000, 1500]}

# 3-fold cross-validated search scored by weighted F1
gsc = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=3
)

grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# mean cross-validation score of every parameter combination
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))

Best: 0.941526 using {'depth': 12, 'iterations': 1500}
Test : 0.939062 with: {'depth': 12, 'iterations': 500}
Test : 0.940189 with: {'depth': 12, 'iterations': 1000}
Test : 0.941526 with: {'depth': 12, 'iterations': 1500}


<catboost.core.CatBoostClassifier at 0x248b7cfda88>

It is seen that an increase in the number of iterations is only increasing the scores by minute amounts. Let the default iterations be used (1000). The depth (max = 16 for the data) can be chosen from [6,8,10] as depth greater than 10 is computationally very expensive

In [18]:
# Grid search on CatBoost models using f1-scores: depth only, every other
# parameter left at its default (GridSearchCV is imported in the imports cell).
model = catboost(verbose = False)
grid = {'depth': [6, 8, 10]}

# 3-fold cross-validated search scored by weighted F1
gsc = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=3
)

grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# mean cross-validation score of every parameter combination
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))

Best: 0.982351 using {'depth': 10}
Test : 0.972020 with: {'depth': 6}
Test : 0.978821 with: {'depth': 8}
Test : 0.982351 with: {'depth': 10}


<catboost.core.CatBoostClassifier at 0x1b0328535c8>

In [137]:
# Grid search on Extra Trees models (GridSearchCV is imported in the imports cell).
# Only the updated grid below is actually searched; the initial grid is kept
# here as a record instead of as a dead assignment:
#   n_estimators 50, 75, 100, 125   max_features 3, 6, 9
#   min_samples_leaf 1              min_samples_split 2
model = ExtraTreesClassifier(n_jobs=-1)

# updated parameters: only max_features is varied
grid = {'n_estimators': [100],
        'max_features': [7, 8, 9, 10, 11, 12],
        'min_samples_leaf': [1],
        'min_samples_split': [2]}

# 3-fold cross-validated search scored by weighted F1
gsc = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=3
)

grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# mean cross-validation score of every parameter combination
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))

Best: 0.985251 using {'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.984467 with: {'max_features': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.984657 with: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.984626 with: {'max_features': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.984912 with: {'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.985165 with: {'max_features': 11, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.985251 with: {'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


ExtraTreesClassifier(max_features=12, n_jobs=-1)

We chose the max features 12 (full) and min_samples_leaf and min_samples_split are seen to be default values (1 and 2). The best n_estimators also is seen to be the default value (100) 

In [48]:
# Grid search on Extra Trees models, this time scored with R^2, followed by a
# re-fit of the best configuration (GridSearchCV is imported in the imports cell).
# Only the updated grid below is actually searched; the initial grid is kept
# here as a record instead of as a dead assignment:
#   n_estimators 50, 75, 100, 125   max_features 3, 6, 9
#   min_samples_leaf 1              min_samples_split 2
model = ExtraTreesClassifier(n_jobs=-1)

# updated parameters: only max_features is varied
grid = {'n_estimators': [100],
        'max_features': [7, 8, 9, 10, 11, 12],
        'min_samples_leaf': [1],
        'min_samples_split': [2]}

# NOTE(review): 'r2' is a regression metric applied here to class predictions.
# The other searches use 'f1_weighted'; confirm that 'r2' is intended for this
# comparison run.
gsc = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='r2',
    cv=3
)

grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# mean cross-validation score of every parameter combination
for test_mean, param in zip(
        grid_result.cv_results_['mean_test_score'],
        grid_result.cv_results_['params']):
    print("Test : %f with: %r" % (test_mean, param))

# re-create the classifier with the best parameters and fit it on the full training data
model = ExtraTreesClassifier(**grid_result.best_params_, n_jobs=-1)

model.fit(X_train, y_train)

Best: 0.946877 using {'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.944695 with: {'max_features': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.945540 with: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.946068 with: {'max_features': 9, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.945962 with: {'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.946420 with: {'max_features': 11, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Test : 0.946877 with: {'max_features': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


ExtraTreesClassifier(max_features=12, n_jobs=-1)

## Reading the test data:

In [19]:
# load the test features; the file's own header row is skipped and replaced
# by the column names below, with 'ID' used as the index
names = [
    'ID', 'Expense', 'Income', 'Loan type', 'Occupation type',
    'Age', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5',
]

X_test = pd.read_csv(
    'dataset/test_x.csv',
    names=names,
    skiprows=1,
    index_col='ID',
)

In [20]:
# work on copies of the complete training features and labels
X_train, y_train = X.copy(), y.copy()

# SMOTE-NC oversampling: columns at positions 2, 3 and 4 are declared categorical
oversampler = SMOTENC(
    categorical_features=[2,3,4],
    sampling_strategy=2/3,
    random_state=42,
)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# smote removes the index name, so restore it before the index is reset below
X_train.index.name = 'ID'

# feature engineering (feature_engineer.py) applied to the training data
X_train, X_train_bins = feature_engineer(X_train.reset_index(), False)

# dummy variables for Occupation type
X_train, y_train = dummy_creator(X_train, y_train)

# Cast these columns to int so that catboost may consider them categorical
# features; they are already one-hot encoded (two categories).
for int_col in ('Loan type', 'Age', 'Age X Loan type'):
    X_train[int_col] = X_train[int_col].astype(int)

In [21]:
# encode the occupation and loan types
X_test = ManualEncoder(X_test)

# Run the cleaning / feature-engineering helpers on X_test. The helpers expect
# a label argument as well, so an all-ones placeholder of matching length is
# passed and the returned labels (temp) are ignored.
placeholder_frame = pd.DataFrame(np.ones(len(X_test)))
X_test, temp = basic_clean(X_test, placeholder_frame, non_cat_cols_X)
X_test.index.name = 'ID'
X_test, X_test_bins = feature_engineer(X_test.reset_index(), False)
X_test, temp = dummy_creator(X_test, np.ones(len(X_test)))

# integer casts, matching the ones applied to the training features
for int_col in ('Loan type', 'Age', 'Age X Loan type'):
    X_test[int_col] = X_test[int_col].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
# Stacking ensemble: ExtraTrees, catboost and XGBoost as base learners,
# combined by a logistic-regression final estimator
model1 = ExtraTreesClassifier(n_jobs=-1)
model2 = catboost(task_type='GPU', depth=10, verbose=0)
model3 = xgb(min_child_weight=1, max_depth=17)

base_estimators = [
    ('et', model1),
    ('ctboost', model2),
    ('xgb', model3),
]
model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    verbose=0,
)
model.fit(X_train, y_train)

In [25]:
# predict on the test features and wrap the result in a one-column
# DataFrame ('Label') so it can be saved
preds = pd.DataFrame(model.predict(X_test), columns=['Label'])
preds

Unnamed: 0,Label
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
19995,0.0
19996,1.0
19997,1.0
19998,0.0


In [27]:
# write the predicted labels to CSV
preds.to_csv('ML_predictions.csv')

# pickle the fitted model to a file in the current working directory
pkl_filename = "stacking_model.pkl"
with open(pkl_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [28]:
# keep only the second column of predict_proba and save it under 'Label'
preds_probs = pd.DataFrame(
    model.predict_proba(X_test)[:, 1],
    columns=['Label'],
)
preds_probs.to_csv('ML_prob_predictions.csv')

In [29]:
# display the probability predictions saved to 'ML_prob_predictions.csv'
preds_probs

Unnamed: 0,Label
0,0.004333
1,0.005319
2,0.005317
3,0.004333
4,0.004334
...,...
19995,0.009630
19996,0.590222
19997,0.987349
19998,0.012569


In [None]:
def test_predictor(model_filename, X_test_filename):
    """Predict labels for a raw test CSV using a pickled model.

    The CSV is read, passed through the notebook's preprocessing helpers
    (``ManualEncoder``, ``basic_clean``, ``feature_engineer`` and
    ``dummy_creator``), three columns are cast to int, and the result is
    handed to the unpickled model's ``predict`` method.

    Parameters
    ----------
    model_filename : str
        Path of the pickle file holding the fitted model.
    X_test_filename : str
        Path of the CSV with the test features. Its first row is skipped and
        the columns are named by the fixed list below, with 'ID' as index.

    Returns
    -------
    Whatever ``predict`` of the unpickled model returns for the
    preprocessed features.

    Notes
    -----
    Relies on names defined outside this function: the preprocessing helpers
    listed above and ``non_cat_cols_X``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file. Only pass model files that come from a trusted source.
    """
    # read the test features using the fixed column layout
    names = ['ID', 'Expense', 'Income', 'Loan type', 'Occupation type',
             'Age', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5']
    X_test = pd.read_csv(X_test_filename, index_col='ID', names=names, skiprows=1)

    # encode the occupation and loan types
    X_test = ManualEncoder(X_test)

    # The helpers also expect labels; all-ones placeholders are passed and
    # the second return value of each helper is discarded.
    X_test, _ = basic_clean(X_test, pd.DataFrame(np.ones(len(X_test))), non_cat_cols_X)
    X_test.index.name = 'ID'
    X_test, _ = feature_engineer(X_test.reset_index(), False)
    X_test, _ = dummy_creator(X_test, np.ones(len(X_test)))

    # integer casts on the same three columns, in the same order as before
    for column in ('Loan type', 'Age', 'Age X Loan type'):
        X_test[column] = X_test[column].astype(int)

    # load the fitted model (trusted files only -- see the docstring)
    with open(model_filename, 'rb') as model_file:
        pickle_model = pickle.load(model_file)

    # predict target values
    return pickle_model.predict(X_test)

In [125]:
# locations of the pickled stacking model and of the raw test features
model_filename, X_test_filename = "stacking_model.pkl", 'dataset/test_x.csv'

# run the load / preprocess / predict helper on the test set
y_preds = test_predictor(
    model_filename=model_filename,
    X_test_filename=X_test_filename,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [173]:
# run the same pickled model on 'RF_imputed_X.csv'
# (presumably a random-forest-imputed copy of the features -- TODO confirm)
test_predictor(model_filename, 'RF_imputed_X.csv')

array([0., 0., 0., ..., 0., 0., 0.])