### In Depth Analysis - Modeling & Hyper parameter tuning

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from scipy import stats

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from keras import optimizers
import tensorflow as tf
from keras import backend as K
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from skopt import BayesSearchCV

from sklearn.metrics import accuracy_score,classification_report, roc_auc_score

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
##all features
application_train = pd.read_csv('new_application_train_merged.csv', index_col=None, engine='python')
application_test = pd.read_csv('new_application_test_merged.csv', index_col=None, engine='python')


In [3]:
#top 20 features
new_application_train = pd.read_csv('application_train_for_ML.csv', index_col=None, engine='python')
new_application_test = pd.read_csv('application_test_for_ML.csv', index_col=None, engine='python')

## Feature Imputing & Feature Scaling 


In [4]:
new_application_train.shape

(43845, 21)

In [5]:
imputer = SimpleImputer(strategy="median")
scaler = MinMaxScaler(feature_range = (0, 1))

In [6]:
imputer.fit(new_application_train.drop(['TARGET'], axis=1))


SimpleImputer(strategy='median')

In [7]:
train_transformed = imputer.transform(new_application_train.drop(['TARGET'], axis=1))
train_transformed = scaler.fit_transform(train_transformed)


In [8]:
test_transformed = imputer.transform(new_application_test)
test_transformed = scaler.fit_transform(test_transformed)


In [9]:
X_training_set, X_validation_set, y_training_set, y_validation_set = train_test_split(train_transformed, 
                                                                                      new_application_train['TARGET'], test_size=0.33, random_state=42)

In [10]:
def metric_calc(pred):
    print("The accuracy score : ", accuracy_score(y_validation_set,pred))
    print("The classification report is as follows:\n", classification_report(y_validation_set,pred))
    print("Taregt Values:")
    print(pd.DataFrame({'target':pred})['target'].value_counts())
    print("ROC AUC score is: ",roc_auc_score(y_validation_set,pred))

## 1. Logistic Regression

In [11]:
logistic_regressor = LogisticRegression(C = 2)
logistic_regressor.fit(X_training_set,y_training_set)
log_regression_pred = logistic_regressor.predict(X_validation_set)
pd.DataFrame({'target':log_regression_pred})['target'].value_counts()
metric_calc(log_regression_pred)

The accuracy score :  0.6712972562029166
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.65      0.61      0.63      6578
           1       0.69      0.73      0.71      7891

    accuracy                           0.67     14469
   macro avg       0.67      0.67      0.67     14469
weighted avg       0.67      0.67      0.67     14469

Taregt Values:
1    8327
0    6142
Name: target, dtype: int64
ROC AUC score is:  0.6658108026204868


## 2. Random Forest 

In [12]:

random_forest = RandomForestClassifier(n_estimators = 500, random_state = 50, verbose = 1)

In [13]:
random_forest.fit(X_training_set,y_training_set)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   33.7s finished


RandomForestClassifier(n_estimators=500, random_state=50, verbose=1)

In [14]:
random_forest_pred = random_forest.predict(X_validation_set)
metric_calc(random_forest_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


The accuracy score :  0.6746838067592784
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.65      0.61      0.63      6578
           1       0.69      0.73      0.71      7891

    accuracy                           0.67     14469
   macro avg       0.67      0.67      0.67     14469
weighted avg       0.67      0.67      0.67     14469

Taregt Values:
1    8326
0    6143
Name: target, dtype: int64
ROC AUC score is:  0.6692317960672662


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    3.2s finished


In [15]:
random_forest_pred_test = random_forest.predict_proba(test_transformed)
submission_random_forest = application_test[['SK_ID_CURR']]
submission_random_forest['TARGET'] = random_forest_pred_test[:,1]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    7.8s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_random_forest['TARGET'] = random_forest_pred_test[:,1]


## 2.1 Feature importance of random forest classifier


In [16]:
features = list(set(new_application_train.columns).difference({'TARGET'}))

feature_importance_df = pd.DataFrame({'Feature':features,'Importance':random_forest.feature_importances_})
df = feature_importance_df.sort_values(['Importance'],ascending=False).reset_index()
df = df.drop(['index'],axis=1)
df.head()

Unnamed: 0,Feature,Importance
0,EXT_SOURCE_2,0.176427
1,REGION_RATING_CLIENT,0.162095
2,FLAG_MOBIL,0.123399
3,AMT_CREDIT_x,0.119232
4,NAME_FAMILY_STATUS,0.084473


## 3. Naive Bayes Classifier


In [17]:
bayes_class = GaussianNB()
bayes_class.fit(X_training_set,y_training_set)
bayes_preds = bayes_class.predict(X_validation_set)
metric_calc(bayes_preds)

The accuracy score :  0.6523602183979542
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.62      0.62      0.62      6578
           1       0.68      0.68      0.68      7891

    accuracy                           0.65     14469
   macro avg       0.65      0.65      0.65     14469
weighted avg       0.65      0.65      0.65     14469

Taregt Values:
1    7803
0    6666
Name: target, dtype: int64
ROC AUC score is:  0.6500302040198895


In [18]:
nb_pred_test = bayes_class.predict_proba(test_transformed)
submission_nb = application_test[['SK_ID_CURR']]
submission_nb['TARGET'] = nb_pred_test[:,1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_nb['TARGET'] = nb_pred_test[:,1]


## 4 .XG Boost

In [19]:
xgb_classifier = XGBClassifier(objective ='binary:logistic')
xgb_classifier.fit(X_training_set,y_training_set)
xgb_pred = xgb_classifier.predict(X_validation_set)
metric_calc(xgb_pred)

The accuracy score :  0.6703987836063308
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.65      0.60      0.62      6578
           1       0.69      0.73      0.71      7891

    accuracy                           0.67     14469
   macro avg       0.67      0.66      0.67     14469
weighted avg       0.67      0.67      0.67     14469

Taregt Values:
1    8370
0    6099
Name: target, dtype: int64
ROC AUC score is:  0.6646329460239638


In [20]:
xgb_pred_test = xgb_classifier.predict_proba(test_transformed)
submission_xgb = application_test[['SK_ID_CURR']]
submission_xgb['TARGET'] = xgb_pred_test[:,1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_xgb['TARGET'] = xgb_pred_test[:,1]


## 5 . Ensemble Modeling 
https://quantdare.com/what-is-the-difference-between-bagging-and-boosting/

In [21]:
#Voting Approach
def stacked_model(X_training_set):
    """
    This method performs the stacked ensambling of all the models - Random forest, 
    Naive Bayes,Logistic Regression.
    """  
    stacked_predictions = np.array([])

    for element in X_training_set:
         stacked_predictions = np.append(stacked_predictions,stats.mode(element)[0][0])

    return stacked_predictions

Combine all the test results into a multidimensional array to feed into the stacked model

In [22]:
combined_array = (pd.DataFrame({'LR':log_regression_pred,
                                'RF':random_forest_pred,
                                'Bayes':bayes_preds,
                                'XGB':xgb_pred}).values)
stacked_model_pred = stacked_model(combined_array)
metric_calc(stacked_model_pred)

The accuracy score :  0.6767572050590919
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.64      0.66      0.65      6578
           1       0.71      0.69      0.70      7891

    accuracy                           0.68     14469
   macro avg       0.67      0.68      0.68     14469
weighted avg       0.68      0.68      0.68     14469

Taregt Values:
1.0    7634
0.0    6835
Name: target, dtype: int64
ROC AUC score is:  0.6756984867435408


## Observation : Ensemble provides slight better result though not signifacnt improvement 

## 6. Neural Network 

In [23]:
model = Sequential()
new_application_test.shape[1]
model.add(Dense(100, input_dim=new_application_train.drop(['TARGET'], axis=1).shape[1], activation= "relu"))
model.add(Dense(50, activation= "relu"))
model.add(Dense(1,activation= "sigmoid"))
model.summary() #Print model Summary


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2100      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51        
Total params: 7,201
Trainable params: 7,201
Non-trainable params: 0
_________________________________________________________________


In [24]:
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [25]:
model.compile(loss= "binary_crossentropy" , optimizer="adam", metrics=['accuracy', auroc])
model.fit(X_training_set, y_training_set, epochs=250)


Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    
Train on 29376 samples
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
E

Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250


Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250


Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


<tensorflow.python.keras.callbacks.History at 0x2772c5141f0>

In [26]:
pred= model.predict(X_validation_set)


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


In [27]:
nn_pred_test = model.predict(test_transformed)
nn_pred_test_conv = [y[0] for y in nn_pred_test]


In [28]:
submission_nn = application_test[['SK_ID_CURR']]
submission_nn['TARGET'] = nn_pred_test_conv

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_nn['TARGET'] = nn_pred_test_conv


## Observation : With NN we can achieve 0.85 ROC-AUC score after 250 echoes also it has good accuracy

## 7.Hyper Parameter Tuning 

## 7.1 Logistic Regression - Grid Search CV

In [29]:
##Tuning C Value
logistic_regressor = LogisticRegression()

c_space = np.logspace(0.01,0.01,1) 
param_grid = {'C': c_space} 

logreg_cv = GridSearchCV(logistic_regressor, param_grid, cv = 5) 
logreg_cv.fit(X_training_set,y_training_set)


log_regression_pred = logreg_cv.predict(X_validation_set)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [30]:
metric_calc(log_regression_pred)

The accuracy score :  0.6710208030962748
The classification report is as follows:
               precision    recall  f1-score   support

           0       0.65      0.61      0.63      6578
           1       0.69      0.73      0.71      7891

    accuracy                           0.67     14469
   macro avg       0.67      0.67      0.67     14469
weighted avg       0.67      0.67      0.67     14469

Taregt Values:
1    8323
0    6146
Name: target, dtype: int64
ROC AUC score is:  0.6655573493192576


In [31]:
log_regression_pred_test = logreg_cv.predict_proba(test_transformed)
log_regression_pred_test[:,1]
submission_log_regression = application_test[['SK_ID_CURR']]
submission_log_regression['TARGET'] = log_regression_pred_test[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_log_regression['TARGET'] = log_regression_pred_test[:,1]


In [32]:
print("Best score is {}".format(logreg_cv.best_score_)) 


Best score is 0.6668709123299972


## Observation : ROC-AUC Score not much improved after tuning the C value - let's try with XG Boost Tuning

## 7.2 XG Boost - BayesSearchCV

In [33]:
# Classifier
bayes_cv_tuner = BayesSearchCV(
    estimator = XGBClassifier(
        n_jobs = 1,
        objective = 'binary:logistic',
        eval_metric = 'auc',
        silent=1,
        tree_method='approx'
    ),
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'min_child_weight': (0, 10),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'min_child_weight': (0, 5),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform')
    },    
    scoring = 'roc_auc',
    cv = StratifiedKFold(
        n_splits=4,
        shuffle=True,
        random_state=42
    ),
    n_jobs = 3,
    n_iter = 8,   
    verbose = 0,
    refit = True,
    random_state = 42
)

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [34]:
result = bayes_cv_tuner.fit(train_transformed, new_application_train['TARGET'].values, callback=status_print)

Model #1
Best ROC-AUC: 0.5372
Best params: OrderedDict([('colsample_bylevel', 0.4160029192647807), ('colsample_bytree', 0.7304484857455519), ('gamma', 0.13031389926541354), ('learning_rate', 0.042815319280763466), ('max_delta_step', 13), ('max_depth', 21), ('min_child_weight', 2), ('n_estimators', 87), ('reg_alpha', 5.497557739289786e-07), ('reg_lambda', 648), ('scale_pos_weight', 275), ('subsample', 0.13556548021189216)])

Model #2
Best ROC-AUC: 0.7363
Best params: OrderedDict([('colsample_bylevel', 0.8390144719977516), ('colsample_bytree', 0.8844821246070537), ('gamma', 4.358684608480795e-07), ('learning_rate', 0.7988179462781242), ('max_delta_step', 17), ('max_depth', 3), ('min_child_weight', 1), ('n_estimators', 68), ('reg_alpha', 0.0005266983003701547), ('reg_lambda', 953), ('scale_pos_weight', 315), ('subsample', 0.9923710598637134)])

Model #3
Best ROC-AUC: 0.7363
Best params: OrderedDict([('colsample_bylevel', 0.8390144719977516), ('colsample_bytree', 0.8844821246070537), ('gam

In [35]:
# kfold cross validation of the XGBoost model

# XGB parameters found by Bayesian optimization above
model = XGBClassifier(
            objective ='binary:logistic',
            colsample_bylevel= 0.8015579071911014, 
            colsample_bytree= 0.44364889457651413, 
            gamma= 3.811128976537413e-05, 
            learning_rate= 0.270039020618534, 
            max_delta_step= 18, 
            max_depth=36, 
            min_child_weight= 2, 
            n_estimators= 83, 
            reg_alpha= 1.5057560255472018e-06, 
            reg_lambda= 659, 
            scale_pos_weight= 256, 
            subsample= 0.8835665823899177)

kfold = StratifiedKFold(n_splits=8, random_state=42)
results = cross_val_score(model, train_transformed, new_application_train['TARGET'].values, cv=kfold, scoring='roc_auc')
print("roc_auc:", "%.3f" % results.mean(), "std:", "%.3f" % results.std()) 



roc_auc: 0.740 std: 0.005


## Observation : After Hyper Tuning XGB ROC-AUC Score improved from 0.666 to 0.740