# Modelling

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost.sklearn import XGBClassifier
from tabulate import tabulate
import matplotlib.pyplot as plt
%matplotlib inline

# Reading data, splitting the data set into train and test sets

In [3]:
# Load the modelling table from the CSV file (decoded as latin1)
# and print its (rows, columns) shape as a quick sanity check.
data = pd.read_csv('data1.csv', encoding = "latin1")
print(data.shape)

(81385, 55)


In [4]:
# Separate the target from the features.
# pop() returns the 'Disbursed' column and removes it from `data` in place;
# X is then the very same frame object as `data`, now without the target.
y = data.pop('Disbursed')
X = data

In [5]:
# Count how often each target value occurs in the full data set.
y.value_counts()

0    80112
1     1273
Name: Disbursed, dtype: int64

In [57]:
# Hold out 25% of the rows as a test set; random_state pins the split so it is repeatable.
# NOTE(review): no stratify= argument is passed, so the share of the rare class 1
# may differ between the train and test parts - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25, random_state=865)

In [58]:
# Count how often each target value occurs in the training part.
y_train.value_counts()

0    60067
1      971
Name: Disbursed, dtype: int64

In [59]:
# Count how often each target value occurs in the test part.
y_test.value_counts()

0    20045
1      302
Name: Disbursed, dtype: int64

# Simple logistic regression

Let's start with logistic regression. With all parameters left at their defaults, logistic regression predicts all zeros. It turned out that penalty='l1' helps more than class_weight='balanced'.

In [41]:
# L1-penalised logistic regression fitted on the training part.
# The solver is named explicitly: the l1 penalty is only supported by some
# solvers (liblinear, saga), and recent scikit-learn releases default to
# lbfgs, which rejects penalty='l1' with a ValueError.
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of class 1.
y_pred = model.predict_proba(X_train)[:,1]
# AUC measured on the same rows the model was trained on.
print( "AUC: ", roc_auc_score(y_score=y_pred,y_true=y_train))

AUC:  0.9911475097229653


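The extremely high AUC is misleading here: with such a considerable class imbalance the large number of true negatives keeps the FPR very low, so the ROC curve looks almost perfect (and the score above is measured on the training set). From now on we will focus on the F1 score instead.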

# Removing excess variables

Let's see whether the L1 penalty allows us to remove some unnecessary variables (those whose coefficients were shrunk to exactly zero):

In [44]:
# Boolean mask, same shape as model.coef_, that is True where a coefficient equals zero.
z = np.equal(model.coef_, 0)

In [50]:
# Show the column name wherever the coefficient is zero, and 0 elsewhere.
# The names are read from X_train, the frame the model was fitted on, so they
# always line up with model.coef_ regardless of later changes to `data`.
np.where(z, X_train.columns, 0)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        'Salary_Account_Punjab National Bank', 0, 0, 'Mobile_Verified_Y',
        0, 0, 0, 'Var2_D', 0, 'Var2_F', 0, 0, 'Source_S127', 0, 0, 0, 0,
        0, 0, 0, 'City2_Co', 0, 'City2_G', 0, 0, 0, 0, 'City2_O', 0, 0,
        0, 0, 0, 'var11_F', 0, 0]], dtype=object)

There are 9 variables whose coefficients were shrunk to exactly zero, so they can be removed; the next cell drops them.

In [56]:
# Features whose L1-penalised logistic-regression coefficient was exactly zero.
excess_columns = [
    'Salary_Account_Punjab National Bank',
    'Mobile_Verified_Y',
    'Var2_D',
    'Var2_F',
    'Source_S127',
    'City2_Co',
    'City2_G',
    'City2_O',
    'var11_F',
]

# Drop them from the full feature frame AND from the train/test parts created
# earlier: removing them from X alone would leave X_train / X_test untouched
# when the notebook is run top to bottom.
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
X = X.drop(columns=excess_columns, errors='ignore')
X_train = X_train.drop(columns=excess_columns, errors='ignore')
X_test = X_test.drop(columns=excess_columns, errors='ignore')

# Evaluating the results of log regression

Let's see what the result will be on train set (cut-off threshold = 0.5):

In [21]:
def print_conf(a):
    """Print a confusion matrix as a labelled text table.

    Rows are labelled 'Real 0', 'Real 1', ... and columns 'Pred 0',
    'Pred 1', ..., so the function works for any number of classes
    (for a 2x2 matrix the output is the same as before).

    Parameters
    ----------
    a : array-like, 2-D
        Confusion matrix with real classes in rows and predicted classes
        in columns, e.g. the result of ``confusion_matrix``.

    Returns
    -------
    None
        The table is written to standard output.
    """
    rows = np.asarray(a).tolist()
    # Prefix every row with the label of the real class it represents.
    table = [['Real %d' % idx] + row for idx, row in enumerate(rows)]
    n_predicted = len(rows[0]) if rows else 0
    headers = ['Real/Pred'] + ['Pred %d' % idx for idx in range(n_predicted)]
    print (tabulate (table, headers=headers))

In [22]:
# Turn the probabilities into hard 0/1 labels with a 0.5 cut-off,
# then report the confusion matrix and the F1 score on the training part.
predictions = (y_pred >= 0.5).astype(int)
train_confusion = confusion_matrix(y_train, predictions)
print_conf(train_confusion)
print("F1:",f1_score(y_train, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          59715       352
Real 1            361       610
F1: 0.6311433005690635


This could be easily improved (in terms of F1) by adjusting the cut-off threshold. For example:

In [23]:
# Same evaluation as before, but labelling a row as 1 as soon as its
# predicted probability reaches 0.3.
predictions = np.asarray(y_pred >= 0.3, dtype=int)
train_confusion = confusion_matrix(y_train, predictions)
print_conf(train_confusion)
print("F1:",f1_score(y_train, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          59302       765
Real 1             75       896
F1: 0.6808510638297872


Let's consider other models.

# Decision tree

In [60]:
# Unconstrained decision tree, evaluated on the rows it was trained on.
# random_state is fixed so that repeated runs grow the same tree
# (the tree builder permutes the features randomly at each split).
model = DecisionTreeClassifier(random_state=865)
model.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of class 1.
y_pred = model.predict_proba(X_train)[:,1]
predictions = np.where(y_pred>0.3,1,0)
print_conf(confusion_matrix(y_train, predictions))
print("F1:",f1_score(y_train, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          60067         0
Real 1              0       971
F1: 1.0


We can see symptoms of overfitting. To deal with it for this particular model, we will add a constraint on e.g. min_samples_leaf. More generally, evaluating with cross-validation instead of on the training data gives an honest estimate of performance.

In [61]:
# Decision tree whose leaves must contain at least 5 training rows,
# evaluated on the rows it was trained on.
# random_state is fixed so that repeated runs grow the same tree
# (the tree builder permutes the features randomly at each split).
model = DecisionTreeClassifier(min_samples_leaf=5, random_state=865)
model.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of class 1.
y_pred = model.predict_proba(X_train)[:,1]
predictions = np.where(y_pred>0.3,1,0)
print_conf(confusion_matrix(y_train, predictions))
print("F1:",f1_score(y_train, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          59745       322
Real 1             66       905
F1: 0.8234758871701546


# Choosing best model in a loop

The idea is to fit the models one by one, to obtain for each model a vector of out-of-fold probabilities by means of cross-validation, and finally to establish the best cut-off threshold in terms of F1 score. The best model will be the one with the best F1 score. For the moment we do not consider tuning model-specific parameters.

In [62]:
# Candidate classifiers and, in the same order, the labels used in the report.
# - LogisticRegression names its solver because the l1 penalty is not supported
#   by every solver (recent scikit-learn defaults to lbfgs, which rejects it).
# - The tree-based scikit-learn models get a fixed random_state so the
#   comparison is repeatable.
models = [LogisticRegression(penalty='l1', solver='liblinear'),
          DecisionTreeClassifier(min_samples_leaf=5, random_state=865),
          LinearDiscriminantAnalysis(),
          QuadraticDiscriminantAnalysis(),
          RandomForestClassifier(random_state=865),
          XGBClassifier()]
m = ['LogisticRegression','DecisionTreeClassifier','LinearDiscriminantAnalysis',
          'QuadraticDiscriminantAnalysis', 'RandomForestClassifier', 'XGBClassifier']

# Cut-off thresholds to try: 0.00, 0.05, ..., 0.95.
thresholds = np.arange(0, 1, 0.05)

for name, model in zip(m, models):
    # No separate model.fit() here: cross_val_predict fits its own copy of the
    # estimator on every fold, so a prior fit on the full training set is
    # wasted work.
    y_pred = cross_val_predict(estimator=model, X=X_train, y=y_train, method="predict_proba",cv = StratifiedKFold(3))[:,1]
    # F1 of the out-of-fold predictions at every candidate threshold.
    r = [round(float(f1_score(y_train, (y_pred > cutoff).astype(int))), 4)
         for cutoff in thresholds]
    print(r)
    best = int(np.argmax(r))
    # Read the threshold from the grid itself rather than recomputing it.
    print(name, "Best F1:", r[best], "threshold:", round(float(thresholds[best]), 2))

  'precision', 'predicted', average, warn_for)


[0.0313, 0.6592, 0.6597, 0.6601, 0.6641, 0.6659, 0.6687, 0.6688, 0.6604, 0.6483, 0.606, 0.5414, 0.4627, 0.3646, 0.2596, 0.1333, 0.0455, 0.0102, 0.0021, 0.0]
LogisticRegression Best F1: 0.6688 threshold: 0.35000000000000003
[0.6129, 0.6129, 0.6129, 0.6103, 0.6166, 0.6119, 0.6079, 0.5967, 0.5853, 0.5713, 0.5581, 0.5581, 0.5124, 0.5099, 0.4843, 0.4744, 0.4412, 0.4282, 0.4077, 0.4077]
DecisionTreeClassifier Best F1: 0.6166 threshold: 0.2
[0.0313, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592, 0.6592]
LinearDiscriminantAnalysis Best F1: 0.6592 threshold: 0.05




[0.0315, 0.1618, 0.173, 0.1804, 0.1858, 0.1907, 0.1957, 0.2003, 0.2041, 0.208, 0.2125, 0.2159, 0.22, 0.2245, 0.2296, 0.2345, 0.2418, 0.2497, 0.2603, 0.279]
QuadraticDiscriminantAnalysis Best F1: 0.279 threshold: 0.9500000000000001
[0.4367, 0.4367, 0.64, 0.64, 0.643, 0.643, 0.607, 0.607, 0.5416, 0.5416, 0.4393, 0.4393, 0.3255, 0.3255, 0.1764, 0.1764, 0.0496, 0.0496, 0.0082, 0.0082]
RandomForestClassifier Best F1: 0.643 threshold: 0.2
[0.0313, 0.6592, 0.6594, 0.6627, 0.6708, 0.6704, 0.6745, 0.6721, 0.667, 0.6402, 0.6133, 0.5632, 0.5094, 0.4131, 0.2671, 0.1266, 0.0399, 0.0061, 0.0021, 0.0]
XGBClassifier Best F1: 0.6745 threshold: 0.30000000000000004


  'precision', 'predicted', average, warn_for)


XGBoost gave the best result. Experiments showed that manipulating XGBoost parameters doesn't help. Let's establish the very best threshold.

In [63]:
# Fine-grained search for the F1-optimal cut-off of the XGBoost model.
model = XGBClassifier()
# No separate model.fit() here: cross_val_predict fits its own copy of the
# estimator on every fold.
y_pred = cross_val_predict(estimator=model, X=X_train, y=y_train, method="predict_proba",cv = StratifiedKFold(3))[:,1]

# Ten thresholds 0.25, 0.26, ..., 0.34. Built from an integer range so the
# number of grid points does not depend on floating-point rounding of the
# end value.
thresholds = 0.25 + 0.01 * np.arange(10)

# F1 of the out-of-fold predictions at every candidate threshold.
r = [round(float(f1_score(y_train, (y_pred > cutoff).astype(int))), 4)
     for cutoff in thresholds]
print(r)
best = int(np.argmax(r))
print("Best F1:", r[best], "threshold:", round(float(thresholds[best]), 2))

[0.6704, 0.6704, 0.6737, 0.6735, 0.6749, 0.6745, 0.6732, 0.6743, 0.6728, 0.6723]
Best F1: 0.6749 threshold: 0.29


Let's see confusion matrix for the chosen model:

In [64]:
# Confusion matrix and F1 of the current y_pred against the training labels
# at the selected cut-off.
threshold = 0.29
predictions = (y_pred > threshold).astype(int)
chosen_confusion = confusion_matrix(y_train, predictions)
print_conf(chosen_confusion)
print("F1:",f1_score(y_train, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          59322       745
Real 1             97       874
F1: 0.6749034749034749


# On the test set

In [66]:
# Final check on the held-out test part: train XGBoost on the whole training
# part, score the test rows and apply the selected cut-off.
threshold = 0.29
model = XGBClassifier()
model.fit(X_train, y_train)
# Column 1 of predict_proba holds the probability of class 1.
y_pred = model.predict_proba(X_test)[:,1]
predictions = (y_pred > threshold).astype(int)
test_confusion = confusion_matrix(y_test, predictions)
print_conf(test_confusion)
print("F1:",f1_score(y_test, predictions))

Real/Pred      Pred 0    Pred 1
-----------  --------  --------
Real 0          19758       287
Real 1             26       276
F1: 0.638150289017341


# Solution
XGBClassifier() with cut-off threshold = 0.29. F1 score on the test set = 0.64