In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import math 

In [2]:
# Load libraries
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


In [3]:
df = pd.read_csv('C:/Users/Kosta/Desktop/Modelling/train.csv', header=0)
pd.set_option('display.max_columns', 25)

In [4]:
df.head(5)

Unnamed: 0,customer_id,limit_bal,sex,education,marriage,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_oct
0,1,1500,2.0,1.0,2.0,23.0,0.0,0.0,0.0,2.0,2.0,0.0,1452,1503,1482,1463,938.0,698.0,75,150,86,0,50.0,50.0,yes
1,2,8500,2.0,2.0,2.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,8079,8175,8300,8364,8275.0,8425.0,300,400,315,300,325.0,305.0,no
2,3,1000,1.0,1.0,2.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,733,831,896,933,772.0,794.0,150,150,150,24,105.0,110.0,no
3,4,10500,1.0,1.0,1.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,7049,7011,7077,7190,7229.0,7340.0,255,260,258,260,265.0,307.0,no
4,5,10500,2.0,2.0,1.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,4487,4501,3533,3558,3592.0,3496.0,180,155,145,130,135.0,200.0,no


In [5]:
#df.dtypes

In [6]:
# convert the values into categories for categorical values
marriage_vals = df.marriage.unique().astype('int')
df['marriage'] = df['marriage'].astype("category", categories=marriage_vals, ordered=False)
# add marriage level that wasn't defined as "Others" category
df.loc[df.marriage==0, 'marriage'] = 3

# get all uniques values for education
education_vals = np.sort(df.education.unique()).astype('int')
df['education'] = df['education'].astype("category", categories=education_vals, ordered=True)
# combine education levels that are not defined into "Others" category
df.loc[(df.education==0) | (df.education==5) | (df.education==6), 'education'] = 4

# get unique identifiers for sex variable
sex_vals = df.sex.unique().astype('int')
df['sex'] = df['sex'].astype("category", categories=sex_vals, ordered=False)

In [7]:
# add a numeric target instead of string
df.loc[df['default_oct'] == 'yes', 'default'] = 1
df.loc[df['default_oct'] == 'no', 'default'] = 0

In [8]:
# combine the values of -2 and 0 into the same bin as -1 for "duly paid"
df.loc[(df.pay_1==-2) | (df.pay_1==0), 'pay_1'] = -1
df.loc[(df.pay_2==-2) | (df.pay_2==0), 'pay_2'] = -1
df.loc[(df.pay_3==-2) | (df.pay_3==0), 'pay_3'] = -1
df.loc[(df.pay_4==-2) | (df.pay_4==0), 'pay_4'] = -1
df.loc[(df.pay_5==-2) | (df.pay_5==0), 'pay_5'] = -1
df.loc[(df.pay_6==-2) | (df.pay_6==0), 'pay_6'] = -1

In [9]:
#impute continuous values by taking their median since the distribution of the variables is very skewed
df.pay_amt5 = df.pay_amt5.fillna(df.pay_amt5.median())
df.pay_amt6 = df.pay_amt6.fillna(df.pay_amt6.median())

df.bill_amt5 = df.bill_amt5.fillna(df.bill_amt5.median())
df.bill_amt6 = df.bill_amt6.fillna(df.bill_amt6.median())

In [10]:
# impute categorical values and keep the imputed values
pay_5_imputed = df.pay_5.median()
pay_6_imputed = df.pay_6.median()

df.pay_5 = df.pay_5.fillna(pay_5_imputed)
df.pay_6 = df.pay_6.fillna(pay_6_imputed)

In [11]:
df.describe()

Unnamed: 0,customer_id,limit_bal,age,pay_1,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default
count,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0,24001.0
mean,12001.0,8351.302029,35.498438,-0.415733,-0.530686,-0.55581,-0.618974,-0.680305,-0.684263,2574.245865,2467.107037,2359.795592,2175.804383,2015.902837,1904.590121,281.079955,298.237073,263.109704,243.367943,237.489563,247.358402,0.221199
std,6928.636241,6475.59245,9.222021,1.143092,1.149124,1.126527,1.074681,1.002289,0.986689,3706.812804,3585.952168,3499.639871,3244.170936,3053.391501,2948.341617,846.819525,1215.481369,899.406651,782.138847,776.511265,840.040275,0.415063
min,1.0,500.0,21.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-8279.0,-3488.0,-3075.0,-8500.0,-4066.0,-10452.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6001.0,2500.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,177.0,147.0,130.0,116.0,93.0,76.0,49.0,41.0,19.0,15.0,13.0,12.0,0.0
50%,12001.0,7000.0,34.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1123.0,1059.0,1007.0,953.0,906.0,855.5,105.0,100.0,90.0,75.0,75.0,75.0,0.0
75%,18001.0,12000.0,41.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3375.0,3202.0,3015.0,2752.0,2495.0,2370.0,250.0,250.0,227.0,200.0,200.0,200.0,0.0
max,24001.0,50000.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,48225.0,49196.0,83204.0,44579.0,46358.0,48083.0,43677.0,84212.0,44802.0,31050.0,21326.0,26433.0,1.0


In [12]:
# drop customer id since it isnt a predictor and the original target (since now we recoded it)
del df['customer_id']
del df['default_oct']

In [50]:
# try to get closer to equal split between defaults and non defaults
# undersampling non-events
yes = df.loc[df['default'] == 1]
no = df.loc[df['default'] == 0]

chosen_no = no.sample(n=7500, random_state=12345)

df_reshaped = yes.append(chosen_no)
df_reshaped = df

# split into training and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_reshaped.ix[:, df.columns != 'default'], df_reshaped['default'], test_size=0.8, random_state=0)



In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# gradient boosting
param_grid_gb = { 
    'n_estimators': [700, 1250],
    'max_features': ['auto', 'log2'],
    'max_depth': [3,5,7,9],
    'learning_rate': [0.01]  
}

gbm = GradientBoostingClassifier()
gs_cv_gbm = GridSearchCV(estimator=gbm,param_grid=param_grid_gb, scoring='log_loss', n_jobs=11, cv=5, verbose=2)

In [16]:
# fit the model
gs_cv_gbm.fit(X_train,y_train)
# calculate accuracy
preds = gs_cv_gbm.predict_proba(X_test)[:,1]
log_gb = log_loss(y_test, preds)
acc_gb = math.exp(-log_gb)
# print log loss and accuracy
print(log_gb, acc_gb)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=11)]: Done  19 tasks      | elapsed:   21.7s
[Parallel(n_jobs=11)]: Done  80 out of  80 | elapsed:  3.4min finished


(0.43524997975891749, 0.6471028842346632)


In [18]:
# TRY A DIFFERENT MODEL

In [21]:
# random forest
param_grid_rf = { 
    'n_estimators': [250,500, 750],
    'max_features': ['auto', 'log2'],
    'max_depth': [3,5,7,9],
    'oob_score':[True],
    'class_weight':['balanced_subsample',None],
    'criterion':['gini','entropy']
}

# since scoring is done on OOB observations - no need for CV
rmf = RandomForestClassifier()
gs_rf = GridSearchCV(estimator=rmf,param_grid=param_grid_rf, scoring='log_loss', n_jobs=11, verbose=2)

In [22]:
# build the model
gs_rf.fit(X_train,y_train)
# get the validation log loss
preds_rf = gs_rf.predict_proba(X_test)[:,1]
log_rf = log_loss(y_test, preds_rf)
acc = math.exp(-log_rf)
# print log loss and accuracy
print(log_rf, acc)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=11)]: Done  19 tasks      | elapsed:   20.8s
[Parallel(n_jobs=11)]: Done 140 tasks      | elapsed:  2.4min
[Parallel(n_jobs=11)]: Done 288 out of 288 | elapsed:  4.5min finished


(0.43750415788812091, 0.6456458418991347)


In [74]:
# neural network
from sklearn.neural_network import MLPClassifier
import ipykernel
param_grid_nn = { 
   'hidden_layer_sizes':[(25,20,15),(100,50,25,15,5)],
   'activation':['logistic','tanh', 'relu'],
    'solver':['lbfgs', 'adam'],
    'alpha':[0.0001,0.00005],
   'learning_rate':['constant','adaptive'],
   'max_iter':[250],
   'random_state':[2415],
   'verbose' : [True],
   'early_stopping' : [True]
}

nnet = MLPClassifier()
gs_nn = GridSearchCV(estimator=nnet,param_grid=param_grid_nn, scoring='accuracy', n_jobs=11, verbose=3)

gs_nn.fit(X_train,y_train)
#gs_nn.get_params().keys()



Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=11)]: Done  10 tasks      | elapsed:   12.1s
[Parallel(n_jobs=11)]: Done 106 tasks      | elapsed:  1.7min


Iteration 1, loss = 0.54995451
Validation score: 0.785417
Iteration 2, loss = 0.52941041
Validation score: 0.785417
Iteration 3, loss = 0.52632000
Validation score: 0.785417
Iteration 4, loss = 0.52596471
Validation score: 0.785417
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


[Parallel(n_jobs=11)]: Done 144 out of 144 | elapsed:  1.9min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=11,
       param_grid={'max_iter': [250], 'verbose': [True], 'random_state': [2415], 'hidden_layer_sizes': [(25, 20, 15), (100, 50, 25, 15, 5)], 'alpha': [0.0001, 5e-05], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['lbfgs', 'adam'], 'learning_rate': ['constant', 'adaptive'], 'early_stopping': [True]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=3)

In [68]:
# get the validation log loss
preds_nnet = gs_nn.predict_proba(X_test)[:,1]
log_nnet = log_loss(y_test, preds_nnet)
acc_nnet = math.exp(-log_nnet)
print(log_nnet, acc_nnet)

(0.51760863386285139, 0.5959439655544209)
