# Load & Imputation

In [19]:
"""
Load dataset
"""
import pandas as pd
import numpy as np
# Location of the input CSV, given relative to the notebook's working directory.
path_to_file = "../.csv"
# Read the file into a DataFrame with pandas' default parsing options.
# NOTE(review): the split cell further down takes column 0 as the target, so the
# CSV is presumably stored without a leading index column -- confirm.
data = pd.read_csv(path_to_file)
# Preview the first two rows as the cell output.
data.head(2)

Unnamed: 0,SeriousDlqin2yrs,age,MonthlyIncome,NumberOfDependents,RevolvingUtilizationOfUnsecuredLines,DebtRatio,NumberOfOpenCreditLinesAndLoans,NumberRealEstateLoansOrLines,NumberOfTime30.59DaysPastDueNotWorse,NumberOfTime60.89DaysPastDueNotWorse,NumberOfTimes90DaysLate
0,1,45.0,9120.0,2.0,0.766127,0.802982,13,6,2.0,0.0,0.0
1,0,40.0,2600.0,1.0,0.957151,0.121876,4,0,0.0,0.0,0.0
2,0,38.0,3042.0,0.0,0.65818,0.085113,2,0,1.0,0.0,1.0
3,0,30.0,3300.0,0.0,0.23381,0.03605,5,0,0.0,0.0,0.0
4,0,49.0,63588.0,0.0,0.907239,0.024926,7,1,1.0,0.0,0.0


In [20]:
# Percentage (not raw count) of missing values per column, rounded to 4 decimals.
total_rows = data.shape[0]
nan_share_pct = data.isnull().sum() / total_rows * 100
nan_share_pct.round(4)

SeriousDlqin2yrs                         0.0000
age                                      0.0007
MonthlyIncome                           19.8207
NumberOfDependents                       2.6160
RevolvingUtilizationOfUnsecuredLines     0.0000
DebtRatio                                0.0000
NumberOfOpenCreditLinesAndLoans          0.0000
NumberRealEstateLoansOrLines             0.0000
NumberOfTime30.59DaysPastDueNotWorse     0.1793
NumberOfTime60.89DaysPastDueNotWorse     0.1793
NumberOfTimes90DaysLate                  0.1793
dtype: float64

In [16]:
# IMPUTATION
# Replace every missing value with the mean of its own column.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split performed later, so held-out rows influence the fill values.
column_means = data.mean()
data = data.fillna(column_means)

In [17]:
# Count of NaN per column. Placed after the imputation cell as a sanity check:
# every entry is expected to be 0 once missing values have been filled.
data.isnull().sum()

SeriousDlqin2yrs                        0
age                                     0
MonthlyIncome                           0
NumberOfDependents                      0
RevolvingUtilizationOfUnsecuredLines    0
DebtRatio                               0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfTime30.59DaysPastDueNotWorse    0
NumberOfTime60.89DaysPastDueNotWorse    0
NumberOfTimes90DaysLate                 0
dtype: int64

In [39]:
# Fraction of rows with a missing MonthlyIncome, per target class.
# count() ignores NaN, so 1 - (non-null rows / all rows) is the missing share.
# NOTE(review): only informative when `data` still holds the un-imputed values.
by_target = data.groupby('SeriousDlqin2yrs')
income_present = by_target['MonthlyIncome'].count()
rows_per_class = by_target['SeriousDlqin2yrs'].count()
1 - income_present / rows_per_class

SeriousDlqin2yrs
0    0.200480
1    0.166467
dtype: float64

# Split Prep

In [6]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back for legacy installs. The `cv` alias is kept so
# any other cell that refers to it keeps working.
try:
    import sklearn.model_selection as cv
except ImportError:
    import sklearn.cross_validation as cv

# Target is the first column; every remaining column is used as a feature.
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# 70% of the rows go to training; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = cv.train_test_split(X,
                                                       y,
                                                       train_size=0.7,
                                                       random_state=0)

In [35]:
import sklearn.metrics as met

def get_error(x_train, y_train, x_test, y_test, model, show = True):
    """Fit ``model`` on the training split and return its train/test error.

    The error is ``1 - model.score(X, y)``; for scikit-learn classifiers
    ``score`` is the mean accuracy, which makes this the misclassification rate.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to ``model.fit``.
    x_test, y_test
        Held-out features and labels, used for scoring only.
    model
        Any object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place, so the caller's object is trained after this call.
    show : bool, optional
        When true (the default) both errors are printed.

    Returns
    -------
    list
        ``[train_error, test_error]``.
    """
    model.fit(x_train, y_train)
    train_error = 1 - model.score(x_train, y_train)
    test_error = 1 - model.score(x_test, y_test)
    # NOTE: if AUC is added later, roc_auc_score expects (y_true, y_score),
    # e.g. met.roc_auc_score(y_train, model.predict_proba(x_train)[:, 1]),
    # not (features, labels).
    if show:
        # Parenthesised single-argument form prints identically on Python 2 and 3.
        print("The training error is: %.5f" % train_error)
        print("The test     error is: %.5f" % test_error)
    return [train_error, test_error]

# PCA & Logit #

In [28]:
from sklearn.decomposition import PCA
# Project the features onto the first two principal components. The
# transformation is learned on the training split and then applied to the
# test split.
# NOTE(review): the inputs are not standardised first, so the components are
# presumably dominated by the largest-scale columns -- confirm this is intended.
pca = PCA(n_components=2)
x_train2 = pca.fit_transform(x_train)
x_test2 = pca.transform(x_test)

In [29]:
# Principal axes in feature space: one row per component, one column per input feature.
pca.components_

array([[ -3.79083751e-05,  -9.99999611e-01,  -5.20498879e-06,
         -1.57454232e-04,   8.66249821e-04,  -3.17494032e-05,
         -1.01282992e-05,   6.12813315e-08,   2.94429767e-07,
          6.68538152e-07],
       [  2.82591776e-04,   8.66057739e-04,  -3.64120295e-05,
          1.09983067e-03,   9.99998951e-01,   2.11674910e-04,
          1.07646894e-04,   1.16046638e-06,  -1.13163082e-06,
         -4.26586031e-06]])

In [30]:
# Fraction of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([ 0.98494995,  0.01460915])

In [31]:
# Per-feature mean estimated during fit; it is subtracted before projecting.
pca.mean_ 

array([  5.22926314e+01,   6.66764077e+03,   7.55302477e-01,
         6.34003875e+00,   3.51009652e+02,   8.45268571e+00,
         1.01940000e+00,   2.46877863e-01,   6.52162409e-02,
         9.04831361e-02])

In [36]:
#Logistic regression
import sklearn.linear_model as lm
# Large C means weak regularisation; fit on the two PCA components only.
logit = lm.LogisticRegression(C=1e5)
get_error(x_train=x_train2, y_train=y_train,
          x_test=x_test2, y_test=y_test,
          model=logit)

The training error is: 0.06658
The test     error is: 0.06744


[0.066580952380952407, 0.067444444444444418]

In [53]:
# Baseline: training error obtained by predicting class 0 for every sample.
zeroes = len(y_train) * [0]
baseline_accuracy = met.accuracy_score(y_train, zeroes)
1 - baseline_accuracy

0.066580952380952407

# Logit

In [63]:
# Logistic regression on the full feature set (large C, i.e. weak regularisation).
logit = lm.LogisticRegression(C=1e5)
get_error(x_train=x_train, y_train=y_train,
          x_test=x_test, y_test=y_test,
          model=logit)

The training error is: 0.06358
The test     error is: 0.06436


[0.063580952380952405, 0.06435555555555561]

In [57]:
# Logistic regression with C chosen by 3-fold cross-validation over
# 100 log-spaced candidates between 1e-5 and 1e5.
candidate_cs = np.logspace(-5, 5, 100)
logit = lm.LogisticRegressionCV(Cs=candidate_cs, cv=3)
get_error(x_train=x_train, y_train=y_train,
          x_test=x_test, y_test=y_test,
          model=logit)

The training error is: 0.06368
The test     error is: 0.06464


[0.063676190476190442, 0.064644444444444393]

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the old
# sklearn.grid_search module was removed in 0.20. The `grid_search` name is kept
# bound so any other cell that refers to it keeps working.
try:
    from sklearn import model_selection as grid_search
except ImportError:
    from sklearn import grid_search

# Exhaustive search over intercept on/off and 100 log-spaced C values,
# scored by 3-fold cross-validated accuracy on the training split.
logit = lm.LogisticRegression()
para_grid = [{'fit_intercept': [False, True], 'C': np.logspace(-5, 5, 100)}]
para_search = grid_search.GridSearchCV(logit, para_grid, scoring='accuracy', cv=3).fit(x_train, y_train)

# Single-argument print(...) calls produce the same text on Python 2 and 3.
print("Best score: %s" % (para_search.best_score_,))
print("Best params: %s" % (para_search.best_params_,))
# GridSearchCV refits the best configuration on the whole training split,
# so best_estimator_ can be used without fitting it again.
logit_best = para_search.best_estimator_
# Number of coefficients with magnitude below 1e-4 (0 means none are ~zero).
print(np.sum(abs(logit_best.coef_) < 1e-4))
print("Train error: %s" % (1 - logit_best.score(x_train, y_train),))
print("Test error: %s" % (1 - logit_best.score(x_test, y_test),))

# Various model defaults

In [59]:
from sklearn import linear_model, neighbors, naive_bayes
# One untuned instance of each classifier family for a quick comparison.
# NOTE(review): MultinomialNB is designed for count-like features; its score
# in the comparison below is far lower than the others -- confirm it belongs here.
logit, knn, gnb, mnb = (
    linear_model.LogisticRegression(),
    neighbors.KNeighborsClassifier(),
    naive_bayes.GaussianNB(),
    naive_bayes.MultinomialNB(),
)

In [61]:
# Fit every default model on the training split, in the original order.
for estimator in (logit, knn, gnb):
    estimator.fit(x_train, y_train)
# Kept as the last expression so the cell still displays the fitted MultinomialNB.
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [62]:
# Accuracy of each fitted model, in the order logit, knn, gnb, mnb.
classifiers = [logit, knn, gnb, mnb]
score_train = [clf.score(x_train, y_train) for clf in classifiers]
score_test = [clf.score(x_test, y_test) for clf in classifiers]
# Row 0 holds train scores, row 1 holds test scores.
# Single-argument print(...) works on both Python 2 and Python 3.
print(np.array([score_train, score_test]))

[[ 0.93640952  0.93585714  0.92408571  0.21490476]
 [ 0.93566667  0.93046667  0.92437778  0.21104444]]
