# First iteration

## Part 1: Get data and build model

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [2]:
def get_data(data_dir='D:\\Backups\\StemData\\',
             orig_file='sample_orig_2016.txt',
             svcg_file='sample_svcg_2016.txt'):
    """Load the origination and monthly-performance files and merge them per loan.

    Both files are read as pipe-delimited text without a header row, given the
    column names listed below, and inner-joined on ``loan_num`` so that every
    output row is one (loan, reporting month) observation.

    Parameters
    ----------
    data_dir : str
        Directory holding the two input files. Defaults to the original
        hard-coded location.
    orig_file : str
        File name of the origination file (26 columns).
    svcg_file : str
        File name of the monthly-performance file (23 columns).

    Returns
    -------
    pandas.DataFrame
        Merged frame without ``seller_name``, ``servicer_name``,
        ``first_pmt_date`` and ``mat_date``; ``credit_score`` is numeric,
        ``yearmon`` is a datetime, rows lacking any required feature are
        removed, and every remaining missing value is replaced by 0.
    """
    import os  # function-scope import: the notebook's import cell does not provide it

    orig_columns = [
        'credit_score', 'first_pmt_date', 'first_time', 'mat_date', 'msa', 'mi_perc', 'units',
        'occ_status', 'ocltv', 'odti', 'oupb', 'oltv', 'oint_rate', 'channel', 'ppm', 'fixed_rate',
        'state', 'prop_type', 'zip', 'loan_num', 'loan_purpose', 'oterm', 'num_borrowers',
        'seller_name', 'servicer_name', 'exceed_conform',
    ]
    svcg_columns = [
        'loan_num', 'yearmon', 'curr_upb', 'curr_delinq', 'loan_age', 'remain_months', 'repurchased',
        'modified', 'zero_bal', 'zero_date', 'curr_rate', 'curr_def_upb', 'ddlpi', 'mi_rec',
        'net_proceeds', 'non_mi_rec', 'exp', 'legal_costs', 'maint_exp', 'tax_insur', 'misc_exp',
        'loss', 'mod_exp',
    ]
    # Every observation must carry all of these features.
    required = ['credit_score', 'ocltv', 'odti', 'oltv', 'oint_rate', 'curr_upb']
    unused = ['seller_name', 'servicer_name', 'first_pmt_date', 'mat_date']

    orig = pd.read_csv(os.path.join(data_dir, orig_file), sep='|', header=None)
    orig.columns = orig_columns  # raises if the file does not have exactly 26 columns

    svcg = pd.read_csv(os.path.join(data_dir, svcg_file), sep='|', header=None)
    svcg.columns = svcg_columns  # raises if the file does not have exactly 23 columns

    data = (
        pd.merge(orig, svcg, on='loan_num', how='inner')
        .drop(columns=unused)
        # Coerce BEFORE dropping rows: an unparsable credit score becomes NaN and
        # is removed with the other incomplete rows instead of being turned into 0.
        .assign(credit_score=lambda df: pd.to_numeric(df['credit_score'], errors='coerce'))
        .dropna(subset=required, how='any')
        .assign(yearmon=lambda df: pd.to_datetime(df['yearmon'], format='%Y%m'))
        # A scalar fill does not need an axis; omitting axis=1 avoids the
        # transpose-based code path of recent pandas, which can turn a
        # mixed-dtype frame into object columns.
        .fillna(value=0)
    )

    return data

In [3]:
# Load the merged loan-level frame, then check whether any missing value is left.
raw = get_data()
np.any(raw.isnull().values)

False

In [4]:
# Order every loan's monthly records chronologically and index the frame by
# (loan_num, yearmon). The previous sort_values(...).groupby(...) expression
# discarded its result, so the frame was never actually sorted; the per-loan
# shift used downstream relies on this order.
# The guard keeps the cell re-runnable: once the keys are the index they are
# no longer columns, and a second set_index would raise KeyError.
# TODO: consider moving this step into process_data.
if list(raw.index.names) != ['loan_num', 'yearmon']:
    raw = raw.sort_values(['loan_num', 'yearmon']).set_index(['loan_num', 'yearmon'])

In [5]:
# Re-check for missing values now that the frame has been re-indexed.
np.any(raw.isnull().values)

False

In [8]:
def process_data(data):
    """Turn the indexed loan frame into a feature matrix and binarized targets.

    The target is ``curr_delinq``. A ``prev_delinq`` feature is added by
    shifting ``curr_delinq`` one row down within each group of index level 0,
    so the first row of every group has no predecessor and receives 0.
    The categorical columns are one-hot encoded and the resulting matrix is
    scaled without centering.

    The input frame is left untouched, so the function can be called again on
    the same frame (the earlier in-place version removed ``curr_delinq`` from
    the caller's frame and failed with KeyError on a second call).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index level 0 identifies the loan and which contains
        ``curr_delinq`` plus the categorical columns encoded below.
        Rows are assumed to be in chronological order within each loan —
        TODO confirm at the call site.

    Returns
    -------
    X : numpy.ndarray
        Output of ``scale(..., with_mean=False)`` on the encoded features.
    y : numpy.ndarray
        Output of ``label_binarize`` for classes 0, 1, 2 and 3.
        NOTE(review): how values outside these four classes are encoded is
        decided by ``label_binarize`` — confirm this is what is wanted.
    """
    categorical = ['msa', 'first_time', 'occ_status', 'channel', 'ppm', 'fixed_rate',
                   'state', 'prop_type', 'loan_purpose', 'exceed_conform', 'repurchased']

    y = data['curr_delinq']

    # assign() returns a new frame, so the caller's `data` is never modified.
    features = data.assign(prev_delinq=data.groupby(level=0)['curr_delinq'].shift(1))
    print(features['prev_delinq'].isnull().sum())  # rows without a previous observation

    # Scalar fill needs no axis; axis=1 makes recent pandas transpose the frame.
    features = features.fillna(value=0).drop(columns=['curr_delinq'])
    print(y.shape)

    dummy = pd.get_dummies(features, columns=categorical).values
    y = label_binarize(y, classes=[0, 1, 2, 3])
    X = scale(dummy, with_mean=False)
    return X, y

In [9]:
# Build the scaled feature matrix (train) and the binarized delinquency targets (target).
train, target = process_data(raw)

24981
(203642,)


In [10]:
# Print the feature-matrix shape, the matrix itself, and the target shape, one per line.
print(train.shape, train, target.shape, sep='\n')
#raw.curr_delinq.value_counts()

(203642, 510)
[[ 16.29859015   0.           4.01181039 ...,  13.85781488   0.           0.        ]
 [ 16.29859015   0.           4.01181039 ...,  13.85781488   0.           0.        ]
 [ 16.29859015   0.           4.01181039 ...,  13.85781488   0.           0.        ]
 ..., 
 [ 16.03500109   0.           4.01181039 ...,  13.85781488   0.           0.        ]
 [ 16.38645317   2.16318079   4.01181039 ...,  13.85781488   0.           0.        ]
 [ 17.66046695   2.59581694   4.01181039 ...,  13.85781488   0.           0.        ]]
(203642, 4)


In [11]:
def gridSearch_nn(X, y, random_state=None, n_jobs=6, n_splits=5, test_size=0.2,
                  hidden_layer_sizes=((519, 363, 363, 363, 363),)):
    """Grid-search an SGD-trained MLP classifier and return the best model.

    The search scores candidates by ROC AUC over stratified shuffle splits.
    With the default arguments the behaviour is the same as the earlier
    hard-coded version; the keyword parameters only expose what used to be
    constants.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Targets passed unchanged to ``GridSearchCV.fit``.
    random_state : int or None
        Seed forwarded to both the classifier and the splitter. ``None``
        (default) keeps the previous unseeded behaviour; pass an integer to
        make a run repeatable.
    n_jobs : int
        Number of parallel jobs used by the grid search.
    n_splits : int
        Number of shuffle splits.
    test_size : float
        Fraction of the data held out in each split.
    hidden_layer_sizes : sequence of tuples
        Candidate architectures; each tuple is one ``hidden_layer_sizes``
        setting to evaluate.

    Returns
    -------
    clf : MLPClassifier
        ``best_estimator_`` of the search.
    best_score : float
        ``best_score_`` of the search.
    mat : numpy.ndarray
        ``clf.predict_proba(X)``. Note that this is computed on the same ``X``
        the search was fitted on, so these are in-sample probabilities.
    """
    mlp = MLPClassifier(solver='sgd', alpha=1e-5, shuffle=True, learning_rate='invscaling',
                        verbose=True, random_state=random_state)
    parameters = {'hidden_layer_sizes': list(hidden_layer_sizes)}
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                 random_state=random_state)
    gs = GridSearchCV(estimator=mlp, param_grid=parameters, n_jobs=n_jobs, cv=sss,
                      scoring='roc_auc', verbose=5)
    gs.fit(X, y)

    clf = gs.best_estimator_
    print(clf)
    print(gs.best_score_)

    mat = clf.predict_proba(X)
    print(mat)

    return clf, gs.best_score_, mat


# Run the search only when this code executes as the main module
# (guard presumably kept because the search runs with several worker processes).
if __name__ == '__main__':
    clf, score, mat = gridSearch_nn(X=train, y=target)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=6)]: Done   2 out of   5 | elapsed:  8.3min remaining: 12.5min
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:  8.7min finished


Iteration 1, loss = 0.03741315
Iteration 2, loss = 0.02926684
Iteration 3, loss = 0.02915577
Iteration 4, loss = 0.02907879
Iteration 5, loss = 0.02901845
Iteration 6, loss = 0.02896933
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(519, 363, 363, 363, 363),
       learning_rate='invscaling', learning_rate_init=0.001, max_iter=200,
       momentum=0.9, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)
0.827419046077
[[  9.99150020e-01   7.73622220e-04   7.25456295e-05   2.70122254e-05]
 [  9.99144372e-01   7.78282464e-04   7.33733577e-05   2.73359475e-05]
 [  9.99138413e-01   7.83410700e-04   7.41925811e-05   2.77180000e-05]
 ..., 
 [  9.98991775e-01   8.32815388e-04 

In [12]:
# Show the probabilities returned by gridSearch_nn (one row per observation, one column per class).
print(mat)

[[  9.99150020e-01   7.73622220e-04   7.25456295e-05   2.70122254e-05]
 [  9.99144372e-01   7.78282464e-04   7.33733577e-05   2.73359475e-05]
 [  9.99138413e-01   7.83410700e-04   7.41925811e-05   2.77180000e-05]
 ..., 
 [  9.98991775e-01   8.32815388e-04   1.01924261e-04   3.96039855e-05]
 [  9.98875235e-01   2.20859735e-03   1.06974046e-04   3.22993161e-05]
 [  9.97078941e-01   2.01109207e-03   2.08743697e-04   5.42923340e-05]]


##### Note

Reference for computing and plotting ROC curves (including the multiclass case) with scikit-learn: http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html