## Classification Bakeoff


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.decomposition import PCA


In [4]:
data = pd.read_csv('train_data.csv')
data.drop(['ID'], axis=1, inplace=True)

In [5]:
data.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [6]:
data.shape

(23999, 24)

In [7]:
X = data.drop('default payment next month', axis=1)
y = data['default payment next month']

In [None]:
data

## First Bad Model

In [67]:
Xbad = data['PAY_0']
ybad = data['default payment next month']

In [68]:
logreg = LogisticRegression()
logreg.fit(np.array(Xbad).reshape(-1,1), ybad)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [70]:
print(classification_report(logreg.predict(np.array(Xbad).reshape(-1,1)), ybad))

              precision    recall  f1-score   support

           0       0.96      0.83      0.89     21501
           1       0.33      0.70      0.45      2498

    accuracy                           0.82     23999
   macro avg       0.64      0.77      0.67     23999
weighted avg       0.89      0.82      0.85     23999



### Engineering

In [9]:
# def engineer(df):
#     df = df.copy()
#     df['bill_total'] = df['BILL_AMT1'] + df['BILL_AMT2'] + df['BILL_AMT3'] + df['BILL_AMT4'] + df['BILL_AMT5'] + df['BILL_AMT6']
#     df['pay_total'] = df['PAY_AMT1'] + df['PAY_AMT2'] + df['PAY_AMT3'] + df['PAY_AMT4'] + df['PAY_AMT5'] + df['PAY_AMT6']
#     df['debt'] = df['bill_total'] - df['pay_total']
#     df['months_behind'] = df['PAY_0'] + df['PAY_2'] + df['PAY_3'] + df['PAY_4'] + df['PAY_5'] + df['PAY_6']
#     df['debt_ratio'] = df['debt'] / df['LIMIT_BAL']
    
#     ohe = OneHotEncoder(sparse = False)
#     ohe_categories = pd.DataFrame(ohe.fit_transform(df[['PAY_0']]), columns = ohe.get_feature_names(['PAY_0']))
#     df = pd.concat([df, ohe_categories], axis = 1)
#     return df

data = pd.read_csv('train_data.csv')
data.drop(['ID'], axis=1, inplace=True)

ohe = OneHotEncoder(sparse = False)
ohe_categories = pd.DataFrame(ohe.fit_transform(data[['SEX', 'MARRIAGE', 'EDUCATION', 'PAY_0']]))
X = pd.concat([X, ohe_categories], axis = 1)
X['ratio'] = X['BILL_AMT1'] / X['LIMIT_BAL']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
X_train

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,15,16,17,18,19,20,21,22,23,ratio
5326,100000,1,1,2,32,0,0,0,0,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.838310
18251,80000,1,3,2,48,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000025
962,90000,2,2,2,25,0,-1,-1,-1,-1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307311
9075,160000,2,2,1,60,-2,-2,-2,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100375
8498,60000,2,3,3,48,0,0,2,2,-1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.885267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15859,10000,1,3,1,41,1,4,3,2,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.071100
22670,70000,2,1,2,26,1,2,2,2,2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401514
4906,90000,1,2,1,31,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.781433
15179,180000,2,5,2,30,-2,-2,-2,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [12]:
features = [0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,18,19,20,21,22,23, 'AGE', 'ratio', 'LIMIT_BAL', 'PAY_AMT1']

X2_train = X_train[features]

ss = StandardScaler()
X2_train = ss.fit_transform(X2_train)

pca = PCA()
X2_train = pca.fit_transform(X2_train)

model2 = RandomForestClassifier()
model2.fit(X2_train, y_train)

print(classification_report(y_train, model2.predict(X2_train)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13986
           1       1.00      1.00      1.00      4013

    accuracy                           1.00     17999
   macro avg       1.00      1.00      1.00     17999
weighted avg       1.00      1.00      1.00     17999



In [13]:
X2_test = X_test[features]
X2_test = ss.transform(X2_test)
X2_test = pca.transform(X2_test)

print(classification_report(y_test, model2.predict(X2_test)))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      4705
           1       0.57      0.32      0.41      1295

    accuracy                           0.80      6000
   macro avg       0.70      0.63      0.64      6000
weighted avg       0.78      0.80      0.78      6000



## Model 3

In [99]:
gbc = GradientBoostingClassifier(n_estimators=200, learning_rate=.2, min_samples_split=2, verbose=1)
gbc.fit(X2_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9907           55.30s
         2           0.9565           56.22s
         3           0.9351           54.42s
         4           0.9206           52.89s
         5           0.9105           52.17s
         6           0.9020           51.75s
         7           0.8959           51.23s
         8           0.8914           50.70s
         9           0.8875           50.11s
        10           0.8843           49.69s
        20           0.8651           44.53s
        30           0.8544           40.97s
        40           0.8449           37.73s
        50           0.8368           35.28s
        60           0.8300           32.55s
        70           0.8237           30.12s
        80           0.8171           27.71s
        90           0.8093           25.41s
       100           0.8036           23.04s
       200           0.7443            0.00s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [100]:
print(classification_report(y_train, gbc.predict(X2_train)))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91     14028
           1       0.79      0.42      0.55      3971

    accuracy                           0.85     17999
   macro avg       0.82      0.69      0.73     17999
weighted avg       0.84      0.85      0.83     17999



In [101]:
print(classification_report(y_test, gbc.predict(X2_test)))

              precision    recall  f1-score   support

           0       0.83      0.95      0.88      4663
           1       0.63      0.33      0.43      1337

    accuracy                           0.81      6000
   macro avg       0.73      0.64      0.66      6000
weighted avg       0.79      0.81      0.78      6000



## Model 4

In [89]:
model4 = LogisticRegression()
model4.fit(X2_train, y_train)

print(classification_report(y_train, model4.predict(X2_train)))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89     14028
           1       0.69      0.33      0.44      3971

    accuracy                           0.82     17999
   macro avg       0.76      0.64      0.67     17999
weighted avg       0.80      0.82      0.79     17999



In [92]:
print(classification_report(y_test, model4.predict(X2_test)))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      4663
           1       0.71      0.32      0.44      1337

    accuracy                           0.82      6000
   macro avg       0.77      0.64      0.67      6000
weighted avg       0.81      0.82      0.79      6000



## Model 5

In [14]:
estimators = [('knn', KNeighborsClassifier(n_neighbors = 20)), ('rf', RandomForestClassifier(n_estimators = 100)),('log', LogisticRegression(solver = 'liblinear')),('grad', GradientBoostingClassifier())]

stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(), cv = 5, verbose=1)
stack.fit(X2_train, y_train)

print(classification_report(y_train, stack.predict(X2_train)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   48.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


              precision    recall  f1-score   support

           0       0.84      0.96      0.89     13986
           1       0.72      0.34      0.47      4013

    accuracy                           0.82     17999
   macro avg       0.78      0.65      0.68     17999
weighted avg       0.81      0.82      0.80     17999



In [15]:
print(classification_report(y_test, stack.predict(X2_test)))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4705
           1       0.68      0.31      0.43      1295

    accuracy                           0.82      6000
   macro avg       0.76      0.64      0.66      6000
weighted avg       0.80      0.82      0.79      6000



## tweak Model 5

In [16]:
features2 = [14,15,16,17,18,19,20,21,22,23, 'ratio', 'LIMIT_BAL', 'PAY_AMT1', 'PAY_AMT2']
X3_train = X_train[features2]

stack.fit(X3_train, y_train)
print(classification_report(y_train, stack.predict(X3_train)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.5s finished


              precision    recall  f1-score   support

           0       0.84      0.96      0.90     13986
           1       0.75      0.37      0.50      4013

    accuracy                           0.83     17999
   macro avg       0.79      0.67      0.70     17999
weighted avg       0.82      0.83      0.81     17999



In [17]:
X3_test = X_test[features2]

print(classification_report(y_test, stack.predict(X3_test)))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4705
           1       0.68      0.33      0.44      1295

    accuracy                           0.82      6000
   macro avg       0.76      0.64      0.67      6000
weighted avg       0.80      0.82      0.80      6000



## Experimentation

In [18]:
svc = SVC()
svc.fit(X3_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
print(classification_report(y_train, svc.predict(X3_train)))

              precision    recall  f1-score   support

           0       0.78      1.00      0.87     13986
           1       0.00      0.00      0.00      4013

    accuracy                           0.78     17999
   macro avg       0.39      0.50      0.44     17999
weighted avg       0.60      0.78      0.68     17999



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print(classification_report(y_test, stack.predict(X3_test)))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4705
           1       0.68      0.33      0.44      1295

    accuracy                           0.82      6000
   macro avg       0.76      0.64      0.67      6000
weighted avg       0.80      0.82      0.80      6000



## Export Preds


In [21]:
holdout = pd.read_csv('test_features.csv')
holdout.drop(['ID'], axis=1, inplace=True)

In [22]:

ohe = OneHotEncoder(sparse = False)
ohe_categories = pd.DataFrame(ohe.fit_transform(holdout[['SEX', 'MARRIAGE', 'EDUCATION', 'PAY_0']]))
holdout = pd.concat([holdout, ohe_categories], axis = 1)
holdout['ratio'] = holdout['BILL_AMT1'] / holdout['LIMIT_BAL']

features2 = [14,15,16,17,18,19,20,21,22,23, 'ratio', 'LIMIT_BAL', 'PAY_AMT1', 'PAY_AMT2']

holdout = holdout[features2]

In [24]:
preds.to_csv('credit_default_preds_kdmb.csv', index=False, header=False)