In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE



In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_features.csv')

In [3]:
train = train.replace({'EDUCATION': 6}, 5)
train = train.replace({'EDUCATION': 0}, 5)
train = train.replace({'MARRIAGE': 0}, 3)
train = train.drop('ID', axis=1)

test = test.replace({'EDUCATION': 6}, 5)
test = test.replace({'EDUCATION': 0}, 5)
test = test.replace({'MARRIAGE': 0}, 3)
test = test.drop('ID', axis=1)
test = test.drop('Unnamed: 0', axis=1)

In [16]:
len(test)

6001

In [167]:
# for i in train.columns:
#     print(i, train[i].value_counts(), '\n\n')

In [72]:
train['PAY_2'].value_counts()

 0    12613
-1     4851
 2     3131
-2     3006
 3      258
 4       77
 1       19
 5       18
 7       16
 6        9
 8        1
Name: PAY_2, dtype: int64

In [71]:
test['PAY_2'].value_counts()

 0    3117
-1    1199
 2     796
-2     776
 3      68
 4      22
 1       9
 5       7
 7       4
 6       3
Name: PAY_2, dtype: int64

In [4]:
X_train = train.drop('default payment next month', axis=1)
y_train = train['default payment next month']

In [89]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(classification_report(y_train, rf.predict(X_train)))

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(classification_report(y_train, knn.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18691
           1       1.00      1.00      1.00      5308

    accuracy                           1.00     23999
   macro avg       1.00      1.00      1.00     23999
weighted avg       1.00      1.00      1.00     23999



KeyboardInterrupt: 

In [5]:
categorical_feature_names = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

In [111]:
def encode_and_concat_feature_train(X_train, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train[[feature_name]]
    ohe.fit(single_feature_df)

    # call helper function that actually encodes the feature and concats it
    X_train = encode_and_concat_feature(X_train, feature_name, ohe)
    
    return ohe, X_train

In [112]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0], index=X.index)
    
    # drop the old feature from X and concat the new one-hot encoded df
    X = X.drop(feature_name, axis=1)
    X = pd.concat([X, ohe_df], axis=1)
    
    return X

In [119]:
train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,350000,1,1,2,37,-2,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,50000,2,2,1,37,2,2,2,0,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,50000,2,1,2,23,-1,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,20000,1,3,1,56,0,0,0,0,2,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,110000,2,2,2,32,0,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0


In [113]:
encoders = {}

for categorical_feature in categorical_feature_names:
    ohe, X_train = encode_and_concat_feature_train(X_train, categorical_feature)
    encoders[categorical_feature] = ohe

In [122]:
X_train.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,-2,-1,0,2,3,4,5,6,7,8
0,350000,37,316,316,316,466,466,316,316,316,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50000,37,40994,43594,38222,13026,13268,13497,5500,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50000,23,3430,2478,2299,4800,9810,660,2548,2321,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000,56,10847,12176,12884,13784,13420,13686,1508,1216,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110000,32,108159,106812,108464,108829,110557,106082,5400,5400,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
X_train.columns

Index(['LIMIT_BAL',       'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',  'PAY_AMT1',  'PAY_AMT2',
        'PAY_AMT3',  'PAY_AMT4',  'PAY_AMT5',  'PAY_AMT6',           1,
                 2,           1,           2,           3,           4,
                 5,           1,           2,           3,          -2,
                -1,           0,           1,           2,           3,
                 4,           5,           6,           7,           8,
                -2,          -1,           0,           1,           2,
                 3,           4,           5,           6,           7,
                 8,          -2,          -1,           0,           1,
                 2,           3,           4,           5,           6,
                 7,           8,          -2,          -1,           0,
                 1,           2,           3,           4,           5,
                 6,           7,           8,          -2,      

In [67]:
X_train.shape

(23999, 88)

In [21]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(classification_report(y_train, rf.predict(X_train)))

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(classification_report(y_train, knn.predict(X_train)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18691
           1       1.00      1.00      1.00      5308

    accuracy                           1.00     23999
   macro avg       1.00      1.00      1.00     23999
weighted avg       1.00      1.00      1.00     23999

              precision    recall  f1-score   support

           0       0.83      0.95      0.89     18691
           1       0.66      0.32      0.43      5308

    accuracy                           0.81     23999
   macro avg       0.75      0.64      0.66     23999
weighted avg       0.79      0.81      0.79     23999



In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=42)

In [23]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(classification_report(y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      4657
           1       0.65      0.35      0.46      1343

    accuracy                           0.81      6000
   macro avg       0.74      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000

              precision    recall  f1-score   support

           0       0.79      0.91      0.85      4657
           1       0.37      0.18      0.24      1343

    accuracy                           0.75      6000
   macro avg       0.58      0.54      0.54      6000
weighted avg       0.70      0.75      0.71      6000



In [25]:
classifiers = [RandomForestClassifier(random_state=42), KNeighborsClassifier()]

In [33]:
param_grid_rf = {
    'n_estimators' : [10, 100],
    'criterion' : ['gini', 'entropy'],
    'max_features' : ['auto', 'sqrt', 'log2'],
    'max_depth' : [10, 50, 100],
    'class_weight' : [None, 'balanced', 'balanced_subsample']
}

param_grid_knn = {
    'n_neighbors': [5,10,15],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'n_jobs': [-1]
}

param_grids_list= [param_grid_rf, param_grid_knn]

In [34]:
i=0

In [35]:
for clf in classifiers:
        
    cv_clf = GridSearchCV(estimator=clf, param_grid=param_grids_list[i], cv=5)
    cv_clf.fit(X_train, y_train)
    i+=1
    
    name = clf.__class__.__name__
    print("="*60)
    print(name)
    print('This is the best params', cv_clf.best_params_)

    print('****Results****')    
    print(classification_report(X_train, y_train))
    
    print("="*60)

RandomForestClassifier
This is the best params {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 100}
****Results****


ValueError: Classification metrics can't handle a mix of multiclass-multioutput and binary targets

In [6]:
ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
ohe.fit(X_train[categorical_feature_names])
feature_array_train = ohe.transform(X_train[categorical_feature_names]).toarray()
ohe_df_train = pd.DataFrame(feature_array_train, columns=ohe.get_feature_names(categorical_feature_names), index=X_train.index)

X_train = X_train.drop(categorical_feature_names, axis=1)
X_train = pd.concat([X_train, ohe_df_train], axis=1)

In [7]:
feature_array = ohe.transform(test[categorical_feature_names]).toarray()
ohe_df = pd.DataFrame(feature_array, columns=ohe.get_feature_names(categorical_feature_names), index=test.index)

# drop the old feature from X and concat the new one-hot encoded df
X_test = test.drop(categorical_feature_names, axis=1)
X_test = pd.concat([X_test, ohe_df], axis=1)

In [164]:
X_test

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,240000,44,4221,2188,2701,2427,1104,2362,2188,2701,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50000,41,49782,50741,48468,39203,28913,26636,1816,1753,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20000,41,8777,10461,11583,13079,14546,16149,2000,1600,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,320000,34,-532,-532,-528,-1336,-1336,-1336,0,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,120000,23,64049,65984,66825,68820,69776,71297,3000,2500,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5996,200000,30,182967,194924,75635,94454,60875,41221,15349,10,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,100000,36,78063,79346,77977,79071,76918,81713,3500,3300,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,50000,42,24225,24046,24574,24348,16130,16467,500,1200,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5999,70000,32,70178,71312,71314,71595,70040,68927,2900,2766,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [172]:
X_train

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,350000,37,316,316,316,466,466,316,316,316,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,50000,37,40994,43594,38222,13026,13268,13497,5500,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50000,23,3430,2478,2299,4800,9810,660,2548,2321,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20000,56,10847,12176,12884,13784,13420,13686,1508,1216,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,110000,32,108159,106812,108464,108829,110557,106082,5400,5400,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,50000,24,12806,13490,0,0,0,0,7522,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23995,250000,44,6826,3900,4482,43546,38051,576,3924,4482,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23996,50000,26,41827,42650,45930,44891,47654,48721,1800,4000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23997,110000,38,195437,176420,63142,39854,9293,-1288,5000,10000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
ohe_df_train

Unnamed: 0,SEX_1,SEX_2,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23995,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23996,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23997,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
rf = RandomForestClassifier(class_weight=None, criterion='entropy', max_depth = 10, max_features='auto', n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
y_preds = rf.predict(X_test)

In [174]:
pd.DataFrame(y_preds).to_csv('credit_default_preds_hmdcus.csv', index=False)

In [14]:
test_predictions

array([0, 0, 0, ..., 1, 0, 1])

In [15]:
len(X_test)

6001

In [21]:
y_test = pd.read_csv('chisea_bakeoff_classification_answers.csv',header = None)[1]

In [26]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4673
           1       0.69      0.36      0.47      1328

    accuracy                           0.82      6001
   macro avg       0.77      0.66      0.68      6001
weighted avg       0.81      0.82      0.80      6001

