## MIDS 207 Final Project
### Team Members: Rick Chen, Julian Rippert, Jimmy Zhu
### Model Type: K Nearest Neighbors

In [1]:
# general imports
import pandas as pd
import numpy as np
import warnings

# preprocessing and hyperparameter turning libraries
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

from sklearn.decomposition import PCA

# KNN model and evaluation libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Dataset Description

The objective of this competition is to predict the probability that a customer does not pay back their credit card balance amount in the future based on their monthly customer profile. The target binary variable is calculated by observing 18 months performance window after the latest credit card statement, and if the customer does not pay due amount in 120 days after their latest statement date it is considered a default event.

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables
with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.

# Loading in the Data

In [41]:
df = pd.read_feather('train_data.ftr')

In [42]:
print(df.head())
print(df.shape)

                                         customer_ID        S_2       P_2  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-03-09  0.938477   
1  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-04-07  0.936523   
2  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-05-28  0.954102   
3  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-06-13  0.960449   
4  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-07-16  0.947266   

       D_39       B_1       B_2       R_1       S_3      D_41       B_3  ...  \
0  0.001734  0.008728  1.006836  0.009224  0.124023  0.008774  0.004707  ...   
1  0.005775  0.004925  1.000977  0.006153  0.126709  0.000798  0.002714  ...   
2  0.091492  0.021652  1.009766  0.006817  0.123962  0.007599  0.009422  ...   
3  0.002455  0.013687  1.002930  0.001372  0.117188  0.000685  0.005531  ...   
4  0.002483  0.015190  1.000977  0.007607  0.117310  0.004654  0.009308  ...   

   D_137  D_138     D_139     D_140     D_141  D_142    

We see that our data has over 5.5 million data points with 190 features (excluding our target variable)
As this data is will be too large for processing and modelling will take too long, let's take a small sample of the dataset.
We will take 10% of the dataset for just over 500 thousand datapoints and sort by date (S_2 column)

In [43]:
df = df.sample(frac = 0.1)
df = df.sort_values(by = 'S_2')

In [44]:
# drop the customer ID and date columns as they won't be useful for modeling
df = df.drop(columns = ['customer_ID', 'S_2'])
df.head(2)

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
714456,0.769043,0.505859,0.021439,0.810059,0.007526,0.139648,0.006222,0.086487,,0.051422,...,,,0.005733,0.005043,0.001645,,0.009499,0.005981,0.009201,0
253753,0.941895,0.147339,0.045074,1.005859,0.001395,0.172852,0.009575,0.013054,,0.012032,...,,,0.000599,0.003731,0.007896,,0.003036,0.009445,0.001812,0


In [45]:
#drop additional columns with high nan values per Julian's EDA
del_cols = ['D_42', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29',
'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132',
'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']

df = df.drop(columns = del_cols)



In order to impute values for our non-numerical categorical values, we'll encode them as numerical values

In [51]:
print('D_63 before encoding: ', df['D_63'].value_counts())
print('D_64 before encoding: ', df['D_64'].value_counts())
cat_map = {"D_63":  {"CO": 0, "CR": 1, "CL": 2, "XZ": 3, "XM": 4, "XL": 5},
            "D_64": {"O": 1, "U": 2, "R": 3, '': 4 }}
df = df.replace(cat_map)
print('D_63 after encoding: ', df['D_63'].value_counts())
print('D_64 after encoding:', df['D_64'].value_counts())
print(df['D_63'])

D_63 before encoding:  0    411981
1     92933
2     43964
3      2556
4      1047
5       664
Name: D_63, dtype: int64
D_64 before encoding:  1     292027
2     151401
3      84148
4      21833
-1      3736
Name: D_64, dtype: int64
D_63 after encoding:  0    411981
1     92933
2     43964
3      2556
4      1047
5       664
Name: D_63, dtype: int64
D_64 after encoding: 1     292027
2     151401
3      84148
4      21833
-1      3736
Name: D_64, dtype: int64
714456     0
253753     1
2397731    1
2090355    1
793076     0
          ..
1182267    0
4291363    2
1923456    1
5176163    0
3594238    0
Name: D_63, Length: 553145, dtype: category
Categories (6, int64): [2, 0, 1, 5, 4, 3]


Take a look at the remaining nan values in our dataset. We'll impute the value that is assigned using KNN with 5 neighbors

In [47]:
df.isnull().sum()

P_2        4646
D_39          0
B_1           0
B_2         211
R_1           0
          ...  
D_141     10238
D_143     10238
D_144      4016
D_145     10238
target        0
Length: 158, dtype: int64

In [55]:
df2 = df[['D_63', 'D_64']]
df1 = df.drop(columns = ['D_63', 'D_64'])

columns1 = df1.columns
columns2 = df2.columns
imp1 = SimpleImputer(missing_values=np.NaN)
imp2 = SimpleImputer(missing_values = np.NaN, strategy = 'most_frequent')
df1 = pd.DataFrame(imp.fit_transform(df1))
df2 = pd.DataFrame(imp.fit_transform(df2))
df2.columns = columns2
df1.columns = columns1

  mode = stats.mode(array)


In [10]:
df['D_63'] = d_63
df['D_64'] = d_64
imp=SimpleImputer(missing_values=np.NaN, strategy = 'most_frequent')
df=pd.DataFrame(imp.fit_transform(df))



In [65]:
df_clean = pd.concat([df1, df2], axis = 1)
df_clean.isnull().sum()

P_2       0
D_39      0
B_1       0
B_2       0
R_1       0
         ..
D_144     0
D_145     0
target    0
D_63      0
D_64      0
Length: 158, dtype: int64

In [21]:
df.shape
columns = columns.union(['D_63', 'D_64'])
print(columns)
df.head()

Index(['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_18',
       'B_19',
       ...
       'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8',
       'target'],
      dtype='object', length=158)


Unnamed: 0,B_1,B_10,B_11,B_12,B_13,B_14,B_15,B_16,B_18,B_19,...,S_24,S_25,S_26,S_27,S_3,S_5,S_6,S_7,S_8,target
0,0.889648,0.213135,0.0168,1.001953,0.00658,0.107727,0.004398,0.000887,0.051575,0.006981,...,0.000305,0.009659,0.000241,0.00526,0.001027,0.001487,0.003918,0.0,1,1
1,0.450439,0.266357,0.027985,1.005859,0.006554,0.069641,0.004959,0.009064,0.028763,0.118515,...,0.006416,0.004494,0.002565,0.003717,0.00214,0.006191,0.006012,0.0,1,1
2,0.221802,0.388916,0.211304,0.010834,0.003588,0.496338,0.006454,0.215332,0.493408,0.129883,...,0.001792,0.003218,0.006783,0.001347,0.003843,0.006901,0.003092,1.0,0,1
3,0.984375,0.235596,0.017776,0.48999,0.001529,0.335938,9.4e-05,0.027405,0.005035,0.001378,...,0.007076,0.002264,0.007904,0.001025,0.000427,0.001878,0.009796,0.0,0,1
4,0.552734,0.503418,0.359863,0.039307,0.001715,0.165405,0.008209,0.331543,0.009781,0.506836,...,0.001105,0.004333,0.007607,0.002146,0.006802,0.006981,0.006317,1.0,0,1


In [67]:

#df.columns = columns

X = df_clean.drop(columns = ['target'])
Y = df_clean['target']

In [68]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, shuffle = False, random_state = 4)

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.25, shuffle = False, random_state = 4)

In [69]:
print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)

(331887, 157) (331887,)
(110629, 157) (110629,)


# Feature Engineering and PCA

As KNN models are more efficient with less features, we will want to reduce the number of features in our model. A lot of this analysis was borrowed from Julian Rippert's feature engineering section. 

In [75]:
pca = PCA(n_components = 15)

X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.fit_transform(X_val)
X_test_pca = pca.fit_transform(X_test)

In [76]:
print(X_train_pca.shape, X_val_pca.shape, X_test_pca.shape)

(331887, 15) (110629, 15) (110629, 15)


### Build the model

A KNN model is a very simple model that looks at the distance (generally based off the euclidean distance) from neighboring points. Based off the distance between neighboring points, the model will determine how to classify the value of question by a simple majority. Therefore if we are looking at the nearest 3 neighbors like the example below, and a majority of the closest neighbors are class B, then we will classify this point as class B as well. 

![KNN Model](image.png)

In [77]:
knn = KNeighborsClassifier()
knn.fit(X_train_pca, Y_train)

KNeighborsClassifier()

In [78]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    Y_pred = knn.predict(X_test_pca)
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.74      0.74     82208
         1.0       0.24      0.24      0.24     28421

    accuracy                           0.61    110629
   macro avg       0.49      0.49      0.49    110629
weighted avg       0.61      0.61      0.61    110629



We can run gridsearchcv to cross validate and search for the model with the best hyperparameters. I only adjust the number of neighbors as adding other hyperparameters takes really long (10s of hours). Other hyperparameters woth adjusting include the p value and the leaf node size.  

In [79]:
n_neighbors = list(range(1, 50))
hyperparameters = dict(n_neighbors = n_neighbors)

In [80]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    knn_2 = KNeighborsClassifier()
    clf = GridSearchCV(knn_2, hyperparameters, cv = 10, n_jobs = -1)
    best_model = clf.fit(X_train_pca, Y_train)

In [None]:
print('Best leafsize:' , best_model.best_estimator_.get_params()['leaf_size'])
print('Best n_neighbors:' , best_model.best_estimator_.get_params()['n_neighbors'])
#print('Best p:' , best_model.best_estimator_.get_params()['p'])

Best leafsize: 30
Best n_neighbors: 27


We find that the optimal number number of neighbors for our KNN model is 29. Let's use that to build a KNN model on our training data and test it against our test data.

In [83]:
knn_best = KNeighborsClassifier(leaf_size =30, n_neighbors = 27)
knn_best.fit(X_train_pca, Y_train)
Y_pred = knn_best.predict(X_test_pca)
print(classification_report(Y_test, Y_pred))

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


              precision    recall  f1-score   support

         0.0       0.74      0.84      0.79     82208
         1.0       0.27      0.17      0.21     28421

    accuracy                           0.67    110629
   macro avg       0.51      0.50      0.50    110629
weighted avg       0.62      0.67      0.64    110629



# Submission file on Kaggle

We now want to create the submission file:
The sample submission file asks us to create a csv file with a list of each customer_ID and a corresponding binary value indicating whether or not they will default.

I had trouble running this on my personal computer, so I have put in skeleton code without running it. 

In [2]:
df_test = pd.read_feather('test_data.ftr')
df_test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631348,0.001912,0.010727,0.814453,0.007545,0.168701,0.009972,0.002348,...,,,,,0.004669,,,,0.008278,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.586914,0.005276,0.011024,0.811035,0.001817,0.241333,0.000166,0.009132,...,,,,0.000142,0.00494,0.009018,,0.003695,0.003754,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.608887,0.003326,0.016388,1.004883,0.000114,0.26709,0.004196,0.004192,...,,,,7.4e-05,0.002113,0.004658,,0.003155,0.002155,0.006481
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614746,0.009064,0.021667,0.816406,0.00972,0.188965,0.004124,0.015327,...,,,,0.004742,0.006393,0.00289,,0.006042,0.005207,0.007858
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591797,0.23877,0.01593,0.810547,0.002026,0.180054,0.000731,0.011284,...,,,,0.008133,0.00433,0.008385,,0.001008,0.00742,0.009468


In [None]:
ids = df_test.pop('customer_ID')
del_cols = ['D_42', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17', 'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29',
'D_87', 'D_88', 'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132',
'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142', 'S_2']

df = df.drop(columns = del_cols)

Take the model trained fit on the training data, and apply it to the test data

In [None]:
Y_test_pred = knn.predict(df['target'])
submission_df = pd.concat([ids, Y_test_pred])
submission_df.to_csv(file_name, sep='\t')