In [9]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
from knn_algorithm import * 
from sklearn.model_selection import train_test_split as splitter
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Credit card default CSV.
# The real column names sit in the first data row of the file, so that row is
# promoted to the header and dropped from the body.
# NOTE(review): because the names row is parsed as data, every column keeps its
# values as strings; `pd.read_csv(..., header=1)` would give numeric dtypes, but
# later cells compare labels against '0'/'1' and would need updating with it.
df_raw = pd.read_csv('cc_default.csv')
header_row = df_raw.iloc[0]
df = df_raw.iloc[1:]
df.columns = header_row
len(df)

30000

In [3]:
# Peek at the first five records to sanity-check the promoted header.
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


This is a fairly common data set with 30,000 credit card customers, each labelled with whether or not they defaulted on their payment the following month. A more detailed summary of this data set is provided here: [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients). We'll use it to see how our kNN made from scratch does!

In [63]:
# A quick note: check how many rows fall into each class of the target
# before doing any modelling.
df.loc[:, 'default payment next month'].value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

Remember to consider the fact that there is roughly a 3.5:1 imbalance of non-defaults to defaults (23,364 vs. 6,636). Always consider this before you start diving in.

In [4]:
#We want to use mainly non-categorical data so we'll just pick some ordinary columns.
#List the available columns so we can pick a few relevant ones:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object', name=0)

In [19]:
# Feature columns passed to the model: every column except ID, SEX, MARRIAGE
# and the target itself.
numerical_columns = [
    'LIMIT_BAL', 'EDUCATION', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
# Unscaled feature matrix and the label column, both sliced from the same frame.
data_raw = df.loc[:, numerical_columns]
target = df.loc[:, 'default payment next month']

In [16]:
# Remember to scale data: standardise every feature column.
# NOTE(review): the scaler is fitted on the full data set here, i.e. before the
# train/test split further down, so statistics from the held-out rows leak into
# the training features. Fitting on the training rows only would avoid that.
scaler = StandardScaler()
scaler.fit(data_raw)
data_scaled = pd.DataFrame(
    scaler.transform(data_raw),
    columns=data_raw.columns,
    # Keep the original row labels. Without this the scaled frame is re-indexed
    # from 0 and no longer lines up by label with other slices of the same
    # source frame (such as the target column).
    index=data_raw.index,
)

In [17]:
# Inspect the first five standardised rows.
data_scaled.head(n=5)

Unnamed: 0,LIMIT_BAL,EDUCATION,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,-1.13672,0.185828,-1.24602,1.794564,1.782348,-0.696663,-0.666599,-1.530046,-1.486041,-0.642501,...,-0.667993,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382
1,-0.365981,0.185828,-1.029047,-0.874991,1.782348,0.138865,0.188746,0.234917,1.992316,-0.659219,...,-0.639254,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878
2,-0.597202,0.185828,-0.161156,0.014861,0.111736,0.138865,0.188746,0.234917,0.253137,-0.29856,...,-0.482408,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122
3,-0.905498,0.185828,0.164303,0.014861,0.111736,0.138865,0.188746,0.234917,0.253137,-0.057491,...,0.032846,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713
4,-0.905498,0.185828,2.334029,-0.874991,0.111736,-0.696663,0.188746,0.234917,0.253137,-0.578618,...,-0.161189,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187


In [21]:
# Start by holding out about a third of our data for testing.
# random_state pins the shuffle so the split, and every number computed from it
# in later cells, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = splitter(
    data_scaled, target, test_size=0.33, random_state=42
)
# Sanity check: fraction of rows that ended up in the test split.
len(X_test) / len(data_scaled)

0.33

In [53]:
# Instantiate the classifier (presumably provided by the star import from knn_algorithm).
# NOTE(review): the positional 5 is presumably k, the number of neighbours -- confirm
# against KNNClassifier's signature.
knn_model = KNNClassifier(5)

In [54]:
# Fit on the training split only; the held-out rows are reserved for evaluation.
knn_model.fit(X_train,y_train)

In [55]:
# Score only the first 500 held-out rows, one row at a time
# (presumably to keep the run time manageable -- the full test split is larger).
eval_rows = X_test.iloc[:500]
predictions = [
    knn_model.predict_fast(eval_rows.iloc[pos])
    for pos in range(len(eval_rows))
]

In [56]:
# Pair each prediction with the true label of the same (first 500) test rows.
df_results = pd.DataFrame({
    'predicted': predictions,
    'actual': y_test.iloc[:500].tolist(),
})

In [57]:
# Absolute difference between predicted and actual labels.
# Careful with the column name: for 0/1 labels the value is 0 when the
# prediction matches and 1 when it does not, so 0 marks a correct prediction.
predicted_labels = df_results['predicted'].values.astype(int)
actual_labels = df_results['actual'].values.astype(int)
df_results['correct'] = np.abs(predicted_labels - actual_labels)

In [60]:
# Accuracy on the scored rows: the share of rows whose 'correct' value is 0
# (0 means predicted == actual). Taking the mean of the boolean mask avoids the
# KeyError that value_counts()[0] raises when no prediction is correct.
(df_results['correct'] == 0).mean()

0.792

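So without much work at all, the algorithm correctly classified almost 80% of the 500 unseen test rows we scored. Keep in mind, though, that about 78% of the customers in this data set did not default, so always predicting "no default" would score about the same. So, how did it do with the people who actually defaulted?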

In [67]:
# Split the results by the true label.
# Cast to int first so the masks work whether 'actual' holds the labels as
# strings ('0'/'1', as the original comparison assumed) or as integers;
# comparing an integer column against '1' would silently select nothing.
actual_as_int = df_results['actual'].astype(int)
df_defaulted = df_results[actual_as_int == 1]
df_not_defaulted = df_results[actual_as_int == 0]

In [68]:
# Outcome counts for customers who actually defaulted
# (0 = predicted correctly, 1 = missed).
df_defaulted['correct'].value_counts()

1    70
0    39
Name: correct, dtype: int64

In [69]:
# Outcome counts for customers who did not default
# (0 = predicted correctly, 1 = wrongly flagged as a default).
df_not_defaulted['correct'].value_counts()

0    357
1     34
Name: correct, dtype: int64

As expected, we were far more accurate at predicting the people who did not default (357 of 391 correct, about 91%) than the people who did (only 39 of 109, about 36%). In a real-world application, the credit card company would have to use techniques that force the model to detect the defaulted users more accurately (**increase recall**), even at the expense of total accuracy.