In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

# **Part 1 - Data Preprocessing**

**1.1 Importing the Dataset**

In [2]:
#https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
#https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
df = pd.read_csv('UCI_Credit_Card.csv', index_col=0)
df  #default.payment.next.month is the label where 1 = defaulted, 0 = paid

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29996,220000.0,1,3,1,39,0,0,0,0,0,0,188948.0,192815.0,208365.0,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29997,150000.0,1,3,2,43,-1,-1,-1,-1,0,0,1683.0,1828.0,3502.0,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29998,30000.0,1,2,2,37,4,3,2,-1,0,0,3565.0,3356.0,2758.0,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29999,80000.0,1,3,1,41,1,-1,0,0,0,-1,-1645.0,78379.0,76304.0,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


**1.2 Data Cleanup**

In [3]:
df['default.payment.next.month'].unique() #default.payment.next.month is the label where 1 = defaulted, 0 = paid

array([1, 0])

In [4]:
df.isnull().sum() #no null data which is nice

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

**1.3 Separating Inputs and Label**

In [5]:
X = df.iloc[:, :-1].values #take all rows and index columns from 0 to 23
y = df.iloc[:, -1].values #take all rows of column 24 which is the label

print(X)
print(y[0:10])

[[2.0000e+04 2.0000e+00 2.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [1.2000e+05 2.0000e+00 2.0000e+00 ... 1.0000e+03 0.0000e+00 2.0000e+03]
 [9.0000e+04 2.0000e+00 2.0000e+00 ... 1.0000e+03 1.0000e+03 5.0000e+03]
 ...
 [3.0000e+04 1.0000e+00 2.0000e+00 ... 4.2000e+03 2.0000e+03 3.1000e+03]
 [8.0000e+04 1.0000e+00 3.0000e+00 ... 1.9260e+03 5.2964e+04 1.8040e+03]
 [5.0000e+04 1.0000e+00 2.0000e+00 ... 1.0000e+03 1.0000e+03 1.0000e+03]]
[1 1 0 0 0 0 0 0 0 0]


**1.4 Train Test Split**

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.24)

X_train

array([[3.600e+05, 1.000e+00, 2.000e+00, ..., 5.036e+03, 5.022e+03,
        6.010e+03],
       [5.000e+04, 1.000e+00, 1.000e+00, ..., 1.000e+03, 2.320e+03,
        3.000e+03],
       [1.800e+05, 2.000e+00, 1.000e+00, ..., 2.280e+02, 1.060e+02,
        1.601e+03],
       ...,
       [2.400e+05, 2.000e+00, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.400e+05, 2.000e+00, 3.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [9.000e+04, 1.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

**1.5 Feature Scaling**

In [7]:
#so here we're just scaling all our values using the following formula: z = (x - u) / s
#x is the input, u is the mean, s is the standard deviation
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
#we do this because models normally perform better when numerical input variables are scaled to a standard range
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_test

array([[ 4.71039902,  0.81031819,  0.18785412, ...,  5.00710075,
        11.52754137,  1.13417582],
       [-0.44299548, -1.23408313,  1.45241682, ..., -0.18291293,
        -0.11863719, -0.01464715],
       [-0.51991182, -1.23408313, -1.07670859, ..., -0.11811633,
        -0.11863719, -0.12567135],
       ...,
       [-0.90449349, -1.23408313,  0.18785412, ..., -0.28010784,
        -0.24709572,  0.81864502],
       [-0.44299548, -1.23408313,  0.18785412, ...,  0.38081753,
        -0.20213524, -0.29220766],
       [-0.36607914, -1.23408313,  0.18785412, ..., -0.11811633,
        -0.11863719, -0.1534274 ]])

# **Part 2 - Training and Evaluating Each Model**

**2.1 Logistic Regression**

In [40]:
lr = LogisticRegression(max_iter=200)
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [41]:
pred_lr = lr.predict(X_test)
pred_lr

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
print(classification_report(y_test,pred_lr))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89      5617
           1       0.71      0.24      0.35      1583

    accuracy                           0.81      7200
   macro avg       0.76      0.60      0.62      7200
weighted avg       0.79      0.81      0.77      7200



In [43]:
print(confusion_matrix(y_test,pred_lr)) 

[[5461  156]
 [1209  374]]


**2.2 Random Forest**

In [12]:
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
pred_rf = rf.predict(X_test)

In [14]:
print(classification_report(y_test,pred_rf)) 

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      5617
           1       0.65      0.38      0.48      1583

    accuracy                           0.82      7200
   macro avg       0.75      0.66      0.69      7200
weighted avg       0.80      0.82      0.80      7200



In [15]:
print(confusion_matrix(y_test,pred_rf)) 

[[5298  319]
 [ 980  603]]


**2.3 Decision Tree**

In [16]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [17]:
pred_dt = dt.predict(X_test)

In [18]:
print(classification_report(y_test,pred_dt)) 

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      5617
           1       0.39      0.41      0.40      1583

    accuracy                           0.73      7200
   macro avg       0.61      0.61      0.61      7200
weighted avg       0.73      0.73      0.73      7200



In [19]:
print(confusion_matrix(y_test,pred_dt)) 

[[4580 1037]
 [ 927  656]]
