In [2]:
# import the proper libraries for this problem
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score
from collections import Counter

from sklearn.linear_model import LogisticRegression

In [3]:
# import data from data file
data = pd.read_csv("Data/default_of_credit_card_clients_csv.csv", skiprows = 1)
data

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
29998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [4]:
# check the null values and the dtypes for the dataset 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   ID                          30000 non-null  int64
 1   LIMIT_BAL                   30000 non-null  int64
 2   SEX                         30000 non-null  int64
 3   EDUCATION                   30000 non-null  int64
 4   MARRIAGE                    30000 non-null  int64
 5   AGE                         30000 non-null  int64
 6   PAY_0                       30000 non-null  int64
 7   PAY_2                       30000 non-null  int64
 8   PAY_3                       30000 non-null  int64
 9   PAY_4                       30000 non-null  int64
 10  PAY_5                       30000 non-null  int64
 11  PAY_6                       30000 non-null  int64
 12  BILL_AMT1                   30000 non-null  int64
 13  BILL_AMT2                   30000 non-null  int64
 14  BILL_A

In [5]:
# since there is no data preprocessing to be done create X, y varible for the train test split
X = data.drop(columns = "default payment next month")

In [6]:
# create y variable which will hold only the outcomes
y = data["default payment next month"].values.reshape(-1, 1).ravel()

In [7]:
# Since the data is not balanced we should use an over sampler to assist in balancing the data
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(sampling_strategy = 1.0, random_state = 78).fit_resample(
            X, y)

In [8]:
# use train test split to split data to test the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state = 1, stratify = y_resampled, train_size = .50)

In [9]:
X_train.shape

(23364, 24)

In [10]:
y_train.shape

(23364,)

In [11]:
Counter(y_train)

Counter({1: 11682, 0: 11682})

In [12]:
# Since the data is not balanced we should use an over sampler to assist in balancing the data
#from imblearn.over_sampling import SMOTE
#X_resampled, y_resampled = SMOTE(sampling_strategy = 1.0, random_state = 78).fit_resample(
#            X_train, y_train)

In [13]:
#Counter(y_test)

In [14]:
# Scale the data using the X_train and X_test
scaler = StandardScaler()
# fit the scaler to the train and test sets
data_scaler = scaler.fit(X_train, X_test)

In [15]:
# scale the training and testing data
X_train_scaled = data_scaler.transform(X_train)
X_test_scaled = data_scaler.transform(X_test)

In [16]:
X_train_scaled.shape

(23364, 24)

In [17]:
# create varible to hold the XGBclassifier to assist in the hyper parameter tunning
#xgbc = GradientBoostingClassifier()

In [18]:
# Build hyperparameter tuning to assist in getting the optimal parameters for the classifier
from sklearn.model_selection import GridSearchCV

In [19]:
# select the paramenters you would like to tune
# fill in the values you want to iterate over
# the more values the longer this will take to run
grid_search = {"solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               "max_iter":[100,150],
               "C":[0.01, 0.50, 1]
              }
              

In [20]:
classifier = LogisticRegression()
classifier

LogisticRegression()

In [21]:
#pass the grid serch varaible into the GridSearchCV 
param_search = GridSearchCV(estimator=classifier, param_grid = grid_search, scoring = "f1") #scoring = "roc_auc")

In [22]:
# fit the param_search to the training data
param_search.fit(X_train_scaled, y_train)



GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.5, 1], 'max_iter': [100, 150],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='f1')

In [23]:
param_search.best_params_

{'C': 0.01, 'max_iter': 100, 'solver': 'saga'}

In [24]:
# Build model
#model = GradientBoostingClassifier(n_estimators = 100, 
 #                                          learning_rate = 1, 
  #                                         max_depth = 2, 
   #                                        min_samples_leaf = 2, 
    #                                       min_samples_split = 4, 
     #                                      random_state = 78
      #                            )

In [25]:
classifier = LogisticRegression(C = 0.01, max_iter = 100, solver = "saga")
classifier

LogisticRegression(C=0.01, solver='saga')

In [26]:
# Fit model
classifier.fit(X_train_scaled, y_train)



LogisticRegression(C=0.01, solver='saga')

In [27]:
classifier.score(X_train_scaled, y_train)

0.7249186783085089

In [28]:
classifier.score(X_test_scaled, y_test)

0.719140558123609

In [29]:
# calculate the predictions
predictions = classifier.predict(X_test_scaled)

In [30]:
# Model Evaluation
# create a confusion matrix to easily read the data evaluation
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Defaulted", "Actual Non-Defaulted"], columns=["Predicted Default", "Predicted Non-Default"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [31]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Default,Predicted Non-Default
Actual Defaulted,8407,3275
Actual Non-Defaulted,3287,8395


Accuracy Score : 0.719140558123609
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.72      0.72     11682
           1       0.72      0.72      0.72     11682

    accuracy                           0.72     23364
   macro avg       0.72      0.72      0.72     23364
weighted avg       0.72      0.72      0.72     23364



In [32]:
classifier.score(X_train_scaled, y_train
                )

0.7249186783085089

In [33]:
classifier.score(X_test_scaled, y_test)

0.719140558123609

In [34]:
# 1 is defaulted, 0 is not defaulted 

In [35]:
# Use SVM Classification to try and obtain better results
from sklearn.svm import SVC

In [45]:
svc = SVC()

In [97]:
# SVC hyperparameter tunning
grid_search_svc = {"kernel":['linear', 'poly', 'rbf'],
               "max_iter":[100,150],
               "C":[10, 25, 50]
              }

In [98]:
param_search_svc = GridSearchCV(estimator=svc, param_grid = grid_search_svc, scoring = "f1")

In [99]:
param_search_svc.fit(X_train_scaled, y_train)



GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 25, 50], 'kernel': ['linear', 'poly', 'rbf'],
                         'max_iter': [100, 150]},
             scoring='f1')

In [100]:
param_search_svc.best_params_

{'C': 10, 'kernel': 'poly', 'max_iter': 150}

In [101]:
svc_2 = SVC(kernel = "linear", max_iter = 150, C = 10)

In [102]:
svc_2.fit(X_train_scaled, y_train)



SVC(C=10, kernel='linear', max_iter=150)

In [103]:
svc_predictions = svc_2.predict(X_test_scaled)

In [104]:
# Model Evaluation
# create a confusion matrix to easily read the data evaluation
svc_cm = confusion_matrix(y_test, svc_predictions)
svc_cm_df = pd.DataFrame(
    cm, index=["Actual Defaulted", "Actual Non-Defaulted"], columns=["Predicted Default", "Predicted Non-Default"]
)

# Calculating the accuracy score
svc_acc_score = accuracy_score(y_test, svc_predictions)

In [105]:
# Displaying results
print("Confusion Matrix")
display(svc_cm_df)
print(f"Accuracy Score : {svc_acc_score}")
print("Classification Report")
print(classification_report(y_test, svc_predictions))

Confusion Matrix


Unnamed: 0,Predicted Default,Predicted Non-Default
Actual Defaulted,8407,3275
Actual Non-Defaulted,3287,8395


Accuracy Score : 0.5352251326827598
Classification Report
              precision    recall  f1-score   support

           0       0.72      0.12      0.20     11682
           1       0.52      0.95      0.67     11682

    accuracy                           0.54     23364
   macro avg       0.62      0.54      0.44     23364
weighted avg       0.62      0.54      0.44     23364

