<a href="https://colab.research.google.com/github/jessepoljak/Project_2_Credit_Default/blob/Erik/Project2_Loans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
!pip install ucimlrepo
!pip install imbalanced-learn


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [80]:
# Import dependencies
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import RandomizedSearchCV

In [23]:
# Loading the dataset from the UC Irvine repository
default_of_credit_card_clients = fetch_ucirepo(id=350)

In [24]:
# Extract features (X) and target labels (y) from the dataset
X = default_of_credit_card_clients.data.features
y = default_of_credit_card_clients.data.targets

# Combine the features and targets into a single DataFrame
df = pd.concat([X, y], axis=1)

# Rename columns for better readability
df = df.rename(columns={'X1' : 'LIMIT_BAL',	'X2' : 'SEX',	'X3' : 'EDUCATION',	'X4' : 'MARRIAGE',	'X5' : 'AGE',	'X6' : 'PAY_1',	'X7' : 'PAY_2',	'X8' : 'PAY_3',	'X9' : 'PAY_4',	'X10' : 'PAY_5',	'X11' : 'PAY_6',	'X12' : 'BILL_AMT1',	'X13' : 'BILL_AMT2',	'X14' : 'BILL_AMT3',	'X15' : 'BILL_AMT4',	'X16' : 'BILL_AMT5',	'X17' : 'BILL_AMT6',	'X18' : 'PAY_AMT1',	'X19' : 'PAY_AMT2',	'X20' : 'PAY_AMT3',	'X21' : 'PAY_AMT4',	'X22' : 'PAY_AMT5',	'X23' : 'PAY_AMT6',	'Y' : 'default payment next month'})

df = df.rename(columns={'default payment next month' : 'default_payment_next_month'})

df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [25]:
# Drop protected data
df.drop(columns = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE'], axis = 1, inplace =True)
df.head()

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,20000,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [26]:
# Add const for VIF
df = sm.add_constant(df)

In [27]:
# Make X and Y dataframes
X = df.copy()
X.drop('default_payment_next_month', axis = 1, inplace = True)
y = df['default_payment_next_month']

In [28]:
# Check for multicollinearity
# Create an empty DataFrame to store VIF values
vif_data = pd.DataFrame()

# Assign column names of feature matrix (X) to a "Variable" column
vif_data["Variable"] = X.columns

# Calculate VIF for each feature in the dataset
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

     Variable        VIF
0       const   2.905049
1   LIMIT_BAL   1.455471
2       PAY_1   1.919362
3       PAY_2   3.170910
4       PAY_3   3.656754
5       PAY_4   4.286599
6       PAY_5   4.723479
7       PAY_6   3.254998
8   BILL_AMT1  14.016253
9   BILL_AMT2  25.861623
10  BILL_AMT3  21.769360
11  BILL_AMT4  20.345385
12  BILL_AMT5  24.990437
13  BILL_AMT6  15.024144
14   PAY_AMT1   1.707834
15   PAY_AMT2   2.236767
16   PAY_AMT3   1.756443
17   PAY_AMT4   1.648366
18   PAY_AMT5   1.687470
19   PAY_AMT6   1.169698


In [29]:
# Drop the bill_amts
X.drop(columns = ['BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6'], axis = 1, inplace =True)
X.head()

Unnamed: 0,const,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,1.0,20000,2,2,-1,-1,-2,-2,3913,0,689,0,0,0,0
1,1.0,120000,-1,2,0,0,0,2,2682,0,1000,1000,1000,0,2000
2,1.0,90000,0,0,0,0,0,0,29239,1518,1500,1000,1000,1000,5000
3,1.0,50000,0,0,0,0,0,0,46990,2000,2019,1200,1100,1069,1000
4,1.0,50000,-1,0,-1,0,0,0,8617,2000,36681,10000,9000,689,679


In [30]:
# Check again for multicollinearity
# Create an empty DataFrame to store VIF values
vif_data = pd.DataFrame()

# Assign column names of feature matrix (X) to a "Variable" column
vif_data["Variable"] = X.columns

# Calculate VIF for each feature in the dataset
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

     Variable       VIF
0       const  2.902471
1   LIMIT_BAL  1.432972
2       PAY_1  1.918190
3       PAY_2  3.163298
4       PAY_3  3.634330
5       PAY_4  4.273279
6       PAY_5  4.699473
7       PAY_6  3.188339
8   BILL_AMT1  1.322268
9    PAY_AMT1  1.201367
10   PAY_AMT2  1.179960
11   PAY_AMT3  1.190324
12   PAY_AMT4  1.145153
13   PAY_AMT5  1.108730
14   PAY_AMT6  1.113887


In [31]:
# Drop the const feature
X.drop(columns = ['const'], axis = 1, inplace =True)
X.head()

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,20000,2,2,-1,-1,-2,-2,3913,0,689,0,0,0,0
1,120000,-1,2,0,0,0,2,2682,0,1000,1000,1000,0,2000
2,90000,0,0,0,0,0,0,29239,1518,1500,1000,1000,1000,5000
3,50000,0,0,0,0,0,0,46990,2000,2019,1200,1100,1069,1000
4,50000,-1,0,-1,0,0,0,8617,2000,36681,10000,9000,689,679


In [10]:
# Check for imbalance
y.value_counts()

Unnamed: 0_level_0,count
default_payment_next_month,Unnamed: 1_level_1
0,23364
1,6636


In [11]:
# Train, test, split the data
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [12]:
# Scale the data
data_to_scale = ['LIMIT_BAL',
                'BILL_AMT1',
                'PAY_AMT1',
                'PAY_AMT2',
                'PAY_AMT3',
                'PAY_AMT4',
                'PAY_AMT5',
                'PAY_AMT6',]
# Initialize scaler
scaler = StandardScaler()
# Scale the features on training data
X_train[data_to_scale] = scaler.fit_transform(X_train[data_to_scale])

# Initialize scaler
scaler = StandardScaler()
# Scale the features on test data
X_test[data_to_scale] = scaler.fit_transform(X_test[data_to_scale])

In [13]:
# Balance the data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
y_train_res.value_counts()

Unnamed: 0_level_0,count
default_payment_next_month,Unnamed: 1_level_1
1,17508
0,17508


In [87]:
# Check Variance score for the random forest model
scores = cross_val_score(RandomForestClassifier(), X_train_res, y_train_res, cv=5, scoring='neg_mean_squared_error')

variance = np.var(-scores)
print(variance)

0.0010732003141995885


In [61]:
# Train Random Forest before parameter tuning
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

In [62]:
# Make predictions
y_pred = clf.predict(X_test)
# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.83      0.85      5856
           1       0.47      0.53      0.50      1644

    accuracy                           0.76      7500
   macro avg       0.66      0.68      0.67      7500
weighted avg       0.78      0.76      0.77      7500



In [54]:
# Hyperparameter Tuning using RandomizedSearchCV
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9]
}

In [58]:
# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), param_distributions=param_grid, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

# Fit the model
random_search.fit(X_train_res, y_train_res)

# Best hyperparameters
print("Best Parameters:", random_search.best_params_)

# Best model
best_model = random_search.best_estimator_

Best Parameters: {'n_estimators': 25, 'max_leaf_nodes': 9, 'max_features': 'sqrt', 'max_depth': 9}


In [59]:
# Train Random Forest with parameter tuning
clf = RandomForestClassifier(n_estimators= 25, max_leaf_nodes= 9, max_features= 'sqrt', max_depth= 9, random_state= 42)
clf.fit(X_train_res, y_train_res)

In [60]:
# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85      5856
           1       0.50      0.59      0.54      1644

    accuracy                           0.78      7500
   macro avg       0.69      0.71      0.70      7500
weighted avg       0.79      0.78      0.78      7500

