In [None]:
# Mount Google Drive inside the Colab runtime so the files stored there can be
# read through the local filesystem.

from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
#Importing All our Dependencies

import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the raw training data from the mounted Drive folder and preview it.
# NOTE(review): the preview includes the Name and SSN columns - consider
# clearing this cell's output before sharing the notebook.

train_csv_path = "/content/drive/MyDrive/lastproject/resources/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,ID,CustomerID,Month,Name,Age,SSN,Occupation,AnnualIncome,MonthlyInhandSalary,NumBankAccounts,...,CreditMix,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,PaymentofMinAmount,TotalEMIpermonth,Amountinvestedmonthly,PaymentBehaviour,MonthlyBalance,CreditScore
0,0x1602,CUS0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.415295,HighspentSmallvaluepayments,312.494089,Good
1,0x1603,CUS0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.280222,LowspentLargevaluepayments,284.629162,Good
2,0x1604,CUS0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521,LowspentMediumvaluepayments,331.209863,Good
3,0x1605,CUS0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.458074,LowspentSmallvaluepayments,223.45131,Good
4,0x1606,CUS0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153,HighspentMediumvaluepayments,341.489231,Good


In [None]:
# Preprocessing: remove the columns this notebook does not use as model
# features, so the learning models are not fed "background noise".
# The raw frame `train_df` is left untouched; the result is a new frame.

columns_to_drop = [
    "ID",
    "CustomerID",
    "Month",
    "Name",
    "Age",
    "SSN",
    "Occupation",
    "TypeofLoan",
    "InterestRate",
    "ChangedCreditLimit",
    "MonthlyInhandSalary",
    "PaymentofMinAmount",
    "TotalEMIpermonth",
    "Amountinvestedmonthly",
    "PaymentBehaviour",
]

train_clean_df = train_df.drop(columns_to_drop, axis=1)
train_clean_df.head()


Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,CreditScore
0,19114.12,3,4,4,3,7.0,4.0,,809.98,26.82262,22 Years and 1 Months,312.494089,Good
1,19114.12,3,4,4,-1,,4.0,Good,809.98,31.94496,,284.629162,Good
2,19114.12,3,4,4,3,7.0,4.0,Good,809.98,28.609352,22 Years and 3 Months,331.209863,Good
3,19114.12,3,4,4,5,4.0,4.0,Good,809.98,31.377862,22 Years and 4 Months,223.45131,Good
4,19114.12,3,4,4,6,,4.0,Good,809.98,24.797347,22 Years and 5 Months,341.489231,Good


In [None]:
# Begin converting "CreditHistoryAge" (e.g. "22 Years and 1 Months") to a number.
# The leading run of digits is captured as the year count rather than slicing a
# fixed two characters: with a fixed slice a one-digit year keeps a trailing
# space and a three-digit value would be truncated. Missing values stay NaN and
# the months part is discarded. The column still holds strings after this cell;
# the numeric cast happens in a later cell.

train_clean_df['CreditHistoryAge'] = train_clean_df['CreditHistoryAge'].str.extract(r'^\s*(\d+)', expand=False)
train_clean_df

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,CreditScore
0,19114.12,3,4,4,3,7.0,4.0,,809.98,26.822620,22,312.494089,Good
1,19114.12,3,4,4,-1,,4.0,Good,809.98,31.944960,,284.629162,Good
2,19114.12,3,4,4,3,7.0,4.0,Good,809.98,28.609352,22,331.209863,Good
3,19114.12,3,4,4,5,4.0,4.0,Good,809.98,31.377862,22,223.451310,Good
4,19114.12,3,4,4,6,,4.0,Good,809.98,24.797347,22,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,39628.99,4,6,2,23,7.0,3.0,,502.38,34.663572,31,479.866228,Poor
99996,39628.99,4,6,2,18,7.0,3.0,,502.38,40.565631,31,496.651610,Poor
99997,39628.99,4,6,2,27,6.0,3.0,Good,502.38,41.255522,31,516.809083,Poor
99998,39628.99,4,6,2,20,,3.0,Good,502.38,33.638208,31,319.164979,Standard


In [None]:
# Handle missing values in two steps:
#   1. drop every row whose CreditMix is missing,
#   2. replace all remaining missing values with zero.

train_clean_df = train_clean_df.dropna(subset=['CreditMix']).fillna(0)
train_clean_df

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,CreditScore
1,19114.12,3,4,4,-1,0.0,4.0,Good,809.98,31.944960,0,284.629162,Good
2,19114.12,3,4,4,3,7.0,4.0,Good,809.98,28.609352,22,331.209863,Good
3,19114.12,3,4,4,5,4.0,4.0,Good,809.98,31.377862,22,223.451310,Good
4,19114.12,3,4,4,6,0.0,4.0,Good,809.98,24.797347,22,341.489231,Good
5,19114.12,3,4,4,8,4.0,4.0,Good,809.98,27.262259,22,340.479212,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99991,20002.88,10,8,5,33,25.0,9.0,Bad,3571.70,37.140784,6,337.362988,Standard
99993,39628.99,4,6,2,23,0.0,3.0,Good,502.38,29.135447,31,400.104466,Standard
99997,39628.99,4,6,2,27,6.0,3.0,Good,502.38,41.255522,31,516.809083,Poor
99998,39628.99,4,6,2,20,0.0,3.0,Good,502.38,33.638208,31,319.164979,Standard


In [None]:
# Cast CreditHistoryAge from strings to numbers; errors='coerce' turns any
# value that cannot be parsed into NaN instead of raising.
train_clean_df['CreditHistoryAge'] = pd.to_numeric(
    train_clean_df['CreditHistoryAge'],
    errors='coerce',
)

In [None]:
# Collapse the three CreditScore classes into a binary target:
# "Good" and "Standard" are merged, "Poor" is kept as its own class.
bin_mapping = dict(
    Good='Good/Standard',
    Standard='Good/Standard',
    Poor='Poor',
)

# Store the binned labels in a new "target" column, then remove the original
# CreditScore column. Any label missing from the mapping would become NaN.
train_clean_df['target'] = train_clean_df['CreditScore'].map(bin_mapping)
train_clean_df = train_clean_df.drop('CreditScore', axis=1)
train_clean_df

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,CreditMix,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,target
1,19114.12,3,4,4,-1,0.0,4.0,Good,809.98,31.944960,0,284.629162,Good/Standard
2,19114.12,3,4,4,3,7.0,4.0,Good,809.98,28.609352,22,331.209863,Good/Standard
3,19114.12,3,4,4,5,4.0,4.0,Good,809.98,31.377862,22,223.451310,Good/Standard
4,19114.12,3,4,4,6,0.0,4.0,Good,809.98,24.797347,22,341.489231,Good/Standard
5,19114.12,3,4,4,8,4.0,4.0,Good,809.98,27.262259,22,340.479212,Good/Standard
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99991,20002.88,10,8,5,33,25.0,9.0,Bad,3571.70,37.140784,6,337.362988,Good/Standard
99993,39628.99,4,6,2,23,0.0,3.0,Good,502.38,29.135447,31,400.104466,Good/Standard
99997,39628.99,4,6,2,27,6.0,3.0,Good,502.38,41.255522,31,516.809083,Poor
99998,39628.99,4,6,2,20,0.0,3.0,Good,502.38,33.638208,31,319.164979,Good/Standard


In [None]:
# Collapse CreditMix into two categories: "Good" and "Standard" both map to
# "Good", while "Bad" stays "Bad".
bin_map = dict(
    Good='Good',
    Standard='Good',
    Bad='Bad',
)

# Store the binned values in a new "CreditMixBin" column, then remove the
# original CreditMix column. Any value missing from the mapping would become NaN.
train_clean_df['CreditMixBin'] = train_clean_df['CreditMix'].map(bin_map)
train_clean_df = train_clean_df.drop('CreditMix', axis=1)
train_clean_df

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,target,CreditMixBin
1,19114.12,3,4,4,-1,0.0,4.0,809.98,31.944960,0,284.629162,Good/Standard,Good
2,19114.12,3,4,4,3,7.0,4.0,809.98,28.609352,22,331.209863,Good/Standard,Good
3,19114.12,3,4,4,5,4.0,4.0,809.98,31.377862,22,223.451310,Good/Standard,Good
4,19114.12,3,4,4,6,0.0,4.0,809.98,24.797347,22,341.489231,Good/Standard,Good
5,19114.12,3,4,4,8,4.0,4.0,809.98,27.262259,22,340.479212,Good/Standard,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99991,20002.88,10,8,5,33,25.0,9.0,3571.70,37.140784,6,337.362988,Good/Standard,Bad
99993,39628.99,4,6,2,23,0.0,3.0,502.38,29.135447,31,400.104466,Good/Standard,Good
99997,39628.99,4,6,2,27,6.0,3.0,502.38,41.255522,31,516.809083,Poor,Good
99998,39628.99,4,6,2,20,0.0,3.0,502.38,33.638208,31,319.164979,Good/Standard,Good


In [None]:
# Sanity check on the preprocessing above: list the dtype of every remaining
# column. Only "target" and "CreditMixBin" should still be non-numeric (object).

train_clean_df.dtypes

AnnualIncome              float64
NumBankAccounts             int64
NumCreditCard               int64
NumofLoan                   int64
Delayfromduedate            int64
NumofDelayedPayment       float64
NumCreditInquiries        float64
OutstandingDebt           float64
CreditUtilizationRatio    float64
CreditHistoryAge            int64
MonthlyBalance            float64
target                     object
CreditMixBin               object
dtype: object

In [None]:
# Separate the feature matrix (every column except "target") from the labels.
X = train_clean_df.drop(columns=['target'])
y = train_clean_df['target']

In [None]:
# Show the first five feature rows
X.head(5)

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,CreditMixBin
1,19114.12,3,4,4,-1,0.0,4.0,809.98,31.94496,0,284.629162,Good
2,19114.12,3,4,4,3,7.0,4.0,809.98,28.609352,22,331.209863,Good
3,19114.12,3,4,4,5,4.0,4.0,809.98,31.377862,22,223.45131,Good
4,19114.12,3,4,4,6,0.0,4.0,809.98,24.797347,22,341.489231,Good
5,19114.12,3,4,4,8,4.0,4.0,809.98,27.262259,22,340.479212,Good


In [None]:
# Show the first ten target labels
y.head(10)

1     Good/Standard
2     Good/Standard
3     Good/Standard
4     Good/Standard
5     Good/Standard
6     Good/Standard
7     Good/Standard
8     Good/Standard
9     Good/Standard
11    Good/Standard
Name: target, dtype: object

In [None]:
# One-hot encode the remaining categorical column(s); numeric columns pass
# through unchanged.
X = pd.get_dummies(data=X)
# Show the first rows of the encoded feature matrix
X.head(5)

Unnamed: 0,AnnualIncome,NumBankAccounts,NumCreditCard,NumofLoan,Delayfromduedate,NumofDelayedPayment,NumCreditInquiries,OutstandingDebt,CreditUtilizationRatio,CreditHistoryAge,MonthlyBalance,CreditMixBin_Bad,CreditMixBin_Good
1,19114.12,3,4,4,-1,0.0,4.0,809.98,31.94496,0,284.629162,0,1
2,19114.12,3,4,4,3,7.0,4.0,809.98,28.609352,22,331.209863,0,1
3,19114.12,3,4,4,5,4.0,4.0,809.98,31.377862,22,223.45131,0,1
4,19114.12,3,4,4,6,0.0,4.0,809.98,24.797347,22,341.489231,0,1
5,19114.12,3,4,4,8,4.0,4.0,809.98,27.262259,22,340.479212,0,1


In [None]:
# Hold out 25% of the rows (the scikit-learn default, spelled out here) for
# evaluation; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Scale the data
# The scaler is fitted on the training split only, then applied to both splits,
# so no statistics from the test rows leak into the transformation.
# StandardScaler is used through the name imported at the top of the notebook
# rather than via `skl.preprocessing`, which only resolves when some other
# import has already loaded that submodule.
# NOTE: the model cells in this notebook fit on X_train / X_test, not on these
# scaled arrays.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Random forest: 100 trees with a fixed seed, fitted on the unscaled training
# split. `fit` returns the estimator itself, so construction and fitting are
# chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8573075380914194

In [None]:
# Importing classification report, confusion matrix and accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Train a Decision Tree Classifier
# random_state is pinned so that re-running the notebook reproduces the same
# tree and the same score. A score recorded before the seed was pinned may
# differ slightly.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
# Overwrites `y_pred` from the previous model cell
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)


0.8117983159582999

In [None]:
# Fit an AdaBoost Model
# random_state is pinned so that re-running the notebook reproduces the same
# ensemble and the same score. A score recorded before the seed was pinned may
# differ slightly.
clf = AdaBoostClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Overwrites `y_pred` from the previous model cell
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7835304731355253

In [None]:
# Fit a gradient boost model
# random_state is pinned so that re-running the notebook reproduces the same
# ensemble and the same score. A score recorded before the seed was pinned may
# differ slightly.
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Overwrites `y_pred` from the previous model cell; the evaluation cells that
# follow read this `y_pred`.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8072373696872494

In [None]:
# Evaluate whatever predictions are currently held in `y_pred`, i.e. those of
# the most recently executed model cell (the gradient boost model when the
# notebook is run top to bottom).

# Overall accuracy
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#Assessing how our model performed
# Reports the metrics for the predictions in `y_pred`, using the `cm` and
# `acc_score` values computed in the previous cell.

print("Confusion Matrix")
# The matrix itself was previously never printed under its heading
print(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix
Accuracy Score : 0.8072373696872494
Classification Report
               precision    recall  f1-score   support

Good/Standard       0.84      0.90      0.87     14148
         Poor       0.70      0.58      0.64      5804

     accuracy                           0.81     19952
    macro avg       0.77      0.74      0.75     19952
 weighted avg       0.80      0.81      0.80     19952



In [None]:
# Get the feature importance array from the fitted random forest
importances = rf_model.feature_importances_
# Pair each importance with its feature name and sort, most important first
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# List up to the 15 most important features (all of them when there are fewer)
importances_sorted[:15]

[(0.23636865430945517, 'OutstandingDebt'),
 (0.09700193074513488, 'Delayfromduedate'),
 (0.09010244526785605, 'MonthlyBalance'),
 (0.08650588758496071, 'AnnualIncome'),
 (0.08584078274452349, 'CreditUtilizationRatio'),
 (0.0773892941735139, 'NumCreditInquiries'),
 (0.06548294167445956, 'CreditHistoryAge'),
 (0.05983831104978804, 'NumofDelayedPayment'),
 (0.05066756291455785, 'NumCreditCard'),
 (0.04872242360191823, 'NumBankAccounts'),
 (0.046035894211067144, 'NumofLoan'),
 (0.028690050959004895, 'CreditMixBin_Bad'),
 (0.027353820763759955, 'CreditMixBin_Good')]

In [None]:
train_clean_df.to_csv("data.csv", index=False)  # write the cleaned frame to data.csv in the working directory, without the index column


In [None]:
# Colab helper for sending a file from the runtime to the user's browser
from google.colab import files

# Download the data.csv file from the runtime's working directory
files.download("data.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>