In [2]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the CSV file into a Pandas DataFrame
credit_data_df = pd.read_csv(
    Path('cleaned_data_v2.csv')   
)

# Review the DataFrame
credit_data_df.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23,Scientist,19114.12,3,4,3,4,3,7,11.27,...,_,809.98,26.82,22.1,No,49.57,80.42,High_spent_Small_value_payments,312.49,Good
1,23,Scientist,19114.12,3,4,3,4,5,4,6.27,...,Good,809.98,31.38,22.3,No,49.57,199.46,Low_spent_Small_value_payments,223.45,Good
2,23,Scientist,19114.12,3,4,3,4,3,8,11.27,...,Good,809.98,22.54,22.6,No,49.57,178.34,Low_spent_Small_value_payments,244.57,Good
3,28,_______,34847.84,2,4,6,1,3,4,5.42,...,Good,605.03,24.46,26.6,No,18.82,104.29,Low_spent_Small_value_payments,470.69,Standard
4,28,Teacher,34847.84,2,4,6,1,7,1,7.42,...,Good,605.03,38.55,26.7,No,18.82,40.39,High_spent_Large_value_payments,484.59,Good


In [4]:
# Seperate the features, X,  from the target variable, y
y = credit_data_df['Credit_Score']
X = credit_data_df.drop(columns='Credit_Score')

In [5]:
# Preview the features data
X.head()

Unnamed: 0,Age,Occupation,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,23,Scientist,19114.12,3,4,3,4,3,7,11.27,4,_,809.98,26.82,22.1,No,49.57,80.42,High_spent_Small_value_payments,312.49
1,23,Scientist,19114.12,3,4,3,4,5,4,6.27,4,Good,809.98,31.38,22.3,No,49.57,199.46,Low_spent_Small_value_payments,223.45
2,23,Scientist,19114.12,3,4,3,4,3,8,11.27,4,Good,809.98,22.54,22.6,No,49.57,178.34,Low_spent_Small_value_payments,244.57
3,28,_______,34847.84,2,4,6,1,3,4,5.42,2,Good,605.03,24.46,26.6,No,18.82,104.29,Low_spent_Small_value_payments,470.69
4,28,Teacher,34847.84,2,4,6,1,7,1,7.42,2,Good,605.03,38.55,26.7,No,18.82,40.39,High_spent_Large_value_payments,484.59


In [6]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [7]:
# Review the features data
X.head()

Unnamed: 0,Age,Annual_Income,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,...,Credit_Mix_Standard,Credit_Mix__,Payment_of_Min_Amount_No,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Large_value_payments,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
0,23,19114.12,3,4,3,4,3,7,11.27,4,...,False,True,True,False,False,False,True,False,False,False
1,23,19114.12,3,4,3,4,5,4,6.27,4,...,False,False,True,False,False,False,False,False,False,True
2,23,19114.12,3,4,3,4,3,8,11.27,4,...,False,False,True,False,False,False,False,False,False,True
3,28,34847.84,2,4,6,1,3,4,5.42,2,...,False,False,True,False,False,False,False,False,False,True
4,28,34847.84,2,4,6,1,7,1,7.42,2,...,False,False,True,False,True,False,False,False,False,False


In [8]:
# Split the dataset using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Import the KNeighborsClassifier module from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the KNeighborsClassifier model with n_neighbors = 7 
knn = KNeighborsClassifier(n_neighbors=7)

In [29]:
# Train the model using the training data
knn.fit(X_train_scaled, y_train)

In [30]:
# Create predictions using the testing data
y_pred = knn.predict(X_test_scaled)

In [31]:
# Print the classification report comparing the testing data to the model predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Good       0.50      0.58      0.54      2225
        Poor       0.64      0.55      0.59      3458
    Standard       0.71      0.72      0.72      6698

    accuracy                           0.65     12381
   macro avg       0.62      0.62      0.62     12381
weighted avg       0.65      0.65      0.65     12381



In [32]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[1297,  384,  911],
       [ 141, 1905,  942],
       [ 787, 1169, 4845]])