Loading Essential Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

Cleaning the Dataset

In [None]:
# Load the raw credit-risk data, then discard incomplete rows and exact
# duplicate rows in a single chained expression.
df = (
    pd.read_csv('credit_risk.csv')
    .dropna()           # remove every row that contains at least one NaN
    .drop_duplicates()  # remove rows that repeat an earlier row
)

df.head()

Unnamed: 0,Age,Gender,Education Level,Marital Status,Income,Credit Score,Loan Amount,Loan Purpose,Employment Status,Years at Current Job,Payment History,Debt-to-Income Ratio,Assets Value,Number of Dependents,City,State,Country,Previous Defaults,Marital Status Change,Risk Rating
0,49,Male,PhD,Divorced,72799.0,688.0,45713.0,Business,Unemployed,19,Poor,0.154313,120228.0,0.0,Port Elizabeth,AS,Cyprus,2.0,2,Low
2,21,Non-binary,Master's,Single,55687.0,600.0,36623.0,Home,Employed,8,Fair,0.362398,180700.0,3.0,South Scott,OK,Luxembourg,3.0,2,Medium
3,59,Male,Bachelor's,Single,26508.0,622.0,26541.0,Personal,Unemployed,2,Excellent,0.454964,157319.0,3.0,Robinhaven,PR,Uganda,4.0,2,Medium
10,42,Non-binary,Master's,Single,116212.0,707.0,24771.0,Home,Employed,11,Excellent,0.114134,212198.0,3.0,Matthewborough,NH,French Guiana,0.0,2,Medium
16,55,Male,High School,Married,70978.0,706.0,36970.0,Personal,Unemployed,19,Excellent,0.266941,54041.0,3.0,Christophermouth,MO,Tonga,1.0,0,Medium


Encoding and Standardizing all Variables

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [None]:
import category_encoders as ce # for target encoding

x = df[df.columns[:-1]]  # feature variables: every column except the last one
y = df['Risk Rating']  # target variable

# Column groups; each group is routed to its own transformer below.
categorical = ["Gender", "Education Level", "Marital Status", "Loan Purpose",
               "Employment Status", "Payment History"]

high_cardinality = ["City", "State", "Country"]

numerical = ["Age", "Income", "Credit Score", "Loan Amount", "Years at Current Job",
             "Debt-to-Income Ratio", "Assets Value", "Number of Dependents",
             "Previous Defaults", "Marital Status Change"]

# Encode the target variable as integer class labels
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Column Transformer
# NOTE(review): the transformer is fitted on every row of `x`. If a held-out
# test set is later carved out of `x_processed_df`, the scaler statistics and
# the target encodings will already contain information from those test rows
# (data leakage). Consider splitting first and fitting on the training rows only.
# NOTE(review): TargetEncoder receives the label-encoded multi-class target and
# therefore treats the class ids as a numeric quantity - confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical),  # one-hot encoding for categorical columns
        ("high_card", ce.TargetEncoder(cols=high_cardinality), high_cardinality),
        # target encoding for high-cardinality columns
        ("num", StandardScaler(), numerical),  # standard scaling for numerical columns
    ],
    remainder="passthrough",  # keep any remaining columns as is (appended on the right)
    sparse_threshold=0,  # always return a dense array so it can be wrapped in a DataFrame
)

# Fit and transform the features using the preprocessor
x_processed = preprocessor.fit_transform(x, y_encoded)

# Retrieve the feature names for the transformed data
categorical_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical)
numerical_feature_names = numerical
high_cardinality_feature_names = high_cardinality  # target encoded, so they keep the original column names

# Columns not assigned to any transformer are passed through unchanged and
# placed after all transformed columns, in their original order.
assigned_columns = set(categorical) | set(high_cardinality) | set(numerical)
remainder_feature_names = [col for col in x.columns if col not in assigned_columns]

# Combine all the feature names into a final list.
# The order MUST mirror the order of the `transformers` list above
# (cat -> high_card -> num -> remainder); any other order attaches the wrong
# label to the numerical and target-encoded columns.
final_columns = (
    list(categorical_feature_names)
    + high_cardinality_feature_names
    + numerical_feature_names
    + remainder_feature_names
)

# Convert the transformed feature data into a labelled DataFrame
x_processed_df = pd.DataFrame(x_processed, columns=final_columns)

# Display the transformed DataFrame
x_processed_df.head()

Unnamed: 0,Gender_Female,Gender_Male,Gender_Non-binary,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,...,Loan Amount,Years at Current Job,Debt-to-Income Ratio,Assets Value,Number of Dependents,Previous Defaults,Marital Status Change,City,State,Country
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.367233,0.0895,-0.195727,1.398339,1.652247,-1.375542,-0.496597,-1.410156,-0.002328,1.247493
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-1.530506,-0.497597,-1.74352,0.697471,-0.247308,0.07572,0.253847,0.703599,0.697924,1.247493
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.044997,-1.498701,-1.356572,-0.079884,-1.283429,0.721313,-0.036306,0.703599,1.398175,1.247493
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,-0.107202,1.57896,0.138456,-0.216357,0.270752,-1.65577,0.64473,0.703599,-1.40283,1.247493
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.773891,0.027023,0.120868,0.724225,1.652247,-0.590035,-1.317964,0.703599,-0.702579,-1.217735


TRAIN/TEST SPLIT

In [None]:
# Split the data into training (80%) and test (20%) sets.
# stratify=y_encoded keeps the class proportions of the target the same in both
# splits, so minority risk classes are not under-represented in either set.
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df,
    y_encoded,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y_encoded,
)

# Display the shape of the train and test sets
print(f"Training set: {x_train.shape}, Test set: {x_test.shape}")

Training set: (4572, 35), Test set: (1144, 35)


CLASSIFICATION MODELS

In [None]:
# Support Vector Machines
from sklearn.svm import SVC

def evaluate_svm(x_train, y_train, x_test, y_test):
    """Fit an RBF-kernel support vector classifier and print test-set metrics.

    Args:
        x_train: Training features passed to ``SVC.fit``.
        y_train: Training labels passed to ``SVC.fit``.
        x_test: Features the fitted model predicts on.
        y_test: True labels the predictions are compared against.

    Returns:
        None. Accuracy, confusion matrix and classification report are printed.
    """
    svm = SVC(kernel='rbf', gamma='scale')
    svm.fit(x_train, y_train)
    pred = svm.predict(x_test)

    # Printing performance metrics
    print("Accuracy Score:", accuracy_score(y_test, pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
    # zero_division=0 reports 0.00 for a class that is never predicted without
    # emitting an UndefinedMetricWarning for each affected metric.
    print("Classification Report:\n",
          classification_report(y_test, pred, zero_division=0))


In [None]:
# Baseline SVM: trained on the original (not resampled) training split.
evaluate_svm(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

Accuracy Score: 0.6206293706293706
Confusion Matrix:
 [[  0 103   0]
 [  0 706   0]
 [  0 331   4]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       103
           1       0.62      1.00      0.76       706
           2       1.00      0.01      0.02       335

    accuracy                           0.62      1144
   macro avg       0.54      0.34      0.26      1144
weighted avg       0.68      0.62      0.48      1144



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

def evaluate_decision_tree(x_train, y_train, x_test, y_test, max_depth=None,
                           random_state=42):
    """Fit a decision tree classifier and print test-set metrics.

    Args:
        x_train: Training features passed to ``DecisionTreeClassifier.fit``.
        y_train: Training labels passed to ``DecisionTreeClassifier.fit``.
        x_test: Features the fitted model predicts on.
        y_test: True labels the predictions are compared against.
        max_depth: Forwarded to ``DecisionTreeClassifier``; ``None`` leaves the
            depth unlimited.
        random_state: Seed forwarded to ``DecisionTreeClassifier`` so repeated
            runs print the same metrics. Pass ``None`` for unseeded behaviour.

    Returns:
        None. Accuracy, confusion matrix and classification report are printed.
    """
    # Initialize the Decision Tree Classifier
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)

    # Train the model
    dt.fit(x_train, y_train)

    # Predict on the test set
    pred = dt.predict(x_test)

    # Print performance metrics
    print("Accuracy Score:", accuracy_score(y_test, pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
    # zero_division=0 avoids UndefinedMetricWarning when a class is never predicted.
    print("\nClassification Report:\n",
          classification_report(y_test, pred, zero_division=0))


In [None]:
# Baseline decision tree on the original training split, depth capped at 100.
evaluate_decision_tree(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    max_depth=100,
)

Accuracy Score: 0.8933566433566433

Confusion Matrix:
 [[ 84  17   2]
 [ 28 635  43]
 [  2  30 303]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.82      0.77       103
           1       0.93      0.90      0.91       706
           2       0.87      0.90      0.89       335

    accuracy                           0.89      1144
   macro avg       0.85      0.87      0.86      1144
weighted avg       0.90      0.89      0.89      1144



In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

def evaluate_random_forest(x_train, y_train, x_test, y_test, n_estimators=500,
                           random_state=42):
    """Fit a random forest classifier and print test-set metrics.

    Args:
        x_train: Training features passed to ``RandomForestClassifier.fit``.
        y_train: Training labels passed to ``RandomForestClassifier.fit``.
        x_test: Features the fitted model predicts on.
        y_test: True labels the predictions are compared against.
        n_estimators: Number of trees, forwarded to ``RandomForestClassifier``.
        random_state: Seed forwarded to ``RandomForestClassifier`` so repeated
            runs print the same metrics. Pass ``None`` for unseeded behaviour.

    Returns:
        None. Accuracy, confusion matrix and classification report are printed.
    """
    # Initialize the RandomForestClassifier
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    # Train the model
    rf.fit(x_train, y_train)

    # Predict on the test set
    pred = rf.predict(x_test)

    # Print performance metrics
    print("Accuracy Score:", accuracy_score(y_test, pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
    # zero_division=0 avoids UndefinedMetricWarning when a class is never predicted.
    print("\nClassification Report:\n",
          classification_report(y_test, pred, zero_division=0))

In [None]:
# Baseline random forest on the original training split (default number of trees).
evaluate_random_forest(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Accuracy Score: 0.9204545454545454

Confusion Matrix:
 [[ 75  28   0]
 [  1 696   9]
 [  0  53 282]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.73      0.84       103
           1       0.90      0.99      0.94       706
           2       0.97      0.84      0.90       335

    accuracy                           0.92      1144
   macro avg       0.95      0.85      0.89      1144
weighted avg       0.93      0.92      0.92      1144



Evaluating Models after Oversampling

In [None]:
#Oversampling the train data
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is not passed to SMOTE
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Check the size of the resampled training set ...
print(x_train_resampled.shape)
print(y_train_resampled.shape)
# ... and the number of samples in each class / risk rating after resampling
print(pd.Series(y_train_resampled).value_counts().sort_index())

(8154, 35)
(8154,)


In [None]:
# SVM trained on the SMOTE-resampled training data, scored on the untouched test split
evaluate_svm(
    x_train=x_train_resampled,
    y_train=y_train_resampled,
    x_test=x_test,
    y_test=y_test,
)

Accuracy Score: 0.6966783216783217
Confusion Matrix:
 [[ 19  83   1]
 [ 27 593  86]
 [  4 146 185]]
Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.18      0.25       103
           1       0.72      0.84      0.78       706
           2       0.68      0.55      0.61       335

    accuracy                           0.70      1144
   macro avg       0.59      0.53      0.54      1144
weighted avg       0.68      0.70      0.68      1144



In [None]:
# Decision tree trained on the SMOTE-resampled training data, scored on the untouched test split
evaluate_decision_tree(
    x_train=x_train_resampled,
    y_train=y_train_resampled,
    x_test=x_test,
    y_test=y_test,
    max_depth=100,
)

Accuracy Score: 0.8828671328671329

Confusion Matrix:
 [[ 87  13   3]
 [ 31 625  50]
 [  3  34 298]]

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.84      0.78       103
           1       0.93      0.89      0.91       706
           2       0.85      0.89      0.87       335

    accuracy                           0.88      1144
   macro avg       0.83      0.87      0.85      1144
weighted avg       0.89      0.88      0.88      1144



In [None]:
# Random forest trained on the SMOTE-resampled training data, scored on the untouched test split
evaluate_random_forest(
    x_train=x_train_resampled,
    y_train=y_train_resampled,
    x_test=x_test,
    y_test=y_test,
)

Accuracy Score: 0.9178321678321678

Confusion Matrix:
 [[ 84  19   0]
 [ 15 665  26]
 [  0  34 301]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.83       103
           1       0.93      0.94      0.93       706
           2       0.92      0.90      0.91       335

    accuracy                           0.92      1144
   macro avg       0.90      0.89      0.89      1144
weighted avg       0.92      0.92      0.92      1144

