In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('credit_risk_dataset.csv')

In [3]:
print(df.head())
print(df.info())
print(df.describe())
print(df.columns)

   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


In [4]:
print("Number of missing values (before filling):")
print(df.isnull().sum())

Number of missing values (before filling):
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [5]:
# Fill missing values in 'person_emp_length' with the median value of that column
df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)

# Fill missing values in 'loan_int_rate' with the median value of that column
df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)

print("Missing values after filling:")
print(df.isnull().sum()) # Re-checking to confirm no more missing values

Missing values after filling:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['person_emp_length'].fillna(df['person_emp_length'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_int_rate'].fillna(df['loan_int_rate'].median(), inplace=True)


In [7]:
# Create a list of categorical columns
categorical_cols = df.select_dtypes(include='object').columns.tolist()
print("Identified categorical columns:")
print(categorical_cols)

Identified categorical columns:
['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


In [8]:
# Apply One-Hot Encoding
# 'drop_first=True' is used to avoid multicollinearity
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("\nDataFrame head after One-Hot Encoding:")
print(df.head())

print("\nColumns after One-Hot Encoding:")
print(df.columns)


DataFrame head after One-Hot Encoding:
   person_age  person_income  person_emp_length  loan_amnt  loan_int_rate  \
0          22          59000              123.0      35000          16.02   
1          21           9600                5.0       1000          11.14   
2          25           9600                1.0       5500          12.87   
3          23          65500                4.0      35000          15.23   
4          24          54400                8.0      35000          14.27   

   loan_status  loan_percent_income  cb_person_cred_hist_length  \
0            1                 0.59                           3   
1            0                 0.10                           2   
2            1                 0.57                           3   
3            1                 0.53                           2   
4            1                 0.55                           4   

   person_home_ownership_OTHER  person_home_ownership_OWN  ...  \
0                        Fal

In [9]:
# 'loan_status' is our target variable (y)
y = df['loan_status']

# All other columns are features (X)
X = df.drop('loan_status', axis=1) # Drop 'loan_status' column from X

print("Features (X) head:")
print(X.head())
print("\nTarget Variable (y) head:")
print(y.head())

Features (X) head:
   person_age  person_income  person_emp_length  loan_amnt  loan_int_rate  \
0          22          59000              123.0      35000          16.02   
1          21           9600                5.0       1000          11.14   
2          25           9600                1.0       5500          12.87   
3          23          65500                4.0      35000          15.23   
4          24          54400                8.0      35000          14.27   

   loan_percent_income  cb_person_cred_hist_length  \
0                 0.59                           3   
1                 0.10                           2   
2                 0.57                           3   
3                 0.53                           2   
4                 0.55                           4   

   person_home_ownership_OTHER  person_home_ownership_OWN  \
0                        False                      False   
1                        False                       True   
2         

In [10]:
from sklearn.model_selection import train_test_split

# Spliting data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (26064, 22)
X_test shape: (6517, 22)
y_train shape: (26064,)
y_test shape: (6517,)


In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression model
# 'liblinear' solver is often good for smaller datasets
model = LogisticRegression(random_state=42, solver='liblinear')

# Training the model on the training data (X_train, y_train)
model.fit(X_train, y_train)

print("Logistic Regression Model trained successfully.")

Logistic Regression Model trained successfully.


In [12]:
# Make predictions on the X_test data using the trained model
y_pred = model.predict(X_test)

print("Predictions completed.")
print("First 10 predictions:", y_pred[:10]) # Display the first 10 predictions
print("Actual y_test values (first 10):", y_test.head(10).tolist()) # Display the first 10 actual values

Predictions completed.
First 10 predictions: [0 0 0 0 0 0 0 0 0 0]
Actual y_test values (first 10): [0, 0, 0, 1, 1, 0, 0, 0, 0, 0]


In [13]:
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

# Print Classification Report
# This includes Precision, Recall, and F1-Score
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate ROC-AUC Score
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC Score: {roc_auc:.4f}")

# Check Accuracy Score (for additional information)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.4f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89      5072
           1       0.76      0.24      0.36      1445

    accuracy                           0.81      6517
   macro avg       0.79      0.61      0.63      6517
weighted avg       0.81      0.81      0.77      6517


ROC-AUC Score: 0.7846
Accuracy Score: 0.8148


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

print("--- Decision Tree Classifier ---")

# Initialize Decision Tree Classifier model
# Use random_state=42 for consistent results
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model on the training data (X_train, y_train)
dt_model.fit(X_train, y_train)

print("Decision Tree Model trained successfully.")

# Make predictions on the X_test data
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
print("\nClassification Report (Decision Tree):")
print(classification_report(y_test, y_pred_dt))

# Calculate ROC-AUC Score
roc_auc_dt = roc_auc_score(y_test, dt_model.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC Score (Decision Tree): {roc_auc_dt:.4f}")

# Calculate Accuracy Score
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy Score (Decision Tree): {accuracy_dt:.4f}")

--- Decision Tree Classifier ---
Decision Tree Model trained successfully.

Classification Report (Decision Tree):
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      5072
           1       0.73      0.77      0.75      1445

    accuracy                           0.88      6517
   macro avg       0.83      0.84      0.84      6517
weighted avg       0.89      0.88      0.89      6517


ROC-AUC Score (Decision Tree): 0.8439
Accuracy Score (Decision Tree): 0.8845


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

print("\n--- Random Forest Classifier ---")

# Initialize Random Forest Classifier model
# n_estimators=100 means 100 Decision Trees will be used
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data (X_train, y_train)
rf_model.fit(X_train, y_train)

print("Random Forest Model trained successfully.")

# Make predictions on the X_test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

# Calculate ROC-AUC Score
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC Score (Random Forest): {roc_auc_rf:.4f}")

# Calculate Accuracy Score
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score (Random Forest): {accuracy_rf:.4f}")


--- Random Forest Classifier ---
Random Forest Model trained successfully.

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5072
           1       0.94      0.72      0.82      1445

    accuracy                           0.93      6517
   macro avg       0.93      0.85      0.89      6517
weighted avg       0.93      0.93      0.93      6517


ROC-AUC Score (Random Forest): 0.9342
Accuracy Score (Random Forest): 0.9286
