In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.preprocessing import StandardScaler

In [80]:
# Read the processed loan table, then preview its first ten rows
# together with the number of columns it contains.
df = pd.read_csv('data/processed/loan.csv')
print(df.head(10), df.shape[1])

         ID  Client_Income  Car_Owned  Bike_Owned  Active_Loan  House_Own  \
0  12142509         6750.0          0           0            1          0   
1  12138936        20250.0          1           0            1          0   
2  12181264        18000.0          0           0            1          0   
3  12188929        15750.0          0           0            1          1   
4  12133385        33750.0          1           0            1          0   
5  12191614        11250.0          0           1            1          1   
6  12128086        15750.0          1           1            0          1   
7  12215264        13500.0          0           0            1          1   
8  12159147        13500.0          1           1            0          1   
9  12130547        12150.0          0           0            0          1   

   Child_Count  Credit_Amount  Loan_Contract_Type  Loan_Annuity  ...  \
0            0       61190.55                 1.0       3416.85  ...   
1       

In [None]:
# Feature standardizer; it is fitted on the training split and then reused
# (transform only) on the test split in a later cell.
scaler = StandardScaler()

In [101]:
# Model inputs are every column except the target ("Default") and the row identifier ("ID").
feature_cols = [name for name in df.columns if name not in ("Default", "ID")]
X = df[feature_cols]
Y = df["Default"]

In [102]:
# Hold out 30% of the rows for evaluation. The split is stratified on the target
# and seeded so that it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=42)

In [103]:
# Standardize the features: fit the scaler on the training rows only, then apply
# the same transform to the test rows.
#
# The raw values are always re-read from X (the unscaled feature frame) using the
# row labels of each split, instead of overwriting X_train / X_test from their own
# current contents. This keeps the cell idempotent: re-running it never re-fits the
# scaler on data that has already been standardized.
assert X.index.is_unique, "row labels must be unique to look the splits up in X"

train_raw = X.loc[X_train.index, feature_cols]
test_raw = X.loc[X_test.index, feature_cols]

# Rebuild fresh DataFrames (same row labels and column names) rather than
# assigning into the frames returned by train_test_split.
X_train = pd.DataFrame(
    scaler.fit_transform(train_raw),
    index=train_raw.index,
    columns=feature_cols,
)
X_test = pd.DataFrame(
    scaler.transform(test_raw),
    index=test_raw.index,
    columns=feature_cols,
)

In [104]:
# Classification: a 100-tree random forest with a fixed seed.
# class_weight='balanced' is requested because the target is imbalanced
# (far fewer rows with Default == 1 than with Default == 0).
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
# fit() is left as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [105]:
# Predicted class labels for the held-out test rows; reused by the metric cells below.
y_pred = model.predict(X_test)

In [106]:
# Overall accuracy on the held-out split, followed by the per-class
# precision / recall / f1 breakdown.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9279481357879475
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     33603
           1       0.95      0.11      0.20      2954

    accuracy                           0.93     36557
   macro avg       0.94      0.56      0.58     36557
weighted avg       0.93      0.93      0.90     36557



In [107]:
# Spot-check a single held-out row: model prediction vs. recorded label.
row_pos = 34
test_row = X_test.iloc[[row_pos]]  # list indexer keeps a one-row DataFrame for predict()
predicted_default = model.predict(test_row)[0]
print(f"Predicted default: {predicted_default}, Actual default: {y_test.iloc[row_pos]}")

Predicted default: 0, Actual default: 0


In [108]:
# Inspect the first five held-out rows whose true Default label is 1
# and compare the model's predictions with those labels.
is_positive = y_test == 1
positive_samples = X_test.loc[is_positive].iloc[:5]
preds = model.predict(positive_samples)

print(preds)
print(y_test.loc[positive_samples.index])

[1 0 0 0 0]
50227     1
118968    1
31439     1
37831     1
65044     1
Name: Default, dtype: int64


In [109]:
# Save the trained model (and the scaler it depends on) using joblib
import joblib
import os

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('data/models', exist_ok=True)

# The classifier was fit on StandardScaler-transformed features, so the fitted
# scaler is persisted next to it; without it the saved model cannot be applied
# correctly to raw (unscaled) inputs.
joblib.dump(scaler, 'data/models/scaler.joblib')

# Kept as the last expression so the cell still displays the model path.
joblib.dump(model, 'data/models/model.joblib')


['data/models/model.joblib']

In [114]:
# Record the trained RandomForestClassifier and its test accuracy in MLflow
import mlflow
import mlflow.sklearn

# Select the target experiment first, then open a run inside it.
experiment = mlflow.set_experiment("Loan_Default")

with mlflow.start_run(run_name="Loan_Default", experiment_id=experiment.experiment_id) as run:
    mlflow.sklearn.log_model(model, "random_forest_model")
    # Metric logged alongside the model artifact
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", acc)

2025/12/10 15:44:28 INFO mlflow.tracking.fluent: Experiment with name 'Loan_Default' does not exist. Creating a new experiment.


In [110]:
# Build a sample input record (column name -> value) from the first row of the processed data.
# NOTE(review): the record still contains 'ID' and 'Default', which are excluded from the
# model's feature columns; integer columns come out as floats (e.g. ID -> 12142509.0) because
# the row is taken as a single Series; and the values are unscaled, whereas the model was
# trained on standardized features — confirm the consumer of this dict handles all three.
dct = df.iloc[0].to_dict()

In [111]:
# Bare expression: the notebook renders the sample record as this cell's output.
dct

{'ID': 12142509.0,
 'Client_Income': 6750.0,
 'Car_Owned': 0.0,
 'Bike_Owned': 0.0,
 'Active_Loan': 1.0,
 'House_Own': 0.0,
 'Child_Count': 0.0,
 'Credit_Amount': 61190.55,
 'Loan_Contract_Type': 1.0,
 'Loan_Annuity': 3416.85,
 'Client_Marital_Status': 0.0,
 'Client_Housing_Type': 0.0,
 'Employed_Days': 1062.0,
 'Client_Occupation': 0.0,
 'Client_Family_Members': 2.0,
 'Type_Organization': 0.0,
 'Score_Source_1': 0.56806615,
 'Score_Source_2': 0.47878667,
 'Score_Source_3': 0.537069958,
 'Credit_Bureau': 0.0,
 'Default': 0.0}