# Training a ramdom forest using different strategies for dealing with an imbalanced dataset

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

In [7]:
#Loading data

#Defining BASE_PATH
BASE_PATH = os.getenv("/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/", "/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/")

train_file_path = os.path.join(BASE_PATH, "X_train_data.csv")
y_train_file_path = os.path.join(BASE_PATH, "y_train_data.csv")
val_file_path = os.path.join(BASE_PATH, "X_val_data.csv")
y_val_path = os.path.join(BASE_PATH, "y_val_data.csv")

train_df = pd.read_csv(train_file_path)

y_train_df = pd.read_csv(y_train_file_path)
y_train = y_train_df['TARGET_LABEL_BAD']
y_train = y_train.to_numpy()

val_df = pd.read_csv(val_file_path)

y_valid_df = pd.read_csv(y_val_path)
y_valid = y_valid_df['TARGET_LABEL_BAD']
y_valid = y_valid.to_numpy()


In [8]:
# Train Random Forest with default parameters and class_weight='balanced'
rf_default = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')

# Fit the model
rf_default.fit(train_df, y_train)

# Predictions
y_pred_default = rf_default.predict(val_df)
y_prob_default = rf_default.predict_proba(val_df)[:, 1]

# Performance Metrics
print("\n=== Default Model Performance ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_default))
print("Precision:", precision_score(y_valid, y_pred_default))
print("Recall:", recall_score(y_valid, y_pred_default))
print("F1 Score:", f1_score(y_valid, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_default))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_default))

# Feature Importance
importances_default = rf_default.feature_importances_
feature_importance_default = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': importances_default
}).sort_values(by='Importance', ascending=False)

print("\n=== Feature Importance (Default Model) ===")
print(feature_importance_default)



=== Default Model Performance ===
Accuracy: 0.7312
Precision: 0.33783783783783783
Recall: 0.009384384384384385
F1 Score: 0.018261504747991233
ROC-AUC: 0.6039194694967596
Confusion Matrix:
 [[7287   49]
 [2639   25]]

=== Feature Importance (Default Model) ===
                      Feature    Importance
8     PERSONAL_MONTHLY_INCOME  5.755174e-02
3            RESIDENCIAL_CITY  5.133699e-02
20          RESIDENCIAL_ZIP_3  5.067589e-02
21         PROFESSIONAL_ZIP_3  5.045448e-02
2               CITY_OF_BIRTH  4.703033e-02
..                        ...           ...
229      PROFESSION_CODE_18.0  7.418909e-07
186  MONTHS_IN_RESIDENCE_69.0  6.496827e-07
178  MONTHS_IN_RESIDENCE_61.0  3.311942e-07
183  MONTHS_IN_RESIDENCE_66.0  1.628427e-08
194  MONTHS_IN_RESIDENCE_82.0  3.264430e-20

[326 rows x 2 columns]


In [9]:
# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(train_df, y_train)

# Train Random Forest with the SMOTE-balanced data
rf_smote = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')
rf_smote.fit(X_train_smote, y_train_smote)

# Predictions
y_pred_smote = rf_smote.predict(val_df)
y_prob_smote = rf_smote.predict_proba(val_df)[:, 1]

# Performance Metrics (SMOTE Model)
print("\n=== SMOTE Model Performance ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_smote))
print("Precision:", precision_score(y_valid, y_pred_smote))
print("Recall:", recall_score(y_valid, y_pred_smote))
print("F1 Score:", f1_score(y_valid, y_pred_smote))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_smote))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_smote))

# Feature Importance (SMOTE Model)
importances_smote = rf_smote.feature_importances_
feature_importance_smote = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': importances_smote
}).sort_values(by='Importance', ascending=False)

print("\n=== Feature Importance (SMOTE Model) ===")
print(feature_importance_smote)



=== SMOTE Model Performance ===
Accuracy: 0.7296
Precision: 0.44871794871794873
Recall: 0.0656906906906907
F1 Score: 0.114603798297315
ROC-AUC: 0.6063711271249439
Confusion Matrix:
 [[7121  215]
 [2489  175]]

=== Feature Importance (SMOTE Model) ===
                         Feature    Importance
4            RESIDENCIAL_BOROUGH  4.621150e-02
2                  CITY_OF_BIRTH  4.609565e-02
3               RESIDENCIAL_CITY  4.142927e-02
8        PERSONAL_MONTHLY_INCOME  4.101855e-02
6    RESIDENCIAL_PHONE_AREA_CODE  3.633150e-02
..                           ...           ...
188     MONTHS_IN_RESIDENCE_71.0  4.194912e-07
189     MONTHS_IN_RESIDENCE_72.0  3.831482e-07
194     MONTHS_IN_RESIDENCE_82.0  1.125270e-07
178     MONTHS_IN_RESIDENCE_61.0  9.451141e-08
187     MONTHS_IN_RESIDENCE_70.0  2.640031e-08

[326 rows x 2 columns]


In [10]:
# Apply Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(train_df, y_train)

# Train Random Forest with Undersampled data
rf_under = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')
rf_under.fit(X_train_under, y_train_under)

# Predictions
y_pred_under = rf_under.predict(val_df)
y_prob_under = rf_under.predict_proba(val_df)[:, 1]

# Performance Metrics (Undersampled Model)
print("\n=== Undersampled Model Performance ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_under))
print("Precision:", precision_score(y_valid, y_pred_under))
print("Recall:", recall_score(y_valid, y_pred_under))
print("F1 Score:", f1_score(y_valid, y_pred_under))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_under))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_under))

# Feature Importance (Undersampled Model)
importances_under = rf_under.feature_importances_
feature_importance_under = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': importances_under
}).sort_values(by='Importance', ascending=False)

print("\n=== Feature Importance (Undersampled Model) ===")
print(feature_importance_under)



=== Undersampled Model Performance ===
Accuracy: 0.5707
Precision: 0.3246501614639397
Recall: 0.566066066066066
F1 Score: 0.4126419482829388
ROC-AUC: 0.6044004064042232
Confusion Matrix:
 [[4199 3137]
 [1156 1508]]

=== Feature Importance (Undersampled Model) ===
                      Feature  Importance
8     PERSONAL_MONTHLY_INCOME    0.056145
3            RESIDENCIAL_CITY    0.050321
20          RESIDENCIAL_ZIP_3    0.050228
21         PROFESSIONAL_ZIP_3    0.049640
2               CITY_OF_BIRTH    0.046514
..                        ...         ...
197  MONTHS_IN_RESIDENCE_90.0    0.000000
180  MONTHS_IN_RESIDENCE_63.0    0.000000
229      PROFESSION_CODE_18.0    0.000000
175  MONTHS_IN_RESIDENCE_58.0    0.000000
176  MONTHS_IN_RESIDENCE_59.0    0.000000

[326 rows x 2 columns]


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define hyperparameter search space
param_dist = {
    'n_estimators': randint(50, 200),  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for best split
    'max_depth': randint(10, 50),  # Maximum depth of the tree
    'min_samples_split': randint(2, 10),  # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 10),  # Minimum samples required at leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used
}

# Perform Randomized Search with 5-fold cross-validation
random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'),
                                   param_distributions=param_dist, n_iter=50, cv=5, n_jobs=-1, random_state=42)

# Train the model using SMOTE data (for example)
random_search.fit(X_train_smote, y_train_smote)

# Best parameters from RandomizedSearchCV
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)

# Predictions (Tuned Model)
y_pred_tuned = random_search.best_estimator_.predict(val_df)
y_prob_tuned = random_search.best_estimator_.predict_proba(val_df)[:, 1]

# Performance Metrics (Tuned Model)
print("\n=== Tuned Model Performance ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_tuned))
print("Precision:", precision_score(y_valid, y_pred_tuned))
print("Recall:", recall_score(y_valid, y_pred_tuned))
print("F1 Score:", f1_score(y_valid, y_pred_tuned))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_tuned))

# Confusion Matrix (Tuned Model)
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_tuned))

# Feature Importance (Tuned Model)
importances_tuned = random_search.best_estimator_.feature_importances_
feature_importance_tuned = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': importances_tuned
}).sort_values(by='Importance', ascending=False)

print("\n=== Feature Importance (Tuned Model) ===")
print(feature_importance_tuned)


Best Parameters from RandomizedSearchCV: {'bootstrap': False, 'max_depth': 38, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 196}

=== Tuned Model Performance ===
Accuracy: 0.7245
Precision: 0.4077079107505071
Recall: 0.07545045045045046
F1 Score: 0.12733607855559076
ROC-AUC: 0.6128511622309333
Confusion Matrix:
 [[7044  292]
 [2463  201]]

=== Feature Importance (Tuned Model) ===
                         Feature  Importance
4            RESIDENCIAL_BOROUGH    0.054540
2                  CITY_OF_BIRTH    0.050331
6    RESIDENCIAL_PHONE_AREA_CODE    0.043597
1                            SEX    0.040814
3               RESIDENCIAL_CITY    0.038174
..                           ...         ...
177     MONTHS_IN_RESIDENCE_60.0    0.000000
176     MONTHS_IN_RESIDENCE_59.0    0.000000
175     MONTHS_IN_RESIDENCE_58.0    0.000000
198     MONTHS_IN_RESIDENCE_96.0    0.000000
178     MONTHS_IN_RESIDENCE_61.0    0.000000

[326 rows x 2 columns]


In [13]:
# Train the final model with best parameters
best_rf = random_search.best_estimator_

# Fit the final model on the full balanced dataset (e.g., SMOTE data)
best_rf.fit(X_train_smote, y_train_smote)

# Predictions (Final Model)
y_pred_final = best_rf.predict(val_df)
y_prob_final = best_rf.predict_proba(val_df)[:, 1]

# Performance Metrics (Final Model)
print("\n=== Final Model Performance ===")
print("Accuracy:", accuracy_score(y_valid, y_pred_final))
print("Precision:", precision_score(y_valid, y_pred_final))
print("Recall:", recall_score(y_valid, y_pred_final))
print("F1 Score:", f1_score(y_valid, y_pred_final))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_final))

# Confusion Matrix (Final Model)
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_final))

# Feature Importance (Final Model)
importances_final = best_rf.feature_importances_
feature_importance_final = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': importances_final
}).sort_values(by='Importance', ascending=False)

print("\n=== Feature Importance (Final Model) ===")
print(feature_importance_final)



=== Final Model Performance ===
Accuracy: 0.7245
Precision: 0.4077079107505071
Recall: 0.07545045045045046
F1 Score: 0.12733607855559076
ROC-AUC: 0.6128511622309333
Confusion Matrix:
 [[7044  292]
 [2463  201]]

=== Feature Importance (Final Model) ===
                         Feature  Importance
4            RESIDENCIAL_BOROUGH    0.054540
2                  CITY_OF_BIRTH    0.050331
6    RESIDENCIAL_PHONE_AREA_CODE    0.043597
1                            SEX    0.040814
3               RESIDENCIAL_CITY    0.038174
..                           ...         ...
177     MONTHS_IN_RESIDENCE_60.0    0.000000
176     MONTHS_IN_RESIDENCE_59.0    0.000000
175     MONTHS_IN_RESIDENCE_58.0    0.000000
198     MONTHS_IN_RESIDENCE_96.0    0.000000
178     MONTHS_IN_RESIDENCE_61.0    0.000000

[326 rows x 2 columns]
