# ML Models training

## This notebook trains several ML models and compares their performance metrics to select the best one.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import os
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

In [5]:
#Loading data

# Directory holding the train/validation CSV files.
# os.getenv's first argument is the NAME of an environment variable, not a path:
# export CREDIT_RISK_DATA_DIR to point the notebook at another location. When the
# variable is unset, the original local path below is used, so existing runs are unchanged.
BASE_PATH = os.getenv(
    "CREDIT_RISK_DATA_DIR",
    "/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-risk-analysis/dataset/",
)

train_file_path = os.path.join(BASE_PATH, "X_train_data.csv")
y_train_file_path = os.path.join(BASE_PATH, "y_train_data.csv")
val_file_path = os.path.join(BASE_PATH, "X_val_data.csv")
y_val_path = os.path.join(BASE_PATH, "y_val_data.csv")

# Training features and labels (labels are kept as a 1-D numpy array)
train_df = pd.read_csv(train_file_path)

y_train_df = pd.read_csv(y_train_file_path)
y_train = y_train_df['TARGET_LABEL_BAD'].to_numpy()

# Validation features and labels
val_df = pd.read_csv(val_file_path)

y_valid_df = pd.read_csv(y_val_path)
y_valid = y_valid_df['TARGET_LABEL_BAD'].to_numpy()

# Features and labels live in separate files: fail early if their rows do not line up
assert len(train_df) == len(y_train), "X_train_data.csv and y_train_data.csv have different row counts"
assert len(val_df) == len(y_valid), "X_val_data.csv and y_val_data.csv have different row counts"


In [6]:
#Training a set of common ML models used for binary classification looking for the best option

# Define models
# Every estimator that draws random numbers is seeded so a re-run gives the same table:
#  - liblinear shuffles the data, so LogisticRegression takes random_state
#  - SVC(probability=True) fits its probability estimates with an internal shuffled CV
# XGBClassifier no longer takes use_label_encoder: the installed XGBoost reports it as
# "not used", so passing it only produced a warning.
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, kernel='rbf', random_state=42),
    "MLP Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

# Function to train and evaluate models
def evaluate_models(models, X_train, y_train, X_val, y_val):
    """Fit every model on the training split and score it on the validation split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit``, ``predict`` and
        ``predict_proba``. Each estimator is fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_val, y_val
        Validation features and labels used for scoring.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the iteration order of ``models``, with columns
        ``Model``, ``Accuracy``, ``Precision``, ``Recall``, ``F1-score``, ``ROC-AUC``.
        An empty ``models`` dict yields an empty frame with those columns.
    """
    columns = ["Model", "Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
    results = []
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)

        # Hard labels for the threshold metrics, class-1 (BAD) probability for ROC-AUC
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]

        # zero_division=0 keeps the score at 0.0 (the value the default already
        # returns) but without an UndefinedMetricWarning when a model predicts
        # no positive samples at all.
        results.append([
            name,
            accuracy_score(y_val, y_pred),
            precision_score(y_val, y_pred, zero_division=0),
            recall_score(y_val, y_pred, zero_division=0),
            f1_score(y_val, y_pred, zero_division=0),
            roc_auc_score(y_val, y_proba),
        ])

    return pd.DataFrame(results, columns=columns)

# Fit and score every candidate model on the train / validation split
results_df = evaluate_models(
    models=models,
    X_train=train_df,
    y_train=y_train,
    X_val=val_df,
    y_val=y_valid,
)

# Show the comparison table ranked by F1-score, best first
print(results_df.sort_values("F1-score", ascending=False))


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 10377, number of negative: 29623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1679
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259425 -> initscore=-1.048959
[LightGBM] [Info] Start training from score -1.048959
                 Model  Accuracy  Precision    Recall  F1-score   ROC-AUC
4              XGBoost    0.7293   0.466872  0.113739  0.182916  0.639892
3   MLP Neural Network    0.7017   0.319774  0.106231  0.159482  0.587311
5             LightGBM    0.7354   0.561644  0.030781  0.058363  0.646520
1        Random Forest    0.7317   0.440252  0.026276  0.049593  0.630287
0  Logistic Regression    0.7330   0.416667  0.005631  0.011111  0.61047

In [None]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Compute the 'balanced' weight of each class from the training labels
classes = np.array([0, 1])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
print("Class Weights:", class_weights)

# Step 2: Define a baseline RandomForest model
# (it is only instantiated here; nothing in this block fits it)
rf_model = RandomForestClassifier(
    n_estimators=100,         # Number of trees in the forest
    class_weight='balanced',  # Automatically balance class weights
    random_state=42,
)

# Step 3: Hyperparameter tuning with cross-validation
params = {
    'class_weight': ['balanced', None],  # Balance class weights automatically or leave it unbalanced
    'max_depth': [3, 5, 7, 10],  # Max depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'n_estimators': [100, 200],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],  # Number of features to consider for the best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Perform GridSearchCV with 5-fold stratified cross-validation.
# scoring='roc_auc' is the built-in scorer that feeds the estimator's continuous
# scores to roc_auc_score. make_scorer(roc_auc_score) would instead score the hard
# 0/1 output of predict(), which is not a ranking-based AUC.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=params,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=2,
    n_jobs=-1
)

# Fit the model with cross-validation and grid search
grid_search.fit(train_df, y_train)

# Best parameters and the mean cross-validated ROC-AUC they achieved
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score:", grid_search.best_score_)

# Step 4: Evaluate the best model on validation set
best_model = grid_search.best_estimator_

# Hard labels and class-1 probabilities for the validation split
y_pred_best = best_model.predict(val_df)
y_pred_prob_best = best_model.predict_proba(val_df)[:, 1]

# Per-class precision / recall / F1
print("Classification Report for Best Model:", classification_report(y_valid, y_pred_best), sep="\n")

# Threshold-free ranking quality, computed from the probabilities
print("ROC-AUC Score for Best Model:", roc_auc_score(y_valid, y_pred_prob_best))

# Counts of true labels versus predicted labels
print("Confusion Matrix for Best Model:", confusion_matrix(y_valid, y_pred_best), sep="\n")

# Step 5: Rank features by importance (Gini importance of the fitted forest)
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]  # positions sorted from most to least important

# Names and scores in ranked order, computed once and shared by the listing and the plot
sorted_feature_names = train_df.columns[indices]
sorted_importances = importances[indices]

# Print out the feature importances
print("Feature Importances:")
for feature_name, importance in zip(sorted_feature_names, sorted_importances):
    print(f"{feature_name}: {importance}")

# Step 6: Plotting feature importances
# The figure height grows with the number of features so that each bar keeps a
# readable tick label; a fixed 10x6 canvas overlaps the labels on wide datasets.
n_features = len(sorted_feature_names)
plt.figure(figsize=(10, max(6, 0.25 * n_features)))
plt.title("Feature Importance")
plt.barh(range(n_features), sorted_importances, align="center")
plt.yticks(range(n_features), sorted_feature_names)
plt.gca().invert_yaxis()  # most important feature at the top
plt.xlabel("Feature Importance (Gini)")
plt.tight_layout()
plt.show()


# Using SMOTE (Synthetic Minority Over-sampling Technique)
### For balancing the dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, precision_recall_curve, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Oversample the minority class of the training split with SMOTE
# (the validation split is left untouched)
smote = SMOTE(random_state=42)
train_X_resampled, train_y_resampled = smote.fit_resample(train_df, y_train)

# Per-class sample counts before and after resampling
print("Original class distribution:", np.bincount(y_train))
print("Resampled class distribution:", np.bincount(train_y_resampled))

# Fit a Gradient Boosting Classifier on the resampled training data
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(train_X_resampled, train_y_resampled)

# Validation predictions: hard labels, then class-1 probabilities for ROC-AUC
y_pred = gb_clf.predict(val_df)
y_proba = gb_clf.predict_proba(val_df)[:, 1]

# Compute Evaluation Metrics
def evaluate_model(model_name, y_true, y_pred, y_proba):
    """Print the evaluation metrics of one set of predictions.

    Parameters
    ----------
    model_name : str
        Label used in the printed header.
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, used for accuracy, F1, the classification report
        and the confusion matrix.
    y_proba
        Scores passed to ``roc_auc_score``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Everything is computed before the first print, so nothing is emitted
    # if one of the metric calls raises.
    scalar_metrics = (
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("F1 Score", f1_score(y_true, y_pred)),
        ("ROC AUC", roc_auc_score(y_true, y_proba)),
    )
    text_report = classification_report(y_true, y_pred)
    conf_mat = confusion_matrix(y_true, y_pred)

    print(f"\n{model_name} Metrics:")
    for label, value in scalar_metrics:
        print(f"{label}: {value:.4f}")
    print("Classification Report:\n", text_report)
    print("Confusion Matrix:\n", conf_mat)

# Report validation metrics for the SMOTE-trained Gradient Boosting model
evaluate_model(
    model_name="Gradient Boosting (SMOTE)",
    y_true=y_valid,
    y_pred=y_pred,
    y_proba=y_proba,
)


### Gradient Boosting without SMOTE (for comparison)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Fit the same Gradient Boosting configuration on the original training data (no SMOTE)
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(train_df, y_train)

# Validation predictions: hard labels, then class-1 probabilities for ROC-AUC
y_pred = gb_clf.predict(val_df)
y_proba = gb_clf.predict_proba(val_df)[:, 1]

# Compute Evaluation Metrics
def evaluate_model(model_name, y_true, y_pred, y_proba):
    """Print the evaluation metrics of one set of predictions.

    Parameters
    ----------
    model_name : str
        Label used in the printed header.
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, used for accuracy, F1, the classification report
        and the confusion matrix.
    y_proba
        Scores passed to ``roc_auc_score``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Everything is computed before the first print, so nothing is emitted
    # if one of the metric calls raises.
    scalar_metrics = (
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("F1 Score", f1_score(y_true, y_pred)),
        ("ROC AUC", roc_auc_score(y_true, y_proba)),
    )
    text_report = classification_report(y_true, y_pred)
    conf_mat = confusion_matrix(y_true, y_pred)

    print(f"\n🔹 {model_name} Metrics:")
    for label, value in scalar_metrics:
        print(f"{label}: {value:.4f}")
    print("Classification Report:\n", text_report)
    print("Confusion Matrix:\n", conf_mat)

# Report validation metrics for the Gradient Boosting model trained without SMOTE
evaluate_model(
    model_name="Gradient Boosting (No SMOTE)",
    y_true=y_valid,
    y_pred=y_pred,
    y_proba=y_proba,
)
