# LightGBM model training

In [10]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier
import os

In [6]:
#Loading data

#Defining BASE_PATH
BASE_PATH = os.getenv("/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/", "/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/")

train_file_path = os.path.join(BASE_PATH, "X_train_data.csv")
y_train_file_path = os.path.join(BASE_PATH, "y_train_data.csv")
val_file_path = os.path.join(BASE_PATH, "X_val_data.csv")
y_val_path = os.path.join(BASE_PATH, "y_val_data.csv")

train_df = pd.read_csv(train_file_path)

y_train_df = pd.read_csv(y_train_file_path)
y_train = y_train_df['TARGET_LABEL_BAD']
y_train = y_train.to_numpy()

val_df = pd.read_csv(val_file_path)

y_valid_df = pd.read_csv(y_val_path)
y_valid = y_valid_df['TARGET_LABEL_BAD']
y_valid = y_valid.to_numpy()


## LightGBM with default parameters

In [9]:
# Convert data into LightGBM Dataset format
dtrain = lgb.Dataset(train_df, label=y_train)
dvalid = lgb.Dataset(val_df, label=y_valid, reference=dtrain)

# Default parameters
params = {
    "objective": "binary",
    "metric": "binary_logloss"
}

# Train LightGBM model
model = lgb.train(params, dtrain, valid_sets=[dvalid], num_boost_round=100)

# Predictions
y_pred_proba = model.predict(val_df)
y_pred = (y_pred_proba > 0.5).astype(int)

# Performance Metrics
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))



[LightGBM] [Info] Number of positive: 10377, number of negative: 29623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259425 -> initscore=-1.048959
[LightGBM] [Info] Start training from score -1.048959
Confusion Matrix:
 [[7299   37]
 [2615   49]]

Accuracy: 0.7348

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.99      0.85      7336
           1       0.57      0.02      0.04      2664

    accuracy                           0.73     10000
   macro avg       0.65      0.51      0.44     10000
weighted avg       0.69      0.73      0.63     10000



In [12]:
# Hyperparameter Tuning

param_grid = {
    "num_leaves": [31, 50, 70],
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [50, 100, 200],
    "subsample": [0.6, 0.8, 1.0]
}

lgb_clf = LGBMClassifier(objective="binary")
grid_search = GridSearchCV(lgb_clf, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train_df, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1795
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 253
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259437 -> initscore=-1.048894
[LightGBM] [In

In [None]:
# Model with hyperparameter tunned

params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.1,
    "num_leaves": 70,
    "subsample": 0.6
}

# Train LightGBM model
model = lgb.train(params, dtrain, valid_sets=[dvalid], num_boost_round=50)

# Predictions
y_pred_proba = model.predict(val_df)
y_pred = (y_pred_proba > 0.5).astype(int)

# Performance Metrics
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))



[LightGBM] [Info] Number of positive: 10377, number of negative: 29623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259425 -> initscore=-1.048959
[LightGBM] [Info] Start training from score -1.048959
Confusion Matrix:
 [[7285   51]
 [2614   50]]

Accuracy: 0.7335

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.99      0.85      7336
           1       0.50      0.02      0.04      2664

    accuracy                           0.73     10000
   macro avg       0.62      0.51      0.44     10000
weighted avg       0.67      0.73      0.63     10000



In [14]:
# Get feature importance by gain
importance = model.feature_importance(importance_type="gain")
importance_df = pd.DataFrame({"Feature": train_df.columns, "Gain": importance}).sort_values(by="Gain", ascending=False)

print(importance_df.head(20))


                         Feature         Gain
8        PERSONAL_MONTHLY_INCOME  2967.990494
20             RESIDENCIAL_ZIP_3  2718.878163
3               RESIDENCIAL_CITY  2486.069126
2                  CITY_OF_BIRTH  1821.218102
4            RESIDENCIAL_BOROUGH  1661.793295
6    RESIDENCIAL_PHONE_AREA_CODE  1591.032301
5         FLAG_RESIDENCIAL_PHONE  1363.324978
231          OCCUPATION_TYPE_1.0   942.145323
85            MARITAL_STATUS_1.0   901.852809
81              PAYMENT_DAY_25.0   625.322204
78              PAYMENT_DAY_10.0   375.804722
1                            SEX   369.252720
18       FLAG_PROFESSIONAL_PHONE   358.949113
9                  OTHER_INCOMES   288.581582
242                     AGE_18.0   288.545039
113           RESIDENCE_TYPE_2.0   282.959112
80              PAYMENT_DAY_20.0   265.864781
11               FLAG_MASTERCARD   264.825831
86            MARITAL_STATUS_2.0   261.017850
77               PAYMENT_DAY_5.0   257.322113


## Training LightGBM with imbalanced data strategy

In [None]:
from collections import Counter

# Calculate scale_pos_weight
# Count class distribution
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1]  # Majority class (0) / Minority class (1)

# Define parameters with scale_pos_weight
best_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.1,
    "num_leaves": 70,
    "subsample": 0.6,
    "scale_pos_weight": scale_pos_weight  # Add scale_pos_weight to the parameters
}

# Convert data into LightGBM Dataset format
dtrain = lgb.Dataset(train_df, label=y_train)
dvalid = lgb.Dataset(val_df, label=y_valid, reference=dtrain)

# Step 4: Train LightGBM model
model = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dvalid],
    valid_names=["validation"],
    num_boost_round=50
)





[LightGBM] [Info] Number of positive: 10377, number of negative: 29623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259425 -> initscore=-1.048959
[LightGBM] [Info] Start training from score -1.048959


In [None]:
# Predictions
y_pred_proba = model.predict(val_df)
y_pred = (y_pred_proba > 0.5).astype(int)

# Step 6: Performance Metrics
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))


Confusion Matrix:
 [[4540 2796]
 [1112 1552]]

Accuracy: 0.6092

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.62      0.70      7336
           1       0.36      0.58      0.44      2664

    accuracy                           0.61     10000
   macro avg       0.58      0.60      0.57     10000
weighted avg       0.68      0.61      0.63     10000



In [None]:
# Feature Importance (using gain)
importance = model.feature_importance(importance_type="gain")
importance_df = pd.DataFrame({"Feature": train_df.columns, "Gain": importance}).sort_values(by="Gain", ascending=False)

print(importance_df.head(20))

                         Feature         Gain
5         FLAG_RESIDENCIAL_PHONE  2768.815482
3               RESIDENCIAL_CITY  2759.371950
20             RESIDENCIAL_ZIP_3  2453.099199
8        PERSONAL_MONTHLY_INCOME  2290.557033
6    RESIDENCIAL_PHONE_AREA_CODE  1889.793548
231          OCCUPATION_TYPE_1.0  1630.635595
2                  CITY_OF_BIRTH  1380.099294
85            MARITAL_STATUS_1.0  1273.687101
4            RESIDENCIAL_BOROUGH  1251.309690
81              PAYMENT_DAY_25.0   966.212801
78              PAYMENT_DAY_10.0   575.725992
18       FLAG_PROFESSIONAL_PHONE   531.175702
1                            SEX   528.858821
88            MARITAL_STATUS_4.0   472.157700
86            MARITAL_STATUS_2.0   456.110470
287                     AGE_63.0   426.455169
77               PAYMENT_DAY_5.0   414.280392
242                     AGE_18.0   396.731039
113           RESIDENCE_TYPE_2.0   372.256402
11               FLAG_MASTERCARD   370.898896


In [None]:
#Parameter tuning 
# Define the parameter grid for tuning
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 70, 100],
    'subsample': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100, 200],
    'scale_pos_weight': [scale_pos_weight]  # Add scale_pos_weight to the grid
}

# Create a LightGBM classifier
lgb_clf = lgb.LGBMClassifier(objective="binary", metric="binary_logloss")

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(lgb_clf, param_grid, scoring=make_scorer(accuracy_score), cv=5, n_jobs=-1, verbose=1)
grid_search.fit(train_df, y_train)

# Best Parameters
print("Best Parameters Found:", grid_search.best_params_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Number of positive: 8302, number of negative: 23698
[LightGBM] [Info] Number of positive: 8301, number of negative: 23699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1795
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 253
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016103

'# Step 6: Evaluate the Best Model\nbest_model = grid_search.best_estimator_\ny_pred = best_model.predict(val_df)\n\n# Performance Metrics\nconf_matrix = confusion_matrix(y_valid, y_pred)\nprint("Confusion Matrix:\n", conf_matrix)\nprint("\nAccuracy:", accuracy_score(y_valid, y_pred))\nprint("\nClassification Report:\n", classification_report(y_valid, y_pred))'

In [None]:
# Using best hyperparameters after tunning

from collections import Counter

# Calculate scale_pos_weight
# Count class distribution
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1]  # Majority class (0) / Minority class (1)

# Define the best parameters with scale_pos_weight
best_params = {
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.1,
    "num_leaves": 31,
    "subsample": 0.6,
    "scale_pos_weight": 2.8546786161703768  # Add scale_pos_weight to the parameters
}

# Convert data into LightGBM Dataset format
dtrain = lgb.Dataset(train_df, label=y_train)
dvalid = lgb.Dataset(val_df, label=y_valid, reference=dtrain)

# Train LightGBM model
model = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dvalid],
    valid_names=["validation"],
    num_boost_round=50
)

[LightGBM] [Info] Number of positive: 10377, number of negative: 29623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004539 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1807
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 256
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.259425 -> initscore=-1.048959
[LightGBM] [Info] Start training from score -1.048959


In [None]:
# Predictions
y_pred_proba = model.predict(val_df)
y_pred = (y_pred_proba > 0.5).astype(int)

# Performance Metrics
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))


Confusion Matrix:
 [[4275 3061]
 [1028 1636]]

Accuracy: 0.5911

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.58      0.68      7336
           1       0.35      0.61      0.44      2664

    accuracy                           0.59     10000
   macro avg       0.58      0.60      0.56     10000
weighted avg       0.68      0.59      0.61     10000



In [None]:
# Feature Importance (using gain)
importance = model.feature_importance(importance_type="gain")
importance_df = pd.DataFrame({"Feature": train_df.columns, "Gain": importance}).sort_values(by="Gain", ascending=False)

print(importance_df.head(20))

                         Feature         Gain
5         FLAG_RESIDENCIAL_PHONE  2768.815482
3               RESIDENCIAL_CITY  2759.371950
20             RESIDENCIAL_ZIP_3  2453.099199
8        PERSONAL_MONTHLY_INCOME  2290.557033
6    RESIDENCIAL_PHONE_AREA_CODE  1889.793548
231          OCCUPATION_TYPE_1.0  1630.635595
2                  CITY_OF_BIRTH  1380.099294
85            MARITAL_STATUS_1.0  1273.687101
4            RESIDENCIAL_BOROUGH  1251.309690
81              PAYMENT_DAY_25.0   966.212801
78              PAYMENT_DAY_10.0   575.725992
18       FLAG_PROFESSIONAL_PHONE   531.175702
1                            SEX   528.858821
88            MARITAL_STATUS_4.0   472.157700
86            MARITAL_STATUS_2.0   456.110470
287                     AGE_63.0   426.455169
77               PAYMENT_DAY_5.0   414.280392
242                     AGE_18.0   396.731039
113           RESIDENCE_TYPE_2.0   372.256402
11               FLAG_MASTERCARD   370.898896
