# XGBoost model training

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer
from sklearn.utils.class_weight import compute_class_weight
import os

In [None]:
# Load the pre-split training and validation sets from CSV files.

# Dataset directory: read from the BASE_PATH environment variable when it is
# set, otherwise fall back to the original local development path.
# (The variable *name* must be an identifier-like key, not the path itself;
# passing the path as the name meant the override could never take effect.)
BASE_PATH = os.getenv(
    "BASE_PATH",
    "/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/",
)

train_file_path = os.path.join(BASE_PATH, "X_train_data.csv")
y_train_file_path = os.path.join(BASE_PATH, "y_train_data.csv")
val_file_path = os.path.join(BASE_PATH, "X_val_data.csv")
y_val_path = os.path.join(BASE_PATH, "y_val_data.csv")

# Training features and labels (labels kept as a 1-D NumPy array).
train_df = pd.read_csv(train_file_path)

y_train_df = pd.read_csv(y_train_file_path)
y_train = y_train_df['TARGET_LABEL_BAD'].to_numpy()

# Validation features and labels, loaded the same way.
val_df = pd.read_csv(val_file_path)

y_valid_df = pd.read_csv(y_val_path)
y_valid = y_valid_df['TARGET_LABEL_BAD'].to_numpy()


XGBoost ships its own class for storing datasets, called DMatrix. It is highly optimized for both memory usage and speed, which is why converting datasets to this format is a requirement for XGBoost's native API.

## XGBoost with default parameters

In [38]:
# Train a baseline XGBoost model through the native API, before any
# hyperparameter tuning.

# Convert data to DMatrix format (required by xgb.train)
dtrain = xgb.DMatrix(train_df, label=y_train)
dvalid = xgb.DMatrix(val_df, label=y_valid)

# Booster parameters for a binary classifier evaluated with log loss
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1,
    "alpha": 0
}

# Train model. The validation set is listed last in `evals` because it is the
# one monitored for early stopping.
evals = [(dtrain, "train"), (dvalid, "valid")]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# Predictions: xgb.train returns the booster from the last round, so restrict
# prediction explicitly to the trees up to (and including) the best round.
y_pred_proba = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))
y_pred = (y_pred_proba > 0.5).astype(int)

# Performance metrics
# NOTE(review): the same validation set drives early stopping and is scored
# here, so these numbers are likely optimistic — confirm on a held-out test set.
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))

[0]	train-logloss:0.57025	valid-logloss:0.57770
[1]	train-logloss:0.56766	valid-logloss:0.57599
[2]	train-logloss:0.56556	valid-logloss:0.57466
[3]	train-logloss:0.56358	valid-logloss:0.57344
[4]	train-logloss:0.56166	valid-logloss:0.57255
[5]	train-logloss:0.56003	valid-logloss:0.57173
[6]	train-logloss:0.55860	valid-logloss:0.57105
[7]	train-logloss:0.55714	valid-logloss:0.57029
[8]	train-logloss:0.55590	valid-logloss:0.56975
[9]	train-logloss:0.55449	valid-logloss:0.56905
[10]	train-logloss:0.55330	valid-logloss:0.56846
[11]	train-logloss:0.55215	valid-logloss:0.56811
[12]	train-logloss:0.55114	valid-logloss:0.56763
[13]	train-logloss:0.55012	valid-logloss:0.56718
[14]	train-logloss:0.54903	valid-logloss:0.56652
[15]	train-logloss:0.54824	valid-logloss:0.56646
[16]	train-logloss:0.54747	valid-logloss:0.56601
[17]	train-logloss:0.54658	valid-logloss:0.56584
[18]	train-logloss:0.54572	valid-logloss:0.56553
[19]	train-logloss:0.54482	valid-logloss:0.56558
[20]	train-logloss:0.54414	val

In [39]:
# Hyperparameter tuning: exhaustive grid search over the scikit-learn wrapper.

# Candidate values for each tuned hyperparameter
param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [50, 100, 200],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

# Base estimator whose hyperparameters are searched
xgb_clf = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss")

# Stratified, shuffled 5-fold split with a fixed seed so folds are repeatable
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by mean cross-validated accuracy
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv_splitter,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(train_df, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}


In [40]:
# Retrain through the native API using the values selected by the grid search
# (learning_rate -> eta, max_depth, subsample, colsample_bytree).
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.05,
    "max_depth": 7,
    "subsample": 0.8,
    "colsample_bytree": 1.0,
    "lambda": 1,
    "alpha": 0
}

# Train model. The validation set is listed last in `evals` because it is the
# one monitored for early stopping.
evals = [(dtrain, "train"), (dvalid, "valid")]
model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)

# Predictions: xgb.train returns the booster from the last round, so restrict
# prediction explicitly to the trees up to (and including) the best round.
y_pred_proba = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))
y_pred = (y_pred_proba > 0.5).astype(int)

# Performance metrics
# NOTE(review): the same validation set drives early stopping and is scored
# here, so these numbers are likely optimistic — confirm on a held-out test set.
conf_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", conf_matrix)
print("\nAccuracy:", accuracy_score(y_valid, y_pred))
print("\nClassification Report:\n", classification_report(y_valid, y_pred))

[0]	train-logloss:0.57128	valid-logloss:0.57874
[1]	train-logloss:0.56951	valid-logloss:0.57767
[2]	train-logloss:0.56781	valid-logloss:0.57654
[3]	train-logloss:0.56616	valid-logloss:0.57554
[4]	train-logloss:0.56462	valid-logloss:0.57475
[5]	train-logloss:0.56319	valid-logloss:0.57397
[6]	train-logloss:0.56189	valid-logloss:0.57333
[7]	train-logloss:0.56062	valid-logloss:0.57272
[8]	train-logloss:0.55946	valid-logloss:0.57226
[9]	train-logloss:0.55828	valid-logloss:0.57167
[10]	train-logloss:0.55720	valid-logloss:0.57114
[11]	train-logloss:0.55601	valid-logloss:0.57070
[12]	train-logloss:0.55507	valid-logloss:0.57030
[13]	train-logloss:0.55406	valid-logloss:0.56993
[14]	train-logloss:0.55317	valid-logloss:0.56958
[15]	train-logloss:0.55226	valid-logloss:0.56925
[16]	train-logloss:0.55130	valid-logloss:0.56887
[17]	train-logloss:0.55024	valid-logloss:0.56852
[18]	train-logloss:0.54933	valid-logloss:0.56819
[19]	train-logloss:0.54854	valid-logloss:0.56793
[20]	train-logloss:0.54765	val

In [41]:
# Rank the features of the native-API booster by their "gain" importance score.
importance = model.get_score(importance_type="gain")

# One row per feature, highest gain first
importance_df = pd.DataFrame(list(importance.items()), columns=["Feature", "Gain"])
importance_df = importance_df.sort_values(by="Gain", ascending=False)

# Show the 20 highest-ranked features
print(importance_df.head(20))



                    Feature       Gain
135     OCCUPATION_TYPE_1.0  24.007029
5    FLAG_RESIDENCIAL_PHONE  19.475811
70         PAYMENT_DAY_25.0  16.645630
72       MARITAL_STATUS_1.0  15.721720
187                AGE_63.0  11.813228
142                AGE_18.0  11.207093
144                AGE_20.0  10.865468
154                AGE_30.0  10.059038
127     PROFESSION_CODE_8.0   9.959561
145                AGE_21.0   9.810191
195                AGE_71.0   9.788873
181                AGE_57.0   9.709737
198                AGE_78.0   9.684025
190                AGE_66.0   9.646227
90       RESIDENCE_TYPE_3.0   9.358445
192                AGE_68.0   9.228590
77       MARITAL_STATUS_6.0   9.227916
183                AGE_59.0   9.175991
182                AGE_58.0   9.065054
194                AGE_70.0   9.003184


## Training XGBoost with imbalanced data strategy

In [42]:
# Baseline for the class-imbalance experiment: an XGBClassifier with default
# hyperparameters and NO class weighting, used as the reference point for the
# weighted model tuned in the following cells.
# The target is binary, so the binary log-loss metric is configured
# ('mlogloss' is the multiclass variant and does not apply here).
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# Fit the model
xgb_model.fit(train_df, y_train)

# Evaluate initial performance on validation set:
# hard class predictions for the report / confusion matrix,
# positive-class probabilities for ROC-AUC.
y_pred = xgb_model.predict(val_df)
y_pred_prob = xgb_model.predict_proba(val_df)[:, 1]

# Metrics
print("Classification Report:")
print(classification_report(y_valid, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_valid, y_pred_prob))
print("Confusion Matrix:")
print(confusion_matrix(y_valid, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.96      0.84      7336
           1       0.43      0.08      0.13      2664

    accuracy                           0.73     10000
   macro avg       0.58      0.52      0.48     10000
weighted avg       0.66      0.73      0.65     10000

ROC-AUC Score: 0.6301169455988159
Confusion Matrix:
[[7062  274]
 [2461  203]]


In [43]:
# Define the hyperparameter grid and the class-imbalance correction.

# 'balanced' class weights are inversely proportional to class frequency, so
# the ratio weight[1] / weight[0] is the value passed as scale_pos_weight.
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
scale_pos_weight = class_weights[1] / class_weights[0]  # Calculate scale_pos_weight

# Hyperparameter grid for the search
params = {
    'scale_pos_weight': [scale_pos_weight],  # Single value: the computed class ratio
    'max_depth': [3, 5, 7],  # Maximum tree depth
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
    'n_estimators': [100, 200],  # Number of estimators
    'subsample': [0.8, 1.0],  # Fraction of samples used for each tree
    'colsample_bytree': [0.8, 1.0]  # Fraction of features used for each tree
}

# Configure GridSearchCV with stratified 5-fold cross-validation.
# scoring="roc_auc" ranks candidates on the classifier's predicted scores;
# wrapping roc_auc_score in make_scorer would instead score the hard 0/1
# predictions, which is not a ROC AUC over the ranking.
# eval_metric is the binary 'logloss' because the target has two classes.
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(eval_metric='logloss', random_state=42),
    param_grid=params,
    scoring="roc_auc",  # Scoring by ROC AUC (probability-based)
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    verbose=2,
    n_jobs=-1
)

In [37]:
# Run the cross-validated grid search on the training data.
grid_search.fit(train_df, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score:", grid_search.best_score_)

# Estimator exposed by the search as its best candidate
best_model = grid_search.best_estimator_

# Validation-set predictions: hard labels and positive-class probabilities
y_pred_best = best_model.predict(val_df)
y_pred_prob_best = best_model.predict_proba(val_df)[:, 1]

# Compute the validation metrics first, then print them in a fixed order.
best_report = classification_report(y_valid, y_pred_best)
best_auc = roc_auc_score(y_valid, y_pred_prob_best)
best_conf_matrix = confusion_matrix(y_valid, y_pred_best)

print("Classification Report for Best Model:")
print(best_report)
print("ROC-AUC Score for Best Model:", best_auc)
print("Confusion Matrix for Best Model:")
print(best_conf_matrix)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=0.8; total time=   2.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=0.8; total time=   2.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=0.8; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=1.0; total time=   2.4s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=0.8; total time=   2.5s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, scale_pos_weight=2.8546786161703768, subsample=1.0; total time=   2.4s
[CV] END colsample_b

In [48]:
# Gain-based feature importance of the best grid-search model, read from the
# booster underlying the scikit-learn wrapper.
importance = best_model.get_booster().get_score(importance_type="gain")

# Tabulate (feature, gain) pairs and order them from highest to lowest gain.
importance_df = (
    pd.DataFrame(importance.items(), columns=["Feature", "Gain"])
    .sort_values(by="Gain", ascending=False)
)

# Show the 40 highest-ranked features
print("\nFeature Importance (Gain):")
print(importance_df.head(40))



Feature Importance (Gain):
                         Feature       Gain
122          OCCUPATION_TYPE_1.0  87.293053
64            MARITAL_STATUS_1.0  71.281769
63              PAYMENT_DAY_25.0  63.121243
5         FLAG_RESIDENCIAL_PHONE  50.562473
60              PAYMENT_DAY_10.0  42.450478
67            MARITAL_STATUS_4.0  36.159863
167                     AGE_63.0  32.977535
80            RESIDENCE_TYPE_1.0  28.239454
15       FLAG_PROFESSIONAL_PHONE  25.816042
142                     AGE_31.0  25.569647
65            MARITAL_STATUS_2.0  24.818583
79               NACIONALITY_1.0  24.803543
129                     AGE_18.0  24.414759
13                    QUANT_CARS  24.050468
155                     AGE_48.0  23.812122
59               PAYMENT_DAY_5.0  23.791893
1                            SEX  23.218260
110   QUANT_BANKING_ACCOUNTS_1.0  23.143536
158                     AGE_53.0  22.792255
159                     AGE_55.0  22.432432
49          RESIDENCIAL_STATE_PB  22.306227
62  