# MODEL TRAINING

## 1.1 Import data and required packages

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score,accuracy_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier



In [16]:
df=pd.read_csv("data\cleaned.csv")

In [17]:
df.shape

(307511, 74)

# Spliting data into traing and testing

Preparing X and y variable

In [18]:
X = df.drop('TARGET', axis=1)
y = df['TARGET']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y     # used because it will divide classes equally for training and testing datasets mean total 92 def and 8 non def 
                    # train will also contain same ratio as testing       
)

### identifying numeric features ONLY

In [19]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X_train.select_dtypes(include='object').columns

num = X_train.select_dtypes(include=['int64', 'float64'])
cat = X_train.select_dtypes(include='object')

# Data Scaling and Imputing with encoding 

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical pipeline
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine both
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ]
)

# Fit & transform
X_train_final = preprocessor.fit_transform(X_train)
X_test_final  = preprocessor.transform(X_test)

print(X_train_final.shape)
print(X_test_final.shape)

(230633, 190)
(76878, 190)


# Creating an evaluation function to check metrics

In [21]:
def evaluate_model(model, X_test_final, y_test ):
    y_pred=model.predict(X_test_final)
    y_proba = model.predict_proba(X_test_final)[:, 1]
    print(classification_report(y_test, y_pred))
    print("ROc:AUC :",roc_auc_score(y_test, y_proba))

In [22]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier()
}


In [33]:
from sklearn.metrics import roc_auc_score, recall_score

for name, model in models.items():
    model.fit(X_train_final, y_train)
    prob = model.predict_proba(X_test_final)[:,1]
    y_pred=model.predict(X_test_final)
    pred = (prob > 0.5).astype(int)

    print(
        name,
        "ROC-AUC:", roc_auc_score(y_test, prob),
        "Recall:", recall_score(y_test, pred)
    )


Logistic Regression ROC-AUC: 0.7479758564363757 Recall: 0.012568482114083145
Decision Tree ROC-AUC: 0.5419010463046308 Recall: 0.16935223976796648
Random Forest ROC-AUC: 0.714851214538123 Recall: 0.0016113438607798904
Gradient Boosting ROC-AUC: 0.7537112255107289 Recall: 0.012568482114083145
AdaBoost ROC-AUC: 0.7419987641226062 Recall: 0.006123106670963584
XGBoost ROC-AUC: 0.7476064628331884 Recall: 0.03931679020302933
[LightGBM] [Info] Number of positive: 18619, number of negative: 212014
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8667
[LightGBM] [Info] Number of data points in the train set: 230633, number of used features: 184
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080730 -> initscore=-2.432470
[LightGBM] [Info] Start training from score -2.432470




LightGBM ROC-AUC: 0.7571774433966676 Recall: 0.019336126329358685


Structural reasons LightGBM often wins on credit data
ðŸ”¹ LightGBM

Histogram-based splits

Leaf-wise tree growth

Handles sparse / one-hot data extremely well

Fast and memory-efficient

Designed for tabular + imbalanced data

Observations:

LightGBM has higher ROC-AUC

XGBoost has slightly higher recall at threshold = 0.5

But recall at 0.5 is not meaningful for imbalanced data.
ROC-AUC is.

 Ranking ability > default-threshold recall

# XG Boost Classifier

In [34]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Calculate scale_pos_weight for your imbalance
# ratio = number_of_negative_samples / number_of_positive_samples
scale_pos_weight = len(y_train[y_train==0]) / len(y_train[y_train==1])

xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,  # DIRECTLY HANDLES IMBALANCE
    random_state=42,
    n_jobs=-1,
    eval_metric='aucpr'  # Use PR-AUC for imbalanced data
)

# Tuning these parameters
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 1),  # Minimum loss reduction for split
    'reg_alpha': uniform(0, 1),  # L1 regularization
    'reg_lambda': uniform(1, 2)  # L2 regularization
}

In [None]:
random_search = RandomizedSearchCV( estimator=xgb, param_distributions=param_dist, n_iter=30, 
  verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train_final, y_train)
best_xgb = random_search.best_estimator_
print(random_search.best_params_)
print(random_search.best_score_)

0.8616416574671995


In [37]:
y_test_pred = best_xgb.predict(X_test_final)

# Probabilities (this is what actually matters)
y_test_prob = best_xgb.predict_proba(X_test_final)[:, 1]
evaluate_model(best_xgb, X_test_final,y_test)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92     70672
           1       0.22      0.32      0.26      6206

    accuracy                           0.85     76878
   macro avg       0.58      0.61      0.59     76878
weighted avg       0.88      0.85      0.87     76878

ROc:AUC : 0.7093270812620007
