In [None]:
import re

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_predict,
    cross_val_score,
    train_test_split,
)
# Mount Google Drive at /content/drive; the CSVs loaded in the next cell are read from under this path.
drive.mount('/content/drive')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the pre-merged train/test tables from the mounted Drive folder.
DATA_DIR = '/content/drive/My Drive/data/data_project_2'
merged_df = pd.read_csv(f'{DATA_DIR}/merged_df_train_raw.csv')
merged_df_test = pd.read_csv(f'{DATA_DIR}/merged_df_test_raw.csv')

# Features are everything except the label and the applicant id.
X = merged_df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = merged_df['TARGET']

# DATA CLEANING

# LGBM can handle NA values, so placeholder values are mapped to NaN rather than imputed.
# Each column is reassigned instead of calling `X[col].replace(..., inplace=True)`:
# that chained in-place form works on an intermediate object, raises a pandas
# FutureWarning, and no longer modifies X from pandas 3.0 onwards.
SENTINEL_VALUES = {
    'CODE_GENDER': 'XNA',             # XNA should be replaced with NA
    'DAYS_EMPLOYED': 365243,          # NA values were logged as 365243 in the data
    'NAME_FAMILY_STATUS': 'Unknown',  # Unknown should be replaced with NA
    'ORGANIZATION_TYPE': 'XNA',       # XNA should be replaced with NA
}
for col, sentinel in SENTINEL_VALUES.items():
    X[col] = X[col].replace(sentinel, np.nan)

# Rename columns for the LGBM model (it cannot handle JSON special characters in
# feature names): keep only letters, digits and underscores.
X = X.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Convert string columns to pandas categoricals before they are passed to LightGBM.
for col in X.select_dtypes(include='object').columns:
    X[col] = X[col].astype('category')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['CODE_GENDER'].replace('XNA', np.nan, inplace=True) # XNA should be replace with NA
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) # NA values were logged as 365243 in data
The behavior will change in pandas 3.0. This in

## LGBM Model

In [None]:
## Feature Engineering
# The ratio features below are built from the columns the basic model ranked as most
# important (the feature-importance code is reproduced in the final code chunk of
# this section). This is the final set of engineered features.
ratio_features = {
    'CREDIT_TO_ANNUITY_RATIO': X['AMT_CREDIT'] / X['AMT_ANNUITY'],
    'CREDIT_GOODS_RATIO': X['AMT_CREDIT'] / X['AMT_GOODS_PRICE'],
    'DEBT_TO_CREDIT_RATIO': X['AMT_CREDIT_SUM_DEBT_sum'] / X['AMT_CREDIT_SUM_sum'],
    'DEBT_INSTALMENT_RATIO': X['AMT_CREDIT_SUM_DEBT_sum'] / X['CNT_INSTALMENT_FUTURE_mean'],
    'AGE_EMPLOYED_RATIO': X['DAYS_EMPLOYED'] / X['DAYS_BIRTH'],
    'CAR_AGE_EMPLOYED_RATIO': X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED'],
    'CREDIT_AGE_RATIO': X['DAYS_CREDIT_max'] / X['DAYS_BIRTH'],
    'LATE_PAYMENT_RATIO': X['SK_DPD_DEF_mean_POS_CASH_balance'] / X['DAYS_CREDIT_max'],
    # Only this ratio pads its denominator with a small constant.
    'REFUSED_APPLICATION_RATIO': (
        X['NAME_CONTRACT_STATUS_Refused_sum_previous_application']
        / (X['NAME_CONTRACT_STATUS_Active_sum_POS_CASH_balance'] + 1e-5)
    ),
    'INCOME_TO_CREDIT_RATIO': X['AMT_INCOME_TOTAL'] / X['AMT_CREDIT'],
    'INCOME_TO_PAYMENT_RATIO': X['AMT_INCOME_TOTAL'] / X['AMT_ANNUITY'],
    'CREDIT_BUREAU_INQUIRIES_RATIO': X['AMT_REQ_CREDIT_BUREAU_QRT'] / X['AMT_CREDIT'],
    'PHONE_CHANGE_CREDIT_RATIO': X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_CREDIT_mean'],
    'DECISION_ENTRY_RATIO': X['DAYS_DECISION_max'] / X['DAYS_ENTRY_PAYMENT_max'],
    'INSTALMENT_TO_DEBT_RATIO': X['CNT_INSTALMENT_FUTURE_mean'] / X['AMT_CREDIT_SUM_DEBT_mean'],
}

# Append the ratios to a copy of X, then turn the +/-inf produced by zero
# denominators into NaN.
X_engineered = X.assign(**ratio_features).replace([np.inf, -np.inf], np.nan)

In [None]:
lgb_model = lgb.LGBMClassifier()

# Hyperparameter tuning: exhaustive search over a 3 x 3 x 3 x 3 grid
# (81 candidates, each scored with 5-fold cross-validation).
param_grid = {
    'num_leaves': [20, 31, 50], # Default 31
    'min_child_samples': [15, 20, 50], # Default 20
    'colsample_bytree': [0.5, 0.8, 1], # Default 1
    'reg_lambda': [10, 60, 80] # Default 0
}

grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc', # Used roc auc for the kaggle competition
    n_jobs=-1
)

grid_search.fit(X_engineered, y)

best_params = grid_search.best_params_
best_lambda = best_params['reg_lambda']
best_score = grid_search.best_score_

# Report every tuned hyperparameter: four were searched, so printing reg_lambda
# alone does not let a reader reproduce the winning configuration.
print(f"Best params: {best_params}")
print(f"Best lambda: {best_lambda}")
print(f"Best cross-validated ROC AUC score: {best_score:.4f}")

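**Best Params:** colsample_bytree = 0.27, min_child_samples = 55, num_leaves = 50, reg_lambda = 11 (note: apart from num_leaves, these values are not in the grid shown in the cell above, so they presumably come from a separate, finer search)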

**Best CV Roc Auc:** 0.784889

In [None]:
# Number of features before any pruning
print(X_engineered.shape[1])

# Refit one model on all rows using the best hyperparameters
lgb_model = lgb.LGBMClassifier(
    num_leaves=50,
    min_child_samples=55,
    colsample_bytree=0.27,
    reg_lambda=11,
)
lgb_model.fit(X_engineered, y)

# Pair each column name with the importance the fitted model reports for it
features = X_engineered.columns
feature_importances = lgb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Features with 0 importance are practically unused by the model, so drop them
used_mask = feature_importance_df['Importance'].gt(0)
important_features = feature_importance_df.loc[used_mask]
# To list the kept features ranked by importance, run:
# list(important_features.sort_values(by='Importance', ascending=False)['Feature'])

# The feature count drops sharply; these remaining features were reused for the other models
print(len(important_features))

640
[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 5.488579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84619
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 636
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
303


## Visuals

In [None]:
# Hold out 20% of the rows so train and validation AUC can be compared per iteration.
X_train, X_val, y_train, y_val = train_test_split(X_engineered, y, test_size=0.2, random_state=42)

# Using best model
lgb_model = lgb.LGBMClassifier(
    colsample_bytree=0.27,
    min_child_samples=55,
    num_leaves=50,
    reg_lambda=11,
    objective='binary',
    metric='auc',
    random_state=42
)

# Passing both sets as eval_set makes LightGBM record the AUC after every boosting iteration.
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric='auc'
)

# evals_result_ is keyed by eval-set name; 'training' and 'valid_1' are the keys this
# notebook uses for the train and validation entries of eval_set above.
auc_history = lgb_model.evals_result_

# Plot how the train and validation ROC-AUC change as more trees are fitted
# (for gradient boosting we expect a rise up to a point, then a plateau).
# `plt` comes from the notebook's import cell (matplotlib.pyplot).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(auc_history['training']['auc'], label='Train ROC-AUC')
ax.plot(auc_history['valid_1']['auc'], label='Validation ROC-AUC')
ax.set_xlabel('Boosting Iterations (Trees)')
ax.set_ylabel('ROC-AUC')
ax.set_title('ROC-AUC Score Over Iterations')
ax.legend()
ax.grid(True)
plt.show()

## Fairness

In [None]:
# Install fairlearn into the running environment, then import the three group-fairness
# ratio metrics used in the cells below.
# NOTE(review): the install is unpinned; confirm the installed release exposes
# `equal_opportunity_ratio`, and consider pinning the version for reproducibility.
!pip install fairlearn
from fairlearn.metrics import (
    demographic_parity_ratio,
    equalized_odds_ratio,
    equal_opportunity_ratio
)

In [None]:
# Work on a copy so the age columns added here never leak into X_engineered.
X_fair = X_engineered.copy()

# Age in years. Dividing by -365 flips the sign, so DAYS_BIRTH is presumably stored as
# a negative day count — TODO confirm against the data dictionary.
X_fair['AGE'] = X_engineered['DAYS_BIRTH'] / -365

# Grouping age so it can be used when calculating fairness metrics.
# With right=False the intervals are [18, 25), [25, 40), [40, 60) and [60, 100), so the
# labels are written to match those half-open bins (e.g. age 25.5 falls in '25-39').
# Ages outside [18, 100) get a missing AGE_GROUP.
bins = [18, 25, 40, 60, 100]
labels = ['18-24', '25-39', '40-59', '60+']
X_fair['AGE_GROUP'] = pd.cut(X_fair['AGE'], bins=bins, labels=labels, right=False)

In [None]:
lgb_model = lgb.LGBMClassifier(reg_lambda=60)

# Score every row with a model that did not see it during training (5-fold out-of-fold
# predictions). Predicting on the same rows the model was fitted on would base the
# fairness metrics on in-sample predictions, which overstate how well the model
# separates the classes.
y_pred_proba = cross_val_predict(lgb_model, X_fair, y, cv=5, method='predict_proba')[:, 1]

# cross_val_predict fits clones only, so fit the named estimator on all rows as well;
# `lgb_model` therefore remains a fitted model after this cell, as it was before.
lgb_model.fit(X_fair, y)

# This is the class that the model predicts: 0 = paid on time, 1 = not on time.
y_pred_class = (y_pred_proba > 0.5).astype(int)

In [None]:
# The three protected classes are age, gender and marital status.
# Cast each one to plain strings so every row carries a string group label
# (missing values turn into the string 'nan').
for protected_col in ('AGE_GROUP', 'CODE_GENDER', 'NAME_FAMILY_STATUS'):
    X_fair[protected_col] = X_fair[protected_col].astype(str)

In [None]:
# All three metrics are ratios, so a value of 1 is the ideal:
#   - demographic parity ratio = 1: every group has the same selection rate;
#   - equalized odds ratio = 1: every group has the same true-positive and false-positive rates;
#   - equal opportunity ratio = 1: every group has the same true-positive rate.
# (The `*_diff_*` variable names are historical; they hold ratios, not differences.)
sensitive_columns = ['AGE_GROUP', 'CODE_GENDER', 'NAME_FAMILY_STATUS']


def _ratio_by_attribute(metric):
    """Evaluate one fairlearn ratio metric per protected attribute, in `sensitive_columns` order."""
    return [
        metric(y, y_pred_class, sensitive_features=X_fair[column])
        for column in sensitive_columns
    ]


dp_diff_age, dp_diff_gender, dp_diff_family = _ratio_by_attribute(demographic_parity_ratio)
eo_diff_age, eo_diff_gender, eo_diff_family = _ratio_by_attribute(equalized_odds_ratio)
eo_opportunity_age, eo_opportunity_gender, eo_opportunity_family = _ratio_by_attribute(equal_opportunity_ratio)

# One row per protected attribute, one column per metric
data = {
    'Sensitive Feature': ['Age Group', 'Gender', 'Family Status'],
    'Demographic Parity Ratio': [dp_diff_age, dp_diff_gender, dp_diff_family],
    'Equalized Odds Ratio': [eo_diff_age, eo_diff_gender, eo_diff_family],
    'Equal Opportunity Ratio': [eo_opportunity_age, eo_opportunity_gender, eo_opportunity_family]
}
fairness_summary = pd.DataFrame(data)

# Show the table with every ratio rounded to three decimals
ratio_columns = ['Demographic Parity Ratio', 'Equalized Odds Ratio', 'Equal Opportunity Ratio']
fairness_summary.style.format(
    dict.fromkeys(ratio_columns, '{:.3f}')
).set_caption("Fairness Metrics Summary")

These scores are poor: for a fair model, every ratio should be close to 1.

In [None]:
# Check how much the protected classes affect the model scores.
# 'AGE' and 'AGE_GROUP' were only ever added to X_fair, never to X_engineered, so a
# strict drop of this list raises KeyError. errors='ignore' drops whichever of the
# protected columns are actually present.
protected_columns = ['AGE_GROUP', 'AGE', 'DAYS_BIRTH', 'CODE_GENDER', 'NAME_FAMILY_STATUS']
X_no_protected = X_engineered.drop(columns=protected_columns, errors='ignore')

# NOTE(review): AGE_EMPLOYED_RATIO and CREDIT_AGE_RATIO are computed from DAYS_BIRTH and
# are still in X_no_protected — decide whether these age-derived features should go too.
# Mean 5-fold ROC AUC without the protected columns (cross_val_score refits clones of lgb_model).
cross_val_score(lgb_model, X_no_protected, y, cv=5, scoring='roc_auc').mean()

Since removing the protected attributes did not reduce the cross-validated ROC AUC, our final fair model simply drops those attributes and is otherwise unchanged.