In [29]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform, loguniform

import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

## set up pyspark session

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/12 09:30:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## set up config

In [3]:
# set up config
model_train_date_str = "2024-09-01"
train_test_period_months = 12
oot_period_months = 2
train_test_ratio = 0.8

config = {}
config["model_train_date_str"] = model_train_date_str
config["train_test_period_months"] = train_test_period_months
config["oot_period_months"] =  oot_period_months
config["model_train_date"] =  datetime.strptime(model_train_date_str, "%Y-%m-%d")
config["oot_end_date"] =  config['model_train_date'] - timedelta(days = 1)
config["oot_start_date"] =  config['model_train_date'] - relativedelta(months = oot_period_months)
config["train_test_end_date"] =  config["oot_start_date"] - timedelta(days = 1)
config["train_test_start_date"] =  config["oot_start_date"] - relativedelta(months = train_test_period_months)
config["train_test_ratio"] = train_test_ratio 


pprint.pprint(config)

{'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
 'model_train_date_str': '2024-09-01',
 'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
 'oot_period_months': 2,
 'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
 'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
 'train_test_period_months': 12,
 'train_test_ratio': 0.8,
 'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)}


## get label store

In [4]:
# connect to label store
folder_path = "datamart/gold/label_store/"
files_list = [folder_path+os.path.basename(f) for f in glob.glob(os.path.join(folder_path, '*'))]
label_store_sdf = spark.read.option("header", "true").parquet(*files_list)
print("row_count:",label_store_sdf.count())

label_store_sdf.show()

row_count: 8974
+--------------------+-----------+-----+----------+-------------+
|             loan_id|Customer_ID|label| label_def|snapshot_date|
+--------------------+-----------+-----+----------+-------------+
|CUS_0x1037_2023_0...| CUS_0x1037|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1069_2023_0...| CUS_0x1069|    0|30dpd_6mob|   2023-07-01|
|CUS_0x114a_2023_0...| CUS_0x114a|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1184_2023_0...| CUS_0x1184|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1297_2023_0...| CUS_0x1297|    1|30dpd_6mob|   2023-07-01|
|CUS_0x12fb_2023_0...| CUS_0x12fb|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1325_2023_0...| CUS_0x1325|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1341_2023_0...| CUS_0x1341|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1375_2023_0...| CUS_0x1375|    1|30dpd_6mob|   2023-07-01|
|CUS_0x13a8_2023_0...| CUS_0x13a8|    0|30dpd_6mob|   2023-07-01|
|CUS_0x13ef_2023_0...| CUS_0x13ef|    0|30dpd_6mob|   2023-07-01|
|CUS_0x1440_2023_0...| CUS_0x1440|    0|30dpd_6mob|   2023-0

In [5]:
# extract label store
labels_sdf = label_store_sdf.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))

print("extracted labels_sdf", labels_sdf.count(), config["train_test_start_date"], config["oot_end_date"])

extracted labels_sdf 6961 2023-07-01 00:00:00 2024-08-31 00:00:00


## get features

In [6]:
# connect to feature store
folder_path_1 = "datamart/gold/feature_store/eng/"
folder_path_2 = "datamart/gold/feature_store/cust_fin_risk/"
files_list_1 = [folder_path_1+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_1, '*'))]
files_list_2 = [folder_path_2+os.path.basename(f) for f in glob.glob(os.path.join(folder_path_2, '*'))]
feature_store_sdf_1 = spark.read.option("header", "true").parquet(*files_list_1)
feature_store_sdf_2 = spark.read.option("header", "true").parquet(*files_list_2)
print("row_count:",feature_store_sdf_1.count())
print("row_count:",feature_store_sdf_2.count())

print("======Feature Table 1======")
feature_store_sdf_1.show()
print("======Feature Table 2======")
feature_store_sdf_2.show()


row_count: 206402
row_count: 11974
+-----------+-------------+--------+--------+--------+--------+--------+--------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|
+-----------+-------------+--------+--------+--------+--------+--------+--------+
| CUS_0xc65a|   2024-03-01|     239|     236|      44|     309|       0|      35|
| CUS_0x5e1f|   2024-03-01|     103|      26|     182|     133|      27|      43|
| CUS_0x78d3|   2024-03-01|      69|     124|     140|       0|       0|     149|
| CUS_0x1844|   2024-03-01|     142|     232|      25|     145|       0|      97|
| CUS_0x7f07|   2024-03-01|      53|       0|     256|     158|     160|     141|
| CUS_0xbd2c|   2024-03-01|      93|      70|     318|      49|     329|     155|
| CUS_0x9d41|   2024-03-01|      69|       0|      47|      59|      39|      84|
| CUS_0x834f|   2024-03-01|     240|      49|      16|      24|     171|      52|
| CUS_0x4b6b|   2024-03-01|       8|     157|     274|      41|

In [7]:
# extract feature store
features_sdf_1 = feature_store_sdf_1.filter((col("snapshot_date") >= config["train_test_start_date"]) & (col("snapshot_date") <= config["oot_end_date"]))
features_sdf_2 = feature_store_sdf_2

print("extracted features_sdf_1", features_sdf_1.count(), config["train_test_start_date"], config["oot_end_date"])
print("extracted features_sdf_2", features_sdf_2.count(), config["train_test_start_date"], config["oot_end_date"])

extracted features_sdf_1 125636 2023-07-01 00:00:00 2024-08-31 00:00:00
extracted features_sdf_2 11974 2023-07-01 00:00:00 2024-08-31 00:00:00


In [8]:
features_pdf_2 = features_sdf_2.toPandas()
features_pdf_2.shape[0]

11974

In [9]:
len(features_pdf_2['Customer_ID'].unique())

11974

## prepare data for modeling

In [10]:
# check click details
features_sdf_1.show()

+-----------+-------------+--------+--------+--------+--------+--------+--------+
|Customer_ID|snapshot_date|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|
+-----------+-------------+--------+--------+--------+--------+--------+--------+
| CUS_0xc65a|   2024-03-01|     239|     236|      44|     309|       0|      35|
| CUS_0x5e1f|   2024-03-01|     103|      26|     182|     133|      27|      43|
| CUS_0x78d3|   2024-03-01|      69|     124|     140|       0|       0|     149|
| CUS_0x1844|   2024-03-01|     142|     232|      25|     145|       0|      97|
| CUS_0x7f07|   2024-03-01|      53|       0|     256|     158|     160|     141|
| CUS_0xbd2c|   2024-03-01|      93|      70|     318|      49|     329|     155|
| CUS_0x9d41|   2024-03-01|      69|       0|      47|      59|      39|      84|
| CUS_0x834f|   2024-03-01|     240|      49|      16|      24|     171|      52|
| CUS_0x4b6b|   2024-03-01|       8|     157|     274|      41|     155|     124|
| CUS_0x99c2|   

In [11]:
# check demographics and financial detaisl
features_sdf_2.show()

+-----------+-------------+------------------+------------+--------------------+--------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
|Customer_ID|snapshot_date|Credit_History_Age|Num_Fin_Pdts|       EMI_to_Salary|      Debt_to_Salary| Repayment_Ability|Loans_per_Credit_Item|Loan_Extent|Outstanding_Debt|Interest_Rate|Delay_from_due_date|Changed_Credit_Limit|
+-----------+-------------+------------------+------------+--------------------+--------------------+------------------+---------------------+-----------+----------------+-------------+-------------------+--------------------+
| CUS_0x10ac|   2024-08-01|               195|          14| 0.04008904831579113|  0.6713314768490112|          1219.258|  0.36363636363636365|        104|          853.41|           15|                 26|               18.45|
| CUS_0x10c5|   2024-08-01|               362|          10|0.006906143466531408| 0.086091996

In [12]:
# check uniqueness
features_pdf_2 = features_sdf_2.toPandas()
print("record counts: ", features_pdf_2.shape[0])
print("unqiue customer ID: ", len(features_pdf_2['Customer_ID'].unique()))

record counts:  11974
unqiue customer ID:  11974


In [13]:
# prepare data for modeling by joining tables
data_pdf_temp = labels_sdf.join(features_sdf_1, on=["Customer_ID", "snapshot_date"], how="left")
data_pdf_temp.show()
print(data_pdf_temp.count())

+-----------+-------------+--------------------+-----+----------+--------+--------+--------+--------+--------+--------+
|Customer_ID|snapshot_date|             loan_id|label| label_def|click_1m|click_2m|click_3m|click_4m|click_5m|click_6m|
+-----------+-------------+--------------------+-----+----------+--------+--------+--------+--------+--------+--------+
| CUS_0x1037|   2023-07-01|CUS_0x1037_2023_0...|    0|30dpd_6mob|     145|     203|       9|     232|      55|      63|
| CUS_0x1069|   2023-07-01|CUS_0x1069_2023_0...|    0|30dpd_6mob|      14|       0|     104|       0|     100|       0|
| CUS_0x114a|   2023-07-01|CUS_0x114a_2023_0...|    0|30dpd_6mob|     305|     150|     164|     139|     142|       0|
| CUS_0x1184|   2023-07-01|CUS_0x1184_2023_0...|    0|30dpd_6mob|       3|       0|      68|       6|     130|       0|
| CUS_0x1297|   2023-07-01|CUS_0x1297_2023_0...|    1|30dpd_6mob|     311|      46|     113|     137|      60|      55|
| CUS_0x12fb|   2023-07-01|CUS_0x12fb_20

In [14]:
features_sdf_2 = features_sdf_2.drop('snapshot_date')
data_pdf = data_pdf_temp.join(features_sdf_2, on=["Customer_ID"], how="left").toPandas()
# data_pdf = data_pdf.drop(data_pdf.columns[11], axis=1)
data_pdf

Unnamed: 0,Customer_ID,snapshot_date,loan_id,label,label_def,click_1m,click_2m,click_3m,click_4m,click_5m,...,Num_Fin_Pdts,EMI_to_Salary,Debt_to_Salary,Repayment_Ability,Loans_per_Credit_Item,Loan_Extent,Outstanding_Debt,Interest_Rate,Delay_from_due_date,Changed_Credit_Limit
0,CUS_0x1037,2023-07-01,CUS_0x1037_2023_01_01,0,30dpd_6mob,145,203,9,232,55,...,13,0.031080,0.612291,1052.627,0.400000,52,665.82,2,13,0.50
1,CUS_0x1069,2023-07-01,CUS_0x1069_2023_01_01,0,30dpd_6mob,14,0,104,0,100,...,19,0.029140,0.043496,4659.560,0.818182,81,208.80,10,9,12.56
2,CUS_0x114a,2023-07-01,CUS_0x114a_2023_01_01,0,30dpd_6mob,305,150,164,139,142,...,9,0.016486,0.521676,1210.153,0.250000,28,642.42,2,14,15.95
3,CUS_0x1184,2023-07-01,CUS_0x1184_2023_01_01,0,30dpd_6mob,3,0,68,6,130,...,11,0.030485,0.506066,1354.016,0.333333,30,707.29,11,10,6.74
4,CUS_0x1297,2023-07-01,CUS_0x1297_2023_01_01,1,30dpd_6mob,311,46,113,137,60,...,26,0.060683,0.802144,4585.221,0.500000,549,3916.47,30,61,14.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6956,CUS_0xdf6,2024-03-01,CUS_0xdf6_2023_09_01,0,30dpd_6mob,221,141,279,214,151,...,4,0.116570,0.163804,2449.468,0.000000,0,454.36,1,13,4.29
6957,CUS_0xe23,2024-03-01,CUS_0xe23_2023_09_01,0,30dpd_6mob,45,96,214,116,114,...,15,0.009343,0.687812,1414.285,0.066667,25,982.63,18,25,14.18
6958,CUS_0xe4e,2024-03-01,CUS_0xe4e_2023_09_01,0,30dpd_6mob,239,75,20,0,139,...,6,0.000000,0.151417,8738.140,0.000000,0,1323.25,9,29,16.16
6959,CUS_0xedd,2024-03-01,CUS_0xedd_2023_09_01,0,30dpd_6mob,2,138,13,164,167,...,12,0.032364,0.108824,10107.611,0.444444,72,1136.85,10,18,


In [15]:
# check data info
data_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6961 entries, 0 to 6960
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Customer_ID            6961 non-null   object 
 1   snapshot_date          6961 non-null   object 
 2   loan_id                6961 non-null   object 
 3   label                  6961 non-null   int32  
 4   label_def              6961 non-null   object 
 5   click_1m               6961 non-null   int32  
 6   click_2m               6961 non-null   int32  
 7   click_3m               6961 non-null   int32  
 8   click_4m               6961 non-null   int32  
 9   click_5m               6961 non-null   int32  
 10  click_6m               6961 non-null   int32  
 11  Credit_History_Age     6961 non-null   int32  
 12  Num_Fin_Pdts           6961 non-null   int32  
 13  EMI_to_Salary          6961 non-null   float64
 14  Debt_to_Salary         6961 non-null   float64
 15  Repa

In [16]:
# rename features
columns_to_exclude = ['Customer_ID', 'snapshot_date', 'loan_id', 'label', 'label_def']
columns_to_rename = [col for col in data_pdf.columns if col not in columns_to_exclude]
rename_dict = {col: 'feature_' + col for col in columns_to_rename}
data_pdf.rename(columns=rename_dict, inplace=True)
data_pdf.info()
data_pdf

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6961 entries, 0 to 6960
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer_ID                    6961 non-null   object 
 1   snapshot_date                  6961 non-null   object 
 2   loan_id                        6961 non-null   object 
 3   label                          6961 non-null   int32  
 4   label_def                      6961 non-null   object 
 5   feature_click_1m               6961 non-null   int32  
 6   feature_click_2m               6961 non-null   int32  
 7   feature_click_3m               6961 non-null   int32  
 8   feature_click_4m               6961 non-null   int32  
 9   feature_click_5m               6961 non-null   int32  
 10  feature_click_6m               6961 non-null   int32  
 11  feature_Credit_History_Age     6961 non-null   int32  
 12  feature_Num_Fin_Pdts           6961 non-null   i

Unnamed: 0,Customer_ID,snapshot_date,loan_id,label,label_def,feature_click_1m,feature_click_2m,feature_click_3m,feature_click_4m,feature_click_5m,...,feature_Num_Fin_Pdts,feature_EMI_to_Salary,feature_Debt_to_Salary,feature_Repayment_Ability,feature_Loans_per_Credit_Item,feature_Loan_Extent,feature_Outstanding_Debt,feature_Interest_Rate,feature_Delay_from_due_date,feature_Changed_Credit_Limit
0,CUS_0x1037,2023-07-01,CUS_0x1037_2023_01_01,0,30dpd_6mob,145,203,9,232,55,...,13,0.031080,0.612291,1052.627,0.400000,52,665.82,2,13,0.50
1,CUS_0x1069,2023-07-01,CUS_0x1069_2023_01_01,0,30dpd_6mob,14,0,104,0,100,...,19,0.029140,0.043496,4659.560,0.818182,81,208.80,10,9,12.56
2,CUS_0x114a,2023-07-01,CUS_0x114a_2023_01_01,0,30dpd_6mob,305,150,164,139,142,...,9,0.016486,0.521676,1210.153,0.250000,28,642.42,2,14,15.95
3,CUS_0x1184,2023-07-01,CUS_0x1184_2023_01_01,0,30dpd_6mob,3,0,68,6,130,...,11,0.030485,0.506066,1354.016,0.333333,30,707.29,11,10,6.74
4,CUS_0x1297,2023-07-01,CUS_0x1297_2023_01_01,1,30dpd_6mob,311,46,113,137,60,...,26,0.060683,0.802144,4585.221,0.500000,549,3916.47,30,61,14.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6956,CUS_0xdf6,2024-03-01,CUS_0xdf6_2023_09_01,0,30dpd_6mob,221,141,279,214,151,...,4,0.116570,0.163804,2449.468,0.000000,0,454.36,1,13,4.29
6957,CUS_0xe23,2024-03-01,CUS_0xe23_2023_09_01,0,30dpd_6mob,45,96,214,116,114,...,15,0.009343,0.687812,1414.285,0.066667,25,982.63,18,25,14.18
6958,CUS_0xe4e,2024-03-01,CUS_0xe4e_2023_09_01,0,30dpd_6mob,239,75,20,0,139,...,6,0.000000,0.151417,8738.140,0.000000,0,1323.25,9,29,16.16
6959,CUS_0xedd,2024-03-01,CUS_0xedd_2023_09_01,0,30dpd_6mob,2,138,13,164,167,...,12,0.032364,0.108824,10107.611,0.444444,72,1136.85,10,18,


In [17]:
# split data into train - test - oot
oot_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["oot_start_date"].date()) & (data_pdf['snapshot_date'] <= config["oot_end_date"].date())]
train_test_pdf = data_pdf[(data_pdf['snapshot_date'] >= config["train_test_start_date"].date()) & (data_pdf['snapshot_date'] <= config["train_test_end_date"].date())]

feature_cols = [fe_col for fe_col in data_pdf.columns if fe_col.startswith('feature_')]

X_oot = oot_pdf[feature_cols]
y_oot = oot_pdf["label"]
X_train, X_test, y_train, y_test = train_test_split(
    train_test_pdf[feature_cols], train_test_pdf["label"], 
    test_size= 1 - config["train_test_ratio"],
    random_state=88,     # Ensures reproducibility
    shuffle=True,        # Shuffle the data before splitting
    stratify=train_test_pdf["label"]           # Stratify based on the label column
)


print('X_train', X_train.shape[0])
print('X_test', X_test.shape[0])
print('X_oot', X_oot.shape[0])
print('y_train', y_train.shape[0], round(y_train.mean(),2))
print('y_test', y_test.shape[0], round(y_test.mean(),2))
print('y_oot', y_oot.shape[0], round(y_oot.mean(),2))

X_train

X_train 4766
X_test 1192
X_oot 1003
y_train 4766 0.28
y_test 1192 0.28
y_oot 1003 0.29


Unnamed: 0,feature_click_1m,feature_click_2m,feature_click_3m,feature_click_4m,feature_click_5m,feature_click_6m,feature_Credit_History_Age,feature_Num_Fin_Pdts,feature_EMI_to_Salary,feature_Debt_to_Salary,feature_Repayment_Ability,feature_Loans_per_Credit_Item,feature_Loan_Extent,feature_Outstanding_Debt,feature_Interest_Rate,feature_Delay_from_due_date,feature_Changed_Credit_Limit
137,0,191,11,32,0,102,243,13,0.019009,0.035021,4107.670,0.272727,33,146.68,18,11,10.96
4090,0,73,120,0,217,46,117,17,0.021612,0.322445,6258.069,0.285714,168,2062.79,20,42,7.73
3489,116,96,158,6,137,289,152,21,0.037236,3.832187,1019.694,0.375000,114,4062.77,28,19,8.79
431,149,0,118,156,143,262,316,11,0.008086,0.046414,7084.141,0.090909,31,331.53,12,31,8.06
4120,0,0,247,90,20,84,155,24,0.054666,2.259510,1478.577,0.562500,333,3536.44,23,37,2.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2211,203,0,127,153,163,188,398,7,0.008352,1.162687,1141.868,0.142857,26,1339.99,18,26,9.91
4145,347,37,94,46,5,30,396,6,0.006153,0.038251,13014.686,0.166667,0,500.94,7,0,0.48
2304,208,254,243,0,0,149,202,10,0.023795,0.104731,10812.629,0.571429,52,1160.13,20,13,18.11
6398,340,0,17,264,197,250,106,19,0.028479,1.627419,1526.659,0.333333,140,2559.02,33,28,18.86


## preprocess data

In [18]:
# set up standard scalar preprocessing
scaler = StandardScaler()

transformer_stdscaler = scaler.fit(X_train) # for standardisation, we should use training set, to prevent data leakage

# transform data
X_train_processed = transformer_stdscaler.transform(X_train)
X_test_processed = transformer_stdscaler.transform(X_test)
X_oot_processed = transformer_stdscaler.transform(X_oot)

print('X_train_processed', X_train_processed.shape[0])
print('X_test_processed', X_test_processed.shape[0])
print('X_oot_processed', X_oot_processed.shape[0])

pd.DataFrame(X_train_processed)

X_train_processed 4766
X_test_processed 1192
X_oot_processed 1003


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-1.262592,0.928934,-1.142088,-0.883872,-1.298239,-0.102800,0.183831,-0.314657,-0.118736,-0.610422,0.155421,-0.193687,-0.545787,-1.105180,0.334864,-0.669722,0.088550
1,-1.262592,-0.409540,0.120065,-1.248311,1.172961,-0.735891,-1.087219,0.385543,-0.118227,-0.360310,0.392097,-0.139927,0.619569,0.559010,0.552966,1.412446,-0.401265
2,0.069514,-0.148651,0.560082,-1.179978,0.261920,2.011272,-0.734150,1.085743,-0.115167,2.693818,-0.184446,0.229676,0.153426,2.296043,1.425375,-0.132389,-0.240521
3,0.448475,-1.237580,0.096906,0.528327,0.330248,1.706031,0.920232,-0.664757,-0.120876,-0.600508,0.483015,-0.946334,-0.563052,-0.944633,-0.319442,0.673612,-0.351222
4,-1.262592,-1.237580,1.590647,-0.223327,-1.070479,-0.306293,-0.703887,1.610893,-0.111754,1.325296,-0.133941,1.005844,2.043893,1.838912,0.880119,1.076612,-1.139780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4761,1.068594,-1.237580,0.201121,0.494161,0.558008,0.869447,1.747424,-1.364957,-0.120823,0.370857,-0.171000,-0.731292,-0.606213,-0.068761,0.334864,0.337778,-0.070678
4762,2.722243,-0.817889,-0.180999,-0.724430,-1.241299,-0.916774,1.727248,-1.540007,-0.121254,-0.607612,1.135739,-0.632732,-0.830652,-0.797497,-0.864698,-1.408556,-1.500696
4763,1.126012,1.643543,1.544329,-1.248311,-1.298239,0.428544,-0.229765,-0.839807,-0.117799,-0.549761,0.893378,1.042804,-0.381774,-0.224973,0.552966,-0.535389,1.172816
4764,2.641858,-1.237580,-1.072612,1.758308,0.945201,1.570369,-1.198184,0.735643,-0.116882,0.775260,-0.128649,0.057195,0.377865,0.989998,1.970630,0.472112,1.286550


## train model

### Random Forest

In [31]:
# Define the Random Forest classifier
rf_clf = RandomForestClassifier( random_state=88)

# Define the hyperparameter space to search
rf_param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(3, 20),
    'max_features': ['sqrt', 'log2', 0.6, 0.8],
    'min_samples_leaf': randint(1, 10),
    'min_samples_split': randint(2, 20),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
rf_random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=rf_param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=5,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
rf_random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", rf_random_search.best_params_)
print("Best AUC score: ", rf_random_search.best_score_)

# Evaluate the model on the train set
rf_best_model = rf_random_search.best_estimator_
y_pred_proba = rf_best_model.predict_proba(X_train_processed)[:, 1]
rf_train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", rf_train_auc_score)

# Evaluate the model on the test set
rf_best_model = rf_random_search.best_estimator_
y_pred_proba = rf_best_model.predict_proba(X_test_processed)[:, 1]
rf_test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", rf_test_auc_score)

# Evaluate the model on the oot set
rf_best_model = rf_random_search.best_estimator_
y_pred_proba = rf_best_model.predict_proba(X_oot_processed)[:, 1]
rf_oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", rf_oot_auc_score)

print("TRAIN GINI score: ", round(2*rf_train_auc_score-1,3))
print("Test GINI score: ", round(2*rf_test_auc_score-1,3))
print("OOT GINI score: ", round(2*rf_oot_auc_score-1,3))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 0.6, 'min_samples_leaf': 2, 'min_samples_split': 18, 'n_estimators': 153}
Best AUC score:  0.7077614993349048
Train AUC score:  0.8144771932427862
Test AUC score:  0.7969805820188454
OOT AUC score:  0.7860583420209275
TRAIN GINI score:  0.629
Test GINI score:  0.594
OOT GINI score:  0.572


### XGBoost

In [34]:
# Define the XGBoost classifier
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=88)

# Define the hyperparameter space to search
xgb_param_dist = {
    'n_estimators': randint(25, 101),  
    'max_depth': randint(2, 6),       
    'learning_rate': loguniform(0.01, 0.2),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.2),
    'min_child_weight': randint(1, 6),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(1, 1.5) 
}
# Create a scorer based on AUC score
auc_scorer = make_scorer(roc_auc_score)

# Set up the random search with cross-validation
xgb_random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_param_dist,
    scoring=auc_scorer,
    n_iter=100,  # Number of iterations for random search
    cv=10,       # Number of folds in cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Perform the random search
xgb_random_search.fit(X_train_processed, y_train)

# Output the best parameters and best score
print("Best parameters found: ", xgb_random_search.best_params_)
print("Best AUC score: ", xgb_random_search.best_score_)

# Evaluate the model on the train set
xgb_best_model = xgb_random_search.best_estimator_
y_pred_proba = xgb_best_model.predict_proba(X_train_processed)[:, 1]
xgb_train_auc_score = roc_auc_score(y_train, y_pred_proba)
print("Train AUC score: ", xgb_train_auc_score)

# Evaluate the model on the test set
xgb_best_model = xgb_random_search.best_estimator_
y_pred_proba = xgb_best_model.predict_proba(X_test_processed)[:, 1]
xgb_test_auc_score = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", xgb_test_auc_score)

# Evaluate the model on the oot set
xgb_best_model = xgb_random_search.best_estimator_
y_pred_proba = xgb_best_model.predict_proba(X_oot_processed)[:, 1]
xgb_oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", xgb_oot_auc_score)

print("TRAIN GINI score: ", round(2*xgb_train_auc_score-1,3))
print("Test GINI score: ", round(2*xgb_test_auc_score-1,3))
print("OOT GINI score: ", round(2*xgb_oot_auc_score-1,3))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best parameters found:  {'colsample_bytree': np.float64(0.8880146146184298), 'gamma': np.float64(0.12822952657705947), 'learning_rate': np.float64(0.07995538921031954), 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 67, 'reg_alpha': np.float64(0.5990293647773265), 'reg_lambda': np.float64(2.2401982417136175), 'subsample': np.float64(0.9836299178256112)}
Best AUC score:  0.6970855502801883
Train AUC score:  0.9025417542297669
Test AUC score:  0.7970291703541742
OOT AUC score:  0.7976707594887833
TRAIN GINI score:  0.805
Test GINI score:  0.594
OOT GINI score:  0.595


## prepare model artefact to save

In [35]:
model_artefact = {}

model_artefact['model'] = rf_best_model
model_artefact['model_version'] = "credit_model_"+config["model_train_date_str"].replace('-','_')
model_artefact['preprocessing_transformers'] = {}
model_artefact['preprocessing_transformers']['stdscaler'] = transformer_stdscaler
model_artefact['data_dates'] = config
model_artefact['data_stats'] = {}
model_artefact['data_stats']['X_train'] = X_train.shape[0]
model_artefact['data_stats']['X_test'] = X_test.shape[0]
model_artefact['data_stats']['X_oot'] = X_oot.shape[0]
model_artefact['data_stats']['y_train'] = round(y_train.mean(),2)
model_artefact['data_stats']['y_test'] = round(y_test.mean(),2)
model_artefact['data_stats']['y_oot'] = round(y_oot.mean(),2)
model_artefact['results'] = {}
model_artefact['results']['auc_train'] = rf_train_auc_score
model_artefact['results']['auc_test'] = rf_test_auc_score
model_artefact['results']['auc_oot'] = rf_oot_auc_score
model_artefact['results']['gini_train'] = round(2*rf_train_auc_score-1,3)
model_artefact['results']['gini_test'] = round(2*rf_test_auc_score-1,3)
model_artefact['results']['gini_oot'] = round(2*rf_oot_auc_score-1,3)
model_artefact['hp_params'] = rf_random_search.best_params_


pprint.pprint(model_artefact)

{'data_dates': {'model_train_date': datetime.datetime(2024, 9, 1, 0, 0),
                'model_train_date_str': '2024-09-01',
                'oot_end_date': datetime.datetime(2024, 8, 31, 0, 0),
                'oot_period_months': 2,
                'oot_start_date': datetime.datetime(2024, 7, 1, 0, 0),
                'train_test_end_date': datetime.datetime(2024, 6, 30, 0, 0),
                'train_test_period_months': 12,
                'train_test_ratio': 0.8,
                'train_test_start_date': datetime.datetime(2023, 7, 1, 0, 0)},
 'data_stats': {'X_oot': 1003,
                'X_test': 1192,
                'X_train': 4766,
                'y_oot': np.float64(0.29),
                'y_test': np.float64(0.28),
                'y_train': np.float64(0.28)},
 'hp_params': {'bootstrap': True,
               'criterion': 'entropy',
               'max_depth': 4,
               'max_features': 0.6,
               'min_samples_leaf': 2,
               'min_samples_split': 18,


## save artefact to model bank

In [36]:
# create model_bank dir
model_bank_directory = "model_bank/"

if not os.path.exists(model_bank_directory):
    os.makedirs(model_bank_directory)

In [37]:
# Full path to the file
file_path = os.path.join(model_bank_directory, model_artefact['model_version'] + '.pkl')

# Write the model to a pickle file
with open(file_path, 'wb') as file:
    pickle.dump(model_artefact, file)

print(f"Model saved to {file_path}")

Model saved to model_bank/credit_model_2024_09_01.pkl


## test load pickle and make model inference

In [39]:
# Load the model from the pickle file
with open(file_path, 'rb') as file:
    loaded_model_artefact = pickle.load(file)

y_pred_proba = loaded_model_artefact['model'].predict_proba(X_oot_processed)[:, 1]
oot_auc_score = roc_auc_score(y_oot, y_pred_proba)
print("OOT AUC score: ", oot_auc_score)

print("Model loaded successfully!")

OOT AUC score:  0.7860583420209275
Model loaded successfully!
