In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from utils.data_processing_bronze_table import process_bronze_tables
from utils.data_processing_silver_table import process_silver_tables
from utils.data_processing_gold_table import process_gold_tables

In [2]:
# Build (or reuse) the notebook's Spark session: app name "dev", local mode
# with the "local[*]" master.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so warnings do not clutter cell output
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 19:37:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### EDA

In [3]:
# Load the raw clickstream feature CSV into pandas for exploration
clickstream_df = pd.read_csv("data/feature_clickstream.csv")

In [16]:
# Convert snapshot_date to datetimes in place; values that cannot be parsed
# become NaT because of errors='coerce'
clickstream_df['snapshot_date'] = pd.to_datetime(
    clickstream_df['snapshot_date'],
    errors='coerce',
)

# Date range covered by the clickstream data
min_date, max_date = (
    clickstream_df['snapshot_date'].min(),
    clickstream_df['snapshot_date'].max(),
)

print("Earliest snapshot date:", min_date)
print("Latest snapshot date:", max_date)

Earliest snapshot date: 2023-01-01 00:00:00
Latest snapshot date: 2024-12-01 00:00:00


In [4]:
# Load the raw customer attributes CSV into pandas for exploration
attributes_df = pd.read_csv("data/features_attributes.csv")

In [5]:
# Load the raw customer financials CSV into pandas for exploration
financials_df = pd.read_csv("data/features_financials.csv")

In [14]:
# Convert snapshot_date to datetimes in place; values that cannot be parsed
# become NaT because of errors='coerce'
financials_df['snapshot_date'] = pd.to_datetime(
    financials_df['snapshot_date'],
    errors='coerce',
)

# Date range covered by the financials data
min_date, max_date = (
    financials_df['snapshot_date'].min(),
    financials_df['snapshot_date'].max(),
)

print("Earliest snapshot date:", min_date)
print("Latest snapshot date:", max_date)

Earliest snapshot date: 2023-01-01 00:00:00
Latest snapshot date: 2025-01-01 00:00:00


In [26]:
# Load the raw loan management (LMS) CSV into pandas for exploration
loans_df = pd.read_csv("data/lms_loan_daily.csv")

In [27]:
# Convert loan_start_date to datetimes in place; values that cannot be parsed
# become NaT because of errors='coerce'
loans_df['loan_start_date'] = pd.to_datetime(
    loans_df['loan_start_date'],
    errors='coerce',
)

# Range of loan origination dates present in the data
min_date, max_date = (
    loans_df['loan_start_date'].min(),
    loans_df['loan_start_date'].max(),
)

print("Earliest loan start date:", min_date)
print("Latest loan start date:", max_date)

Earliest loan start date: 2023-01-01 00:00:00
Latest loan start date: 2025-01-01 00:00:00


In [28]:
# Display the loan table (one row per loan per snapshot_date in the preview below)
loans_df

Unnamed: 0,loan_id,Customer_ID,loan_start_date,tenure,installment_num,loan_amt,due_amt,paid_amt,overdue_amt,balance,snapshot_date
0,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,0,10000,0.0,0.0,0.0,10000.0,2023-05-01
1,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,1,10000,1000.0,1000.0,0.0,9000.0,2023-06-01
2,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,2,10000,1000.0,1000.0,0.0,8000.0,2023-07-01
3,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,3,10000,1000.0,0.0,1000.0,8000.0,2023-08-01
4,CUS_0x1000_2023_05_01,CUS_0x1000,2023-05-01,10,4,10000,1000.0,2000.0,0.0,6000.0,2023-09-01
...,...,...,...,...,...,...,...,...,...,...,...
137495,CUS_0xffd_2024_03_01,CUS_0xffd,2024-03-01,10,6,10000,1000.0,1000.0,0.0,4000.0,2024-09-01
137496,CUS_0xffd_2024_03_01,CUS_0xffd,2024-03-01,10,7,10000,1000.0,1000.0,0.0,3000.0,2024-10-01
137497,CUS_0xffd_2024_03_01,CUS_0xffd,2024-03-01,10,8,10000,1000.0,1000.0,0.0,2000.0,2024-11-01
137498,CUS_0xffd_2024_03_01,CUS_0xffd,2024-03-01,10,9,10000,1000.0,1000.0,0.0,1000.0,2024-12-01


In [30]:
# Share of ALL customers who have nothing overdue before installment 4 but at
# least one overdue amount at installment 4 or later.

# Every customer in the loan table; used to align both flags explicitly so a
# customer with no rows on one side of the cut-off is False rather than NaN.
all_customers = pd.Index(
    loans_df["Customer_ID"].dropna().unique(), name="Customer_ID"
).sort_values()

late_rows = loans_df.loc[loans_df["installment_num"] >= 4]
early_rows = loans_df.loc[loans_df["installment_num"] < 4]

# Condition 1: any overdue amount at installment 4 or later
any_overdue_after_4 = (
    late_rows["overdue_amt"].gt(0)
    .groupby(late_rows["Customer_ID"]).any()
    .reindex(all_customers, fill_value=False)
)

# Condition 2: every overdue amount before installment 4 is exactly zero
no_overdue_before_4 = (
    early_rows["overdue_amt"].eq(0)
    .groupby(early_rows["Customer_ID"]).all()
    .reindex(all_customers, fill_value=False)
)

# One boolean column per condition, indexed by Customer_ID
flags = pd.DataFrame({
    "any_overdue_after_4": any_overdue_after_4,
    "no_overdue_before_4": no_overdue_before_4,
})

# Select customers satisfying both
qualified_customers = flags[
    flags["any_overdue_after_4"] & flags["no_overdue_before_4"]
]

# Denominator: every distinct customer in the loan table
total_customers = loans_df["Customer_ID"].nunique()
qualified_percentage = (len(qualified_customers) / total_customers) * 100

print(f"Percentage of customers meeting condition: {qualified_percentage:.2f}%")

# Optional: get qualifying customer IDs
qualified_ids = qualified_customers.index.tolist()

Percentage of customers meeting condition: 5.14%


In [31]:
# Among customers who were EVER overdue, the share who have nothing overdue
# before installment 4 but at least one overdue amount at installment 4 or later.

# Every customer in the loan table; used to align both flags explicitly so a
# customer with no rows on one side of the cut-off is False rather than NaN
# (keeps the columns boolean instead of filling an object-dtype frame).
all_customers = pd.Index(
    loans_df["Customer_ID"].dropna().unique(), name="Customer_ID"
).sort_values()

late_rows = loans_df.loc[loans_df["installment_num"] >= 4]
early_rows = loans_df.loc[loans_df["installment_num"] < 4]

# Condition 1: any overdue amount at installment 4 or later
any_overdue_after_4 = (
    late_rows["overdue_amt"].gt(0)
    .groupby(late_rows["Customer_ID"]).any()
    .reindex(all_customers, fill_value=False)
)

# Condition 2: every overdue amount before installment 4 is exactly zero
no_overdue_before_4 = (
    early_rows["overdue_amt"].eq(0)
    .groupby(early_rows["Customer_ID"]).all()
    .reindex(all_customers, fill_value=False)
)

# One boolean column per condition, indexed by Customer_ID
flags = pd.DataFrame({
    "any_overdue_after_4": any_overdue_after_4,
    "no_overdue_before_4": no_overdue_before_4,
})

# Select customers satisfying both
qualified_customers = flags[
    flags["any_overdue_after_4"] & flags["no_overdue_before_4"]
]

# Denominator: customers who had *any* overdue_amt > 0 anywhere
customers_with_any_overdue = (
    loans_df["overdue_amt"].gt(0)
    .groupby(loans_df["Customer_ID"]).any()
)
num_customers_with_overdue = customers_with_any_overdue.sum()

# Compute new percentage
qualified_percentage = (len(qualified_customers) / num_customers_with_overdue) * 100

print(f"Percentage of customers meeting condition: {qualified_percentage:.2f}%")

# Optional: list of qualifying IDs
qualified_ids = qualified_customers.index.tolist()

Percentage of customers meeting condition: 17.82%


In [35]:
# Among customers who were EVER overdue, the share who have nothing overdue
# before the cut-off installment but at least one overdue amount at or after it.

# Installment at which the "late" window starts. Both filters below read this
# single constant so the two windows cannot drift apart.
CUTOFF_INSTALLMENT = 5

# Every customer in the loan table; used to align both flags explicitly so a
# customer with no rows on one side of the cut-off is False rather than NaN.
all_customers = pd.Index(
    loans_df["Customer_ID"].dropna().unique(), name="Customer_ID"
).sort_values()

late_rows = loans_df.loc[loans_df["installment_num"] >= CUTOFF_INSTALLMENT]
early_rows = loans_df.loc[loans_df["installment_num"] < CUTOFF_INSTALLMENT]

# NOTE: the `_4` suffix in the names below is historical; the threshold that
# is actually applied is CUTOFF_INSTALLMENT.

# Condition 1: any overdue amount at or after the cut-off installment
any_overdue_after_4 = (
    late_rows["overdue_amt"].gt(0)
    .groupby(late_rows["Customer_ID"]).any()
    .reindex(all_customers, fill_value=False)
)

# Condition 2: every overdue amount before the cut-off installment is exactly zero
no_overdue_before_4 = (
    early_rows["overdue_amt"].eq(0)
    .groupby(early_rows["Customer_ID"]).all()
    .reindex(all_customers, fill_value=False)
)

# One boolean column per condition, indexed by Customer_ID
flags = pd.DataFrame({
    "any_overdue_after_4": any_overdue_after_4,
    "no_overdue_before_4": no_overdue_before_4,
})

# Select customers satisfying both
qualified_customers = flags[
    flags["any_overdue_after_4"] & flags["no_overdue_before_4"]
]

# Denominator: customers who had *any* overdue_amt > 0 anywhere
customers_with_any_overdue = (
    loans_df["overdue_amt"].gt(0)
    .groupby(loans_df["Customer_ID"]).any()
)
num_customers_with_overdue = customers_with_any_overdue.sum()

# Compute new percentage
qualified_percentage = (len(qualified_customers) / num_customers_with_overdue) * 100

print(f"Percentage of customers meeting condition: {qualified_percentage:.2f}%")

# Optional: list of qualifying IDs
qualified_ids = qualified_customers.index.tolist()

Percentage of customers meeting condition: 2.97%


In [38]:
# Count customers with any overdue amount, overall and within the early
# installments only.

# Upper bound (inclusive) on installment_num for the "early" count. The printed
# label is built from this constant so it always matches the filter applied.
MAX_INSTALLMENT = 4

# Row-level overdue flag shared by both counts
overdue_rows = loans_df["overdue_amt"] > 0

# (1) Customers who have any overdue_amt > 0 at any period
customers_with_overdue_any = overdue_rows.groupby(loans_df["Customer_ID"]).any()
num_customers_with_overdue_any = customers_with_overdue_any.sum()

# (2) Customers who have any overdue_amt > 0 when installment_num <= MAX_INSTALLMENT
# NOTE: the `upto6` suffix in the names is historical; the bound that is
# actually applied is MAX_INSTALLMENT.
early_mask = loans_df["installment_num"] <= MAX_INSTALLMENT
customers_with_overdue_upto6 = (
    overdue_rows[early_mask]
    .groupby(loans_df.loc[early_mask, "Customer_ID"]).any()
)
num_customers_with_overdue_upto6 = customers_with_overdue_upto6.sum()

# Print results
print(f"1) Customers with any overdue_amt at any period: {num_customers_with_overdue_any}")
print(
    f"2) Customers with any overdue_amt when installment_num <= {MAX_INSTALLMENT}: "
    f"{num_customers_with_overdue_upto6}"
)


1) Customers with any overdue_amt at any period: 3602
2) Customers with any overdue_amt when installment_num <= 6: 3495


In [18]:
# Gold label-store partition to inspect
path = "datamart/gold/label_store/2024_10_01.parquet"

# Read that partition as a Spark DataFrame
df = spark.read.format("parquet").load(path)

# Report how many rows the partition holds
print("Row count:", df.count())

# Pull the rows into pandas for interactive inspection
pdf = df.toPandas()

Row count: 498


In [5]:
# Gold feature-store partition to inspect
path = "datamart/gold/feature_store/2024_06_01.parquet"

# Read that partition as a Spark DataFrame
df = spark.read.format("parquet").load(path)

# Report how many rows the partition holds
print("Row count:", df.count())

# Pull the rows into pandas for interactive inspection
pdf = df.toPandas()

                                                                                

Row count: 5418


In [7]:
# List the column names of whichever frame `pdf` currently holds
pdf.columns.to_list()

['Customer_ID',
 'loan_start_date',
 'Age',
 'Age_valid',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age_Months',
 'Payment_of_Min_Amount',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'avg_fe_1_L3M',
 'avg_fe_2_L3M',
 'avg_fe_3_L3M',
 'avg_fe_4_L3M',
 'avg_fe_5_L3M',
 'avg_fe_6_L3M',
 'avg_fe_7_L3M',
 'avg_fe_8_L3M',
 'avg_fe_9_L3M',
 'avg_fe_10_L3M',
 'avg_fe_11_L3M',
 'avg_fe_12_L3M',
 'avg_fe_13_L3M',
 'avg_fe_14_L3M',
 'avg_fe_15_L3M',
 'avg_fe_16_L3M',
 'avg_fe_17_L3M',
 'avg_fe_18_L3M',
 'avg_fe_19_L3M',
 'avg_fe_20_L3M',
 'sum_fe_1_L3M',
 'sum_fe_2_L3M',
 'sum_fe_3_L3M',
 'sum_fe_4_L3M',
 'sum_fe_5_L3M',
 'sum_fe_6_L3M',
 'sum_fe_7_L3M',
 'sum_fe_8_L3M',
 'sum_fe_9_L3M',
 'sum_fe_10

In [12]:
# Display whichever frame `pdf` currently holds
pdf

Unnamed: 0,Customer_ID,loan_start_date,Age,Age_valid,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,...,LoanType_Auto_Loan,LoanType_Credit-Builder_Loan,LoanType_Debt_Consolidation_Loan,LoanType_Home_Equity_Loan,LoanType_Mortgage_Loan,LoanType_Not_Specified,LoanType_Payday_Loan,LoanType_Personal_Loan,LoanType_Student_Loan,LoanType_No_Loan
0,CUS_0x100b,2024-03-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
1,CUS_0x1011,2023-11-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
2,CUS_0x1013,2023-12-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
3,CUS_0x1015,2023-08-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
4,CUS_0x1018,2023-11-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5413,CUS_0xfe4,2023-09-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
5414,CUS_0xfea,2023-10-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
5415,CUS_0xff3,2024-06-01,55.0,1.0,Scientist,17032.789062,1176.398804,0.0,6.0,2.0,...,1,0,0,0,1,0,0,1,0,0
5416,CUS_0xffc,2024-01-01,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1


In [7]:
# Number of rows per loan_start_date in the frame `pdf` currently holds
pdf['loan_start_date'].value_counts()

loan_start_date
2024-02-01    518
2024-04-01    513
2024-03-01    511
2024-06-01    498
2023-11-01    491
2024-05-01    491
2023-12-01    489
2023-10-01    487
2024-01-01    485
2023-08-01    481
2023-09-01    454
Name: count, dtype: int64

In [33]:
# Load every snapshot partition of the gold label store into one frame.

# Base directory containing all subfolders
base_path = "datamart/gold/label_store/"

# List all subfolders ending with ".parquet". os.listdir() returns entries in
# arbitrary order, so sort them to make the read order reproducible.
subfolders = sorted(
    os.path.join(base_path, f)
    for f in os.listdir(base_path)
    if f.endswith(".parquet")
)

# Fail with a clear message here rather than with an AttributeError on
# `None.count()` further down.
if not subfolders:
    raise FileNotFoundError(f"No .parquet partitions found under {base_path}")

# Read all parquet partitions into a single Spark DataFrame; unionByName
# matches columns by name rather than by position.
df = None
for folder in subfolders:
    print(f"Reading: {folder}")
    temp_df = spark.read.parquet(folder)
    df = temp_df if df is None else df.unionByName(temp_df)

# Count total rows
print("Total row count:", df.count())

# Convert to pandas
pdf = df.toPandas()

Reading: datamart/gold/label_store/2023_04_01.parquet
Reading: datamart/gold/label_store/2023_05_01.parquet
Reading: datamart/gold/label_store/2023_06_01.parquet
Reading: datamart/gold/label_store/2023_07_01.parquet
Reading: datamart/gold/label_store/2023_08_01.parquet
Reading: datamart/gold/label_store/2023_09_01.parquet
Reading: datamart/gold/label_store/2023_10_01.parquet
Reading: datamart/gold/label_store/2023_11_01.parquet
Reading: datamart/gold/label_store/2023_12_01.parquet
Reading: datamart/gold/label_store/2024_01_01.parquet
Reading: datamart/gold/label_store/2024_02_01.parquet
Reading: datamart/gold/label_store/2024_03_01.parquet
Reading: datamart/gold/label_store/2024_04_01.parquet
Reading: datamart/gold/label_store/2024_05_01.parquet
Reading: datamart/gold/label_store/2024_06_01.parquet
Reading: datamart/gold/label_store/2024_07_01.parquet
Reading: datamart/gold/label_store/2024_08_01.parquet
Reading: datamart/gold/label_store/2024_09_01.parquet
Reading: datamart/gold/label

In [34]:
# List the column names of whichever frame `pdf` currently holds
pdf.columns.to_list()

['Customer_ID', 'loan_start_date', 'label', 'label_def', 'snapshot_date']

In [26]:
# Class balance: number of rows per label value in the frame `pdf` currently holds
pdf['label'].value_counts()

label
0    8020
1    2495
Name: count, dtype: int64

In [7]:
# Load every snapshot partition of the gold feature store into one frame.

# Base directory containing all subfolders
base_path = "datamart/gold/feature_store/"

# List all subfolders ending with ".parquet". os.listdir() returns entries in
# arbitrary order, so sort them to make the read order reproducible.
subfolders = sorted(
    os.path.join(base_path, f)
    for f in os.listdir(base_path)
    if f.endswith(".parquet")
)

# Fail with a clear message here rather than with an AttributeError on
# `None.count()` further down.
if not subfolders:
    raise FileNotFoundError(f"No .parquet partitions found under {base_path}")

# Read all parquet partitions into a single Spark DataFrame; unionByName
# matches columns by name rather than by position.
df = None
for folder in subfolders:
    print(f"Reading: {folder}")
    temp_df = spark.read.parquet(folder)
    df = temp_df if df is None else df.unionByName(temp_df)

# Count total rows
print("Total row count:", df.count())

# Convert to pandas
pdf = df.toPandas()

Reading: datamart/gold/feature_store/2023_04_01.parquet
Reading: datamart/gold/feature_store/2023_05_01.parquet
Reading: datamart/gold/feature_store/2023_06_01.parquet
Reading: datamart/gold/feature_store/2023_07_01.parquet
Reading: datamart/gold/feature_store/2023_08_01.parquet
Reading: datamart/gold/feature_store/2023_09_01.parquet
Reading: datamart/gold/feature_store/2023_10_01.parquet
Reading: datamart/gold/feature_store/2023_11_01.parquet
Reading: datamart/gold/feature_store/2023_12_01.parquet
Reading: datamart/gold/feature_store/2024_01_01.parquet
Reading: datamart/gold/feature_store/2024_02_01.parquet
Reading: datamart/gold/feature_store/2024_03_01.parquet
Reading: datamart/gold/feature_store/2024_04_01.parquet
Reading: datamart/gold/feature_store/2024_05_01.parquet
Reading: datamart/gold/feature_store/2024_06_01.parquet
Reading: datamart/gold/feature_store/2024_07_01.parquet
Reading: datamart/gold/feature_store/2024_08_01.parquet
Reading: datamart/gold/feature_store/2024_09_01.

                                                                                

In [32]:
# List the column names of whichever frame `pdf` currently holds
pdf.columns.to_list()

['Customer_ID',
 'loan_start_date',
 'Age',
 'Age_valid',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age_Months',
 'Payment_of_Min_Amount',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'avg_fe_1_L3M',
 'avg_fe_2_L3M',
 'avg_fe_3_L3M',
 'avg_fe_4_L3M',
 'avg_fe_5_L3M',
 'avg_fe_6_L3M',
 'avg_fe_7_L3M',
 'avg_fe_8_L3M',
 'avg_fe_9_L3M',
 'avg_fe_10_L3M',
 'avg_fe_11_L3M',
 'avg_fe_12_L3M',
 'avg_fe_13_L3M',
 'avg_fe_14_L3M',
 'avg_fe_15_L3M',
 'avg_fe_16_L3M',
 'avg_fe_17_L3M',
 'avg_fe_18_L3M',
 'avg_fe_19_L3M',
 'avg_fe_20_L3M',
 'sum_fe_1_L3M',
 'sum_fe_2_L3M',
 'sum_fe_3_L3M',
 'sum_fe_4_L3M',
 'sum_fe_5_L3M',
 'sum_fe_6_L3M',
 'sum_fe_7_L3M',
 'sum_fe_8_L3M',
 'sum_fe_9_L3M',
 'sum_fe_10

### Fun with YAML

In [5]:
import yaml

# Parse the silver-layer YAML config into `config`
# (open() defaults to read-only text mode)
with open("config/silver_config.yaml") as f:
    config = yaml.safe_load(f)

In [25]:
# Inspect the config entry for the features_attributes dataset
config['datasets']['features_attributes']

{'path': 'features_attributes',
 'types': {'Customer_ID': 'string',
  'Name': 'string',
  'Age': 'int',
  'SSN': 'string',
  'Occupation': 'string',
  'snapshot_date': 'date'},
 'drop_nulls': ['Customer_ID']}

In [10]:
# Show every (dataset name, dataset config) pair defined under `datasets`
config['datasets'].items()

dict_items([('features_attributes', {'path': 'features_attributes', 'types': {'Customer_ID': 'string', 'Name': 'string', 'Age': 'int', 'SSN': 'string', 'Occupation': 'string', 'snapshot_date': 'date'}, 'drop_nulls': ['Customer_ID']}), ('features_financials', {'path': 'features_financials', 'types': {'Customer_ID': 'string', 'Annual_Income': 'float', 'Monthly_Inhand_Salary': 'float', 'Num_Bank_Accounts': 'int', 'Num_Credit_Card': 'int', 'Interest_Rate': 'int', 'Num_of_Loan': 'int', 'Type_of_Loan': 'string', 'Delay_from_due_date': 'int', 'Num_of_Delayed_Payment': 'int', 'Changed_Credit_Limit': 'float', 'Num_Credit_Inquiries': 'int', 'Credit_Mix': 'string', 'Outstanding_Debt': 'float', 'Credit_Utilization_Ratio': 'float', 'Credit_History_Age': 'string', 'Payment_of_Min_Amount': 'string', 'Total_EMI_per_month': 'float', 'Amount_invested_monthly': 'float', 'Payment_Behaviour': 'string', 'Monthly_Balance': 'float', 'snapshot_date': 'date'}, 'drop_nulls': ['Customer_ID']}), ('feature_clickstr

In [12]:
# Echo each dataset's name followed by its config mapping
for table_name in config["datasets"]:
    dataset_config = config["datasets"][table_name]
    print(f"Table Name: {table_name}")
    print(f"Dataset Config: {dataset_config}")

Table Name: features_attributes
Dataset Config: {'path': 'features_attributes', 'types': {'Customer_ID': 'string', 'Name': 'string', 'Age': 'int', 'SSN': 'string', 'Occupation': 'string', 'snapshot_date': 'date'}, 'drop_nulls': ['Customer_ID']}
Table Name: features_financials
Dataset Config: {'path': 'features_financials', 'types': {'Customer_ID': 'string', 'Annual_Income': 'float', 'Monthly_Inhand_Salary': 'float', 'Num_Bank_Accounts': 'int', 'Num_Credit_Card': 'int', 'Interest_Rate': 'int', 'Num_of_Loan': 'int', 'Type_of_Loan': 'string', 'Delay_from_due_date': 'int', 'Num_of_Delayed_Payment': 'int', 'Changed_Credit_Limit': 'float', 'Num_Credit_Inquiries': 'int', 'Credit_Mix': 'string', 'Outstanding_Debt': 'float', 'Credit_Utilization_Ratio': 'float', 'Credit_History_Age': 'string', 'Payment_of_Min_Amount': 'string', 'Total_EMI_per_month': 'float', 'Amount_invested_monthly': 'float', 'Payment_Behaviour': 'string', 'Monthly_Balance': 'float', 'snapshot_date': 'date'}, 'drop_nulls': ['C

# ETL Pipeline

In [3]:
# -------------------------------------------------------------------------
# Run bronze processing for each monthly snapshot_date from 2023-01-01
# through 2025-01-01 (both inclusive)
# -------------------------------------------------------------------------

start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 1, 1)

# Snapshots whose processing raised; reported once after the loop so that
# failures are not lost inside the per-snapshot log output.
failed_snapshots = []

current_date = start_date

while current_date <= end_date:
    snapshot_str = current_date.strftime("%Y-%m-%d")
    print(f"\n=== Processing snapshot_date: {snapshot_str} ===")

    try:
        process_bronze_tables(snapshot_date=snapshot_str, spark=spark)
    except Exception as e:
        # Best-effort backfill: report the failure and continue with the next month
        print(f"⚠️ Failed to process {snapshot_str}: {e}")
        failed_snapshots.append(snapshot_str)

    # Increment by 1 month; relativedelta (imported with the other notebook
    # imports) steps by calendar month rather than by a fixed number of days
    current_date += relativedelta(months=1)

if failed_snapshots:
    print(
        f"\n⚠️ {len(failed_snapshots)} snapshot(s) failed: "
        + ", ".join(failed_snapshots)
    )


=== Processing snapshot_date: 2023-01-01 ===
2025-10-25 13:53:26,347 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:53:26,359 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-01-01
2025-10-25 13:53:26,361 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:53:26,361 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:53:31,647 [INFO] 📅 features_attributes: 530 records found for 2023-01-01


                                                                                

2025-10-25 13:53:33,323 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_01_01.parquet (530 rows)
2025-10-25 13:53:33,324 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:53:33,976 [INFO] 📅 features_financials: 530 records found for 2023-01-01


                                                                                

2025-10-25 13:53:35,099 [INFO] ✅ Saved Bronze file: datamart/bronze/features_financials/2023_01_01.parquet (530 rows)
2025-10-25 13:53:35,100 [INFO] 📂 Processing file: data/feature_clickstream.csv
2025-10-25 13:53:35,941 [INFO] 📅 feature_clickstream: 8974 records found for 2023-01-01


                                                                                

2025-10-25 13:53:37,540 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_01_01.parquet (8974 rows)
2025-10-25 13:53:37,542 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:53:38,137 [INFO] 📅 lms_loan_daily: 530 records found for 2023-01-01


                                                                                

2025-10-25 13:53:39,382 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_01_01.parquet (530 rows)
2025-10-25 13:53:39,383 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-02-01 ===
2025-10-25 13:53:39,384 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:53:39,390 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-02-01
2025-10-25 13:53:39,391 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:53:39,392 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:53:39,781 [INFO] 📅 features_attributes: 501 records found for 2023-02-01
2025-10-25 13:53:40,698 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_02_01.parquet (501 rows)
2025-10-25 13:53:40,699 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:53:41,248 [INFO] 📅 features_financials: 501 records found for 2023-02-01
2025-10-25 13:53:42,245 [INFO] ✅ Saved Bronze 

                                                                                

2025-10-25 13:53:44,104 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_02_01.parquet (8974 rows)
2025-10-25 13:53:44,105 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:53:44,040 [INFO] 📅 lms_loan_daily: 1031 records found for 2023-02-01


                                                                                

2025-10-25 13:53:45,152 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_02_01.parquet (1031 rows)
2025-10-25 13:53:45,153 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-03-01 ===
2025-10-25 13:53:45,153 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:53:45,159 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-03-01
2025-10-25 13:53:45,160 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:53:45,161 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:53:45,476 [INFO] 📅 features_attributes: 506 records found for 2023-03-01
2025-10-25 13:53:46,242 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_03_01.parquet (506 rows)
2025-10-25 13:53:46,243 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:53:46,666 [INFO] 📅 features_financials: 506 records found for 2023-03-01
2025-10-25 13:53:47,508 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:53:49,855 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_03_01.parquet (8974 rows)
2025-10-25 13:53:49,857 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:53:50,454 [INFO] 📅 lms_loan_daily: 1537 records found for 2023-03-01


                                                                                

2025-10-25 13:53:51,685 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_03_01.parquet (1537 rows)
2025-10-25 13:53:51,687 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-04-01 ===
2025-10-25 13:53:51,688 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:53:51,695 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-04-01
2025-10-25 13:53:51,696 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:53:51,697 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:53:52,033 [INFO] 📅 features_attributes: 510 records found for 2023-04-01
2025-10-25 13:53:52,869 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_04_01.parquet (510 rows)
2025-10-25 13:53:52,871 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:53:53,324 [INFO] 📅 features_financials: 510 records found for 2023-04-01
2025-10-25 13:53:54,281 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:53:56,469 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_04_01.parquet (8974 rows)
2025-10-25 13:53:56,470 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:53:57,051 [INFO] 📅 lms_loan_daily: 2047 records found for 2023-04-01


                                                                                

2025-10-25 13:53:58,472 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_04_01.parquet (2047 rows)
2025-10-25 13:53:58,473 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-05-01 ===
2025-10-25 13:53:58,475 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:53:58,481 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-05-01
2025-10-25 13:53:58,482 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:53:58,483 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:53:58,787 [INFO] 📅 features_attributes: 521 records found for 2023-05-01
2025-10-25 13:53:59,607 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_05_01.parquet (521 rows)
2025-10-25 13:53:59,608 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:53:59,986 [INFO] 📅 features_financials: 521 records found for 2023-05-01
2025-10-25 13:54:00,833 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:03,029 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_05_01.parquet (8974 rows)
2025-10-25 13:54:03,030 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:03,541 [INFO] 📅 lms_loan_daily: 2568 records found for 2023-05-01


                                                                                

2025-10-25 13:54:04,759 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_05_01.parquet (2568 rows)
2025-10-25 13:54:04,760 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-06-01 ===
2025-10-25 13:54:04,761 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:04,766 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-06-01
2025-10-25 13:54:04,767 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:04,767 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:05,068 [INFO] 📅 features_attributes: 517 records found for 2023-06-01
2025-10-25 13:54:05,898 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_06_01.parquet (517 rows)
2025-10-25 13:54:05,899 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:06,274 [INFO] 📅 features_financials: 517 records found for 2023-06-01
2025-10-25 13:54:07,092 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:09,128 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_06_01.parquet (8974 rows)
2025-10-25 13:54:09,129 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:09,595 [INFO] 📅 lms_loan_daily: 3085 records found for 2023-06-01


                                                                                

2025-10-25 13:54:10,755 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_06_01.parquet (3085 rows)
2025-10-25 13:54:10,756 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-07-01 ===
2025-10-25 13:54:10,756 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:10,762 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-07-01
2025-10-25 13:54:10,763 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:10,763 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:11,086 [INFO] 📅 features_attributes: 471 records found for 2023-07-01
2025-10-25 13:54:11,932 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_07_01.parquet (471 rows)
2025-10-25 13:54:11,935 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:12,541 [INFO] 📅 features_financials: 471 records found for 2023-07-01
2025-10-25 13:54:13,051 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:14,541 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_07_01.parquet (8974 rows)
2025-10-25 13:54:14,542 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:14,987 [INFO] 📅 lms_loan_daily: 3556 records found for 2023-07-01


                                                                                

2025-10-25 13:54:16,157 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_07_01.parquet (3556 rows)
2025-10-25 13:54:16,158 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-08-01 ===
2025-10-25 13:54:16,159 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:16,166 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-08-01
2025-10-25 13:54:16,167 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:16,167 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:16,452 [INFO] 📅 features_attributes: 481 records found for 2023-08-01
2025-10-25 13:54:17,246 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_08_01.parquet (481 rows)
2025-10-25 13:54:17,247 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:17,598 [INFO] 📅 features_financials: 481 records found for 2023-08-01
2025-10-25 13:54:18,404 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:20,582 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_08_01.parquet (8974 rows)
2025-10-25 13:54:20,583 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:21,088 [INFO] 📅 lms_loan_daily: 4037 records found for 2023-08-01
2025-10-25 13:54:22,240 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_08_01.parquet (4037 rows)
2025-10-25 13:54:22,241 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-09-01 ===
2025-10-25 13:54:22,242 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:22,254 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-09-01
2025-10-25 13:54:22,255 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:22,255 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:22,552 [INFO] 📅 features_attributes: 454 records found for 2023-09-01
2025-10-25 13:54:23,340 [INFO] ✅ Saved Bronze file: d

                                                                                

2025-10-25 13:54:26,744 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_09_01.parquet (8974 rows)
2025-10-25 13:54:26,745 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:27,287 [INFO] 📅 lms_loan_daily: 4491 records found for 2023-09-01


                                                                                

2025-10-25 13:54:28,643 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_09_01.parquet (4491 rows)
2025-10-25 13:54:28,644 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-10-01 ===
2025-10-25 13:54:28,645 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:28,651 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-10-01
2025-10-25 13:54:28,652 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:28,653 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:29,023 [INFO] 📅 features_attributes: 487 records found for 2023-10-01
2025-10-25 13:54:29,967 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_10_01.parquet (487 rows)
2025-10-25 13:54:29,968 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:30,434 [INFO] 📅 features_financials: 487 records found for 2023-10-01
2025-10-25 13:54:31,467 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:33,990 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_10_01.parquet (8974 rows)
2025-10-25 13:54:33,991 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:34,449 [INFO] 📅 lms_loan_daily: 4978 records found for 2023-10-01


                                                                                

2025-10-25 13:54:35,784 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_10_01.parquet (4978 rows)
2025-10-25 13:54:35,785 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-11-01 ===
2025-10-25 13:54:35,786 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:35,793 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-11-01
2025-10-25 13:54:35,794 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:35,795 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:36,114 [INFO] 📅 features_attributes: 491 records found for 2023-11-01
2025-10-25 13:54:36,999 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_11_01.parquet (491 rows)
2025-10-25 13:54:37,001 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:37,503 [INFO] 📅 features_financials: 491 records found for 2023-11-01
2025-10-25 13:54:38,458 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:40,507 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_11_01.parquet (8974 rows)
2025-10-25 13:54:40,508 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:40,951 [INFO] 📅 lms_loan_daily: 5469 records found for 2023-11-01


                                                                                

2025-10-25 13:54:42,068 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_11_01.parquet (5469 rows)
2025-10-25 13:54:42,069 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2023-12-01 ===
2025-10-25 13:54:42,069 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:42,075 [INFO] 🚀 Starting bronze processing for snapshot_date=2023-12-01
2025-10-25 13:54:42,076 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:42,076 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:41,940 [INFO] 📅 features_attributes: 489 records found for 2023-12-01
2025-10-25 13:54:42,086 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2023_12_01.parquet (489 rows)
2025-10-25 13:54:42,087 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:42,403 [INFO] 📅 features_financials: 489 records found for 2023-12-01
2025-10-25 13:54:43,220 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:45,843 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2023_12_01.parquet (8974 rows)
2025-10-25 13:54:45,844 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:46,361 [INFO] 📅 lms_loan_daily: 5428 records found for 2023-12-01


                                                                                

2025-10-25 13:54:47,683 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2023_12_01.parquet (5428 rows)
2025-10-25 13:54:47,684 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-01-01 ===
2025-10-25 13:54:47,685 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:47,690 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-01-01
2025-10-25 13:54:47,691 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:47,692 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:48,007 [INFO] 📅 features_attributes: 485 records found for 2024-01-01
2025-10-25 13:54:48,904 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_01_01.parquet (485 rows)
2025-10-25 13:54:48,905 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:49,247 [INFO] 📅 features_financials: 485 records found for 2024-01-01
2025-10-25 13:54:50,275 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:52,710 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_01_01.parquet (8974 rows)
2025-10-25 13:54:52,712 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:53,302 [INFO] 📅 lms_loan_daily: 5412 records found for 2024-01-01


                                                                                

2025-10-25 13:54:54,533 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_01_01.parquet (5412 rows)
2025-10-25 13:54:54,534 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-02-01 ===
2025-10-25 13:54:54,534 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:54:54,540 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-02-01
2025-10-25 13:54:54,541 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:54:54,542 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:54:54,798 [INFO] 📅 features_attributes: 518 records found for 2024-02-01
2025-10-25 13:54:55,550 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_02_01.parquet (518 rows)
2025-10-25 13:54:55,551 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:54:55,879 [INFO] 📅 features_financials: 518 records found for 2024-02-01
2025-10-25 13:54:56,675 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:54:58,685 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_02_01.parquet (8974 rows)
2025-10-25 13:54:58,686 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:54:59,138 [INFO] 📅 lms_loan_daily: 5424 records found for 2024-02-01


                                                                                

2025-10-25 13:55:00,285 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_02_01.parquet (5424 rows)
2025-10-25 13:55:00,285 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-03-01 ===
2025-10-25 13:55:00,286 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:00,291 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-03-01
2025-10-25 13:55:00,292 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:00,292 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:00,524 [INFO] 📅 features_attributes: 511 records found for 2024-03-01
2025-10-25 13:55:01,326 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_03_01.parquet (511 rows)
2025-10-25 13:55:01,327 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:01,677 [INFO] 📅 features_financials: 511 records found for 2024-03-01
2025-10-25 13:55:02,525 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:04,721 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_03_01.parquet (8974 rows)
2025-10-25 13:55:04,721 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:05,164 [INFO] 📅 lms_loan_daily: 5425 records found for 2024-03-01
2025-10-25 13:55:06,257 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_03_01.parquet (5425 rows)
2025-10-25 13:55:06,259 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-04-01 ===
2025-10-25 13:55:06,260 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:06,266 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-04-01
2025-10-25 13:55:06,267 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:06,268 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:06,519 [INFO] 📅 features_attributes: 513 records found for 2024-04-01
2025-10-25 13:55:07,228 [INFO] ✅ Saved Bronze file: d

                                                                                

2025-10-25 13:55:10,332 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_04_01.parquet (8974 rows)
2025-10-25 13:55:10,333 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:10,792 [INFO] 📅 lms_loan_daily: 5417 records found for 2024-04-01
2025-10-25 13:55:11,008 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_04_01.parquet (5417 rows)
2025-10-25 13:55:11,009 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-05-01 ===
2025-10-25 13:55:11,010 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:11,015 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-05-01
2025-10-25 13:55:11,015 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:11,016 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:11,293 [INFO] 📅 features_attributes: 491 records found for 2024-05-01
2025-10-25 13:55:12,113 [INFO] ✅ Saved Bronze file: d

                                                                                

2025-10-25 13:55:15,380 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_05_01.parquet (8974 rows)
2025-10-25 13:55:15,381 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:15,828 [INFO] 📅 lms_loan_daily: 5391 records found for 2024-05-01
2025-10-25 13:55:16,892 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_05_01.parquet (5391 rows)
2025-10-25 13:55:16,894 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-06-01 ===
2025-10-25 13:55:16,895 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:16,901 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-06-01
2025-10-25 13:55:16,901 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:16,902 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:17,139 [INFO] 📅 features_attributes: 498 records found for 2024-06-01
2025-10-25 13:55:17,902 [INFO] ✅ Saved Bronze file: d

                                                                                

2025-10-25 13:55:21,046 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_06_01.parquet (8974 rows)
2025-10-25 13:55:21,047 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:21,552 [INFO] 📅 lms_loan_daily: 5418 records found for 2024-06-01


                                                                                

2025-10-25 13:55:22,599 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_06_01.parquet (5418 rows)
2025-10-25 13:55:22,600 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-07-01 ===
2025-10-25 13:55:22,601 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:22,606 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-07-01
2025-10-25 13:55:22,607 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:22,607 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:22,859 [INFO] 📅 features_attributes: 505 records found for 2024-07-01
2025-10-25 13:55:23,592 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_07_01.parquet (505 rows)
2025-10-25 13:55:23,593 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:23,909 [INFO] 📅 features_financials: 505 records found for 2024-07-01
2025-10-25 13:55:24,751 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:26,721 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_07_01.parquet (8974 rows)
2025-10-25 13:55:26,723 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:27,141 [INFO] 📅 lms_loan_daily: 5442 records found for 2024-07-01
2025-10-25 13:55:28,168 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_07_01.parquet (5442 rows)
2025-10-25 13:55:28,169 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-08-01 ===
2025-10-25 13:55:28,170 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:28,176 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-08-01
2025-10-25 13:55:28,177 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:28,178 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:28,429 [INFO] 📅 features_attributes: 543 records found for 2024-08-01
2025-10-25 13:55:29,174 [INFO] ✅ Saved Bronze file: d

                                                                                

2025-10-25 13:55:32,353 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_08_01.parquet (8974 rows)
2025-10-25 13:55:32,354 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:32,790 [INFO] 📅 lms_loan_daily: 5531 records found for 2024-08-01


                                                                                

2025-10-25 13:55:33,837 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_08_01.parquet (5531 rows)
2025-10-25 13:55:33,838 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-09-01 ===
2025-10-25 13:55:33,838 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:33,843 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-09-01
2025-10-25 13:55:33,844 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:33,846 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:34,123 [INFO] 📅 features_attributes: 493 records found for 2024-09-01
2025-10-25 13:55:34,913 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_09_01.parquet (493 rows)
2025-10-25 13:55:34,914 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:35,321 [INFO] 📅 features_financials: 493 records found for 2024-09-01
2025-10-25 13:55:36,104 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:38,029 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_09_01.parquet (8974 rows)
2025-10-25 13:55:38,030 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:38,461 [INFO] 📅 lms_loan_daily: 5537 records found for 2024-09-01


                                                                                

2025-10-25 13:55:39,582 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_09_01.parquet (5537 rows)
2025-10-25 13:55:39,583 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-10-01 ===
2025-10-25 13:55:39,584 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:39,590 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-10-01
2025-10-25 13:55:39,591 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:39,591 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:39,842 [INFO] 📅 features_attributes: 456 records found for 2024-10-01
2025-10-25 13:55:39,599 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_10_01.parquet (456 rows)
2025-10-25 13:55:39,601 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:39,923 [INFO] 📅 features_financials: 456 records found for 2024-10-01
2025-10-25 13:55:40,767 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:43,002 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_10_01.parquet (8974 rows)
2025-10-25 13:55:43,004 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:43,490 [INFO] 📅 lms_loan_daily: 5502 records found for 2024-10-01


                                                                                

2025-10-25 13:55:44,641 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_10_01.parquet (5502 rows)
2025-10-25 13:55:44,642 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-11-01 ===
2025-10-25 13:55:44,642 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:44,648 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-11-01
2025-10-25 13:55:44,648 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:44,649 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:44,855 [INFO] 📅 features_attributes: 488 records found for 2024-11-01
2025-10-25 13:55:45,657 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_11_01.parquet (488 rows)
2025-10-25 13:55:45,658 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:45,951 [INFO] 📅 features_financials: 488 records found for 2024-11-01
2025-10-25 13:55:46,745 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:48,552 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_11_01.parquet (8974 rows)
2025-10-25 13:55:48,553 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:48,980 [INFO] 📅 lms_loan_daily: 5501 records found for 2024-11-01


                                                                                

2025-10-25 13:55:50,431 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_11_01.parquet (5501 rows)
2025-10-25 13:55:50,432 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2024-12-01 ===
2025-10-25 13:55:50,432 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:50,439 [INFO] 🚀 Starting bronze processing for snapshot_date=2024-12-01
2025-10-25 13:55:50,439 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:50,440 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:50,754 [INFO] 📅 features_attributes: 515 records found for 2024-12-01
2025-10-25 13:55:51,669 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2024_12_01.parquet (515 rows)
2025-10-25 13:55:51,670 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:52,025 [INFO] 📅 features_financials: 515 records found for 2024-12-01
2025-10-25 13:55:52,940 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:55:55,015 [INFO] ✅ Saved Bronze file: datamart/bronze/feature_clickstream/2024_12_01.parquet (8974 rows)
2025-10-25 13:55:55,016 [INFO] 📂 Processing file: data/lms_loan_daily.csv
2025-10-25 13:55:55,491 [INFO] 📅 lms_loan_daily: 5531 records found for 2024-12-01


                                                                                

2025-10-25 13:55:56,694 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2024_12_01.parquet (5531 rows)
2025-10-25 13:55:56,695 [INFO] 🎉 Bronze layer processing completed successfully.

=== Processing snapshot_date: 2025-01-01 ===
2025-10-25 13:55:56,696 [INFO] 🔍 Resolving config path: /app/config/bronze_config.yaml
2025-10-25 13:55:56,702 [INFO] 🚀 Starting bronze processing for snapshot_date=2025-01-01
2025-10-25 13:55:56,702 [INFO] Using configuration from: /app/config/bronze_config.yaml
2025-10-25 13:55:56,703 [INFO] 📂 Processing file: data/features_attributes.csv
2025-10-25 13:55:56,948 [INFO] 📅 features_attributes: 526 records found for 2025-01-01
2025-10-25 13:55:57,736 [INFO] ✅ Saved Bronze file: datamart/bronze/features_attributes/2025_01_01.parquet (526 rows)
2025-10-25 13:55:57,737 [INFO] 📂 Processing file: data/features_financials.csv
2025-10-25 13:55:58,052 [INFO] 📅 features_financials: 526 records found for 2025-01-01
2025-10-25 13:55:58,935 [INFO] ✅ Saved Bronze

                                                                                

2025-10-25 13:56:00,941 [INFO] ✅ Saved Bronze file: datamart/bronze/lms_loan_daily/2025_01_01.parquet (5539 rows)
2025-10-25 13:56:00,942 [INFO] 🎉 Bronze layer processing completed successfully.


In [3]:
# -------------------------------------------------------------------------
# Run silver processing for each snapshot_date between 2023-01-01 and 2025-01-01
# -------------------------------------------------------------------------
# One call to process_silver_tables per month-start snapshot; both endpoints
# are included because the loop condition is `<=`.

# Imported once, ahead of the loop, instead of being re-executed on every
# iteration.
from dateutil.relativedelta import relativedelta

start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 1, 1)

current_date = start_date

while current_date <= end_date:
    snapshot_str = current_date.strftime("%Y-%m-%d")
    print(f"\n=== Processing snapshot_date: {snapshot_str} ===")

    # Best-effort backfill: a failure for one snapshot is printed and the
    # loop continues with the next month rather than aborting the whole run.
    try:
        process_silver_tables(snapshot_date=snapshot_str, spark=spark)
    except Exception as e:
        print(f"⚠️ Failed to process {snapshot_str}: {e}")

    # Increment by 1 month.
    # relativedelta steps by calendar month (not a fixed number of days), so
    # every snapshot stays on the 1st of the month.
    current_date += relativedelta(months=1)


=== Processing snapshot_date: 2023-01-01 ===
2025-10-25 22:04:33,914 [INFO] 🔍 Resolving config path: /app/config/silver_config.yaml
2025-10-25 22:04:33,927 [INFO] 🚀 Starting silver layer processing for snapshot_date=2023-01-01
2025-10-25 22:04:33,929 [INFO] Using configuration from: /app/config/silver_config.yaml
2025-10-25 22:04:33,930 [INFO] 📂 Processing Silver table: features_attributes...


                                                                                

2025-10-25 22:04:39,400 [INFO] ✅ Saved Silver file: /app/datamart/silver/features_attributes/2023_01_01.parquet
2025-10-25 22:04:39,401 [INFO] 📂 Processing Silver table: features_financials...


                                                                                

2025-10-25 22:04:42,028 [INFO] ✅ Saved Silver file: /app/datamart/silver/features_financials/2023_01_01.parquet
2025-10-25 22:04:42,029 [INFO] 📂 Processing Silver table: feature_clickstream...
2025-10-25 22:04:43,497 [INFO] ✅ Saved Silver file: /app/datamart/silver/feature_clickstream/2023_01_01.parquet
2025-10-25 22:04:43,497 [INFO] 📂 Processing Silver table: lms_loan_daily...
2025-10-25 22:04:44,842 [INFO] ✅ Saved Silver file: /app/datamart/silver/lms_loan_daily/2023_01_01.parquet
2025-10-25 22:04:44,844 [INFO] 🎉 Silver layer processing completed successfully.

=== Processing snapshot_date: 2023-02-01 ===
2025-10-25 22:04:44,845 [INFO] 🔍 Resolving config path: /app/config/silver_config.yaml
2025-10-25 22:04:44,855 [INFO] 🚀 Starting silver layer processing for snapshot_date=2023-02-01
2025-10-25 22:04:44,856 [INFO] Using configuration from: /app/config/silver_config.yaml
2025-10-25 22:04:44,857 [INFO] 📂 Processing Silver table: features_attributes...
2025-10-25 22:04:46,195 [INFO] ✅ S

                                                                                

2025-10-25 22:05:07,391 [INFO] ✅ Saved Silver file: /app/datamart/silver/features_financials/2023_06_01.parquet
2025-10-25 22:05:07,392 [INFO] 📂 Processing Silver table: feature_clickstream...
2025-10-25 22:05:08,703 [INFO] ✅ Saved Silver file: /app/datamart/silver/feature_clickstream/2023_06_01.parquet
2025-10-25 22:05:08,704 [INFO] 📂 Processing Silver table: lms_loan_daily...
2025-10-25 22:05:09,959 [INFO] ✅ Saved Silver file: /app/datamart/silver/lms_loan_daily/2023_06_01.parquet
2025-10-25 22:05:09,960 [INFO] 🎉 Silver layer processing completed successfully.

=== Processing snapshot_date: 2023-07-01 ===
2025-10-25 22:05:09,962 [INFO] 🔍 Resolving config path: /app/config/silver_config.yaml
2025-10-25 22:05:09,972 [INFO] 🚀 Starting silver layer processing for snapshot_date=2023-07-01
2025-10-25 22:05:09,972 [INFO] Using configuration from: /app/config/silver_config.yaml
2025-10-25 22:05:09,974 [INFO] 📂 Processing Silver table: features_attributes...
2025-10-25 22:05:11,127 [INFO] ✅ S

In [3]:
# -------------------------------------------------------------------------
# Run gold processing for each snapshot_date between 2023-01-01 and 2025-01-01
# -------------------------------------------------------------------------
# One call to process_gold_tables per month-start snapshot; both endpoints
# are included because the loop condition is `<=`.
# NOTE: the loop starts at 2023-01-01. In the recorded run, the first three
# snapshots log that feature_store generation is skipped for insufficient
# L3M clickstream data, so gold output first appears for 2023-04-01.

# Imported once, ahead of the loop, instead of being re-executed on every
# iteration.
from dateutil.relativedelta import relativedelta

start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 1, 1)

current_date = start_date

while current_date <= end_date:
    snapshot_str = current_date.strftime("%Y-%m-%d")
    print(f"\n=== Processing snapshot_date: {snapshot_str} ===")

    # Best-effort backfill: a failure for one snapshot is printed and the
    # loop continues with the next month rather than aborting the whole run.
    try:
        process_gold_tables(snapshot_date=snapshot_str, spark=spark)
    except Exception as e:
        print(f"⚠️ Failed to process {snapshot_str}: {e}")

    # Increment by 1 month.
    # relativedelta steps by calendar month (not a fixed number of days), so
    # every snapshot stays on the 1st of the month.
    current_date += relativedelta(months=1)


=== Processing snapshot_date: 2023-01-01 ===
2025-10-26 10:52:03,021 [INFO] 🚀 Building Gold tables for 2023-01-01
2025-10-26 10:52:05,666 [INFO] ⏭️ Skipping feature_store generation for 2023-01-01: insufficient L3M clickstream data.

=== Processing snapshot_date: 2023-02-01 ===
2025-10-26 10:52:05,672 [INFO] 🚀 Building Gold tables for 2023-02-01
2025-10-26 10:52:06,317 [INFO] ⏭️ Skipping feature_store generation for 2023-02-01: insufficient L3M clickstream data.

=== Processing snapshot_date: 2023-03-01 ===
2025-10-26 10:52:06,322 [INFO] 🚀 Building Gold tables for 2023-03-01
2025-10-26 10:52:06,951 [INFO] ⏭️ Skipping feature_store generation for 2023-03-01: insufficient L3M clickstream data.

=== Processing snapshot_date: 2023-04-01 ===
2025-10-26 10:52:06,958 [INFO] 🚀 Building Gold tables for 2023-04-01


                                                                                

2025-10-26 10:52:14,267 [INFO] ✅ Feature store saved: /app/datamart/gold/feature_store/2023_04_01.parquet
2025-10-26 10:52:14,269 [INFO] ✅ Label store saved:   /app/datamart/gold/label_store/2023_04_01.parquet
2025-10-26 10:52:14,270 [INFO] 🎉 Completed Gold ETL for 2023-04-01

=== Processing snapshot_date: 2023-05-01 ===
2025-10-26 10:52:14,274 [INFO] 🚀 Building Gold tables for 2023-05-01
2025-10-26 10:52:18,110 [INFO] ✅ Feature store saved: /app/datamart/gold/feature_store/2023_05_01.parquet
2025-10-26 10:52:18,111 [INFO] ✅ Label store saved:   /app/datamart/gold/label_store/2023_05_01.parquet
2025-10-26 10:52:18,111 [INFO] 🎉 Completed Gold ETL for 2023-05-01

=== Processing snapshot_date: 2023-06-01 ===
2025-10-26 10:52:18,117 [INFO] 🚀 Building Gold tables for 2023-06-01
2025-10-26 10:52:21,577 [INFO] ✅ Feature store saved: /app/datamart/gold/feature_store/2023_06_01.parquet
2025-10-26 10:52:21,578 [INFO] ✅ Label store saved:   /app/datamart/gold/label_store/2023_06_01.parquet
2025-1

# Model Promotion

In [1]:
import sys
import os

# Ensure Python can find your project modules.
# The current directory is treated as the project root. It is added only if
# missing, so re-running this cell does not pile up duplicate sys.path entries.
project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path tweak above. Other cells in this notebook bind
# the name `main` to different modules, so it is re-imported here to make sure
# this cell calls the promotion entry point.
from utils.promote_best_model import main

# Snapshot date for which model promotion is evaluated
snapshot_date = "2024-06-01"

# Run the model promotion entry point
main(snapshot_date=snapshot_date)

2025-10-28 13:14:40,892 [INFO] 🚀 Initial deployment date reached (2024-06-01). Selecting best candidate for first deployment.
2025-10-28 13:14:41,518 [INFO] 🚀 Promoted 2024-01-01 (OOT AUC=0.825) → deployed_model/
2025-10-28 13:14:41,520 [INFO] ✅ Initial model deployed from 2024-01-01 (OOT AUC = 0.825).


# Model Training Pipeline

In [3]:
import sys
import os

# Ensure Python can find your project modules.
# The current directory is treated as the project root. It is added only if
# missing, so re-running this cell does not pile up duplicate sys.path entries.
project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path tweak above. Other cells in this notebook bind
# the name `main` to different modules, so it is re-imported here to make sure
# this cell calls the training entry point.
from utils.model_train import main

# Optionally, pick a snapshot_date manually for the run
snapshot_date = "2024-04-01"

# Call the training entry point with the ML config file
main(snapshot_date=snapshot_date, config_path="config/ML_config.yaml")

2025-10-26 23:32:00,443 [INFO] 📂 Found 9 aligned feature/label pairs.
2025-10-26 23:32:01,699 [INFO] ✅ Loaded 4421 rows (2023-04-01 → 2023-12-01)
2025-10-26 23:32:01,710 [INFO] 📊 Split summary: train=2954, val=487, test=491, oot=489
2025-10-26 23:32:06,547 [INFO] 🏆 Best params={'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}, CV AUC=0.7837
2025-10-26 23:32:06,731 [INFO] 📊 Metrics: {
  "train": {
    "auc": 0.900134769830418,
    "gini": 0.8002695396608359,
    "ks": 0.639266087248076,
    "f1": 0.5808893093661306,
    "accuracy": 0.8500338524035207,
    "precision": 0.7892030848329049,
    "recall": 0.4595808383233533
  },
  "val": {
    "auc": 0.7881562881562882,
    "gini": 0.5763125763125765,
    "ks": 0.5377955377955378,
    "f1": 0.34532374100719426,
    "accuracy": 0.813141683778234,
    "precision": 0.5,
    "recall": 0.26373626373626374
  },
  "test": {
   

In [4]:
import sys
import os

# Ensure Python can find your project modules.
# The current directory is treated as the project root. It is added only if
# missing, so re-running this cell does not pile up duplicate sys.path entries.
project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path tweak above. Other cells in this notebook bind
# the name `main` to different modules, so it is re-imported here to make sure
# this cell calls the training entry point.
from utils.model_train import main

# Optionally, pick a snapshot_date manually for the run
snapshot_date = "2024-05-01"

# Call the training entry point with the ML config file
main(snapshot_date=snapshot_date, config_path="config/ML_config.yaml")

2025-10-26 23:32:07,008 [INFO] 📂 Found 10 aligned feature/label pairs.
2025-10-26 23:32:08,246 [INFO] ✅ Loaded 4396 rows (2023-04-01 → 2024-01-01)
2025-10-26 23:32:08,257 [INFO] 📊 Split summary: train=2931, val=491, test=489, oot=485
2025-10-26 23:32:09,023 [INFO] 🏆 Best params={'subsample': 0.6, 'reg_lambda': 2, 'reg_alpha': 1, 'n_estimators': 50, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}, CV AUC=0.8018
2025-10-26 23:32:09,080 [INFO] 📊 Metrics: {
  "train": {
    "auc": 0.8910177894429864,
    "gini": 0.7820355788859727,
    "ks": 0.6243341675313842,
    "f1": 0.5805207328833173,
    "accuracy": 0.8515864892528148,
    "precision": 0.7678571428571429,
    "recall": 0.4666666666666667
  },
  "val": {
    "auc": 0.7769559032716927,
    "gini": 0.5539118065433855,
    "ks": 0.4845898530109056,
    "f1": 0.37037037037037035,
    "accuracy": 0.7922606924643585,
    "precision": 0.5882352941176471,
    "recall": 0.2702702702702703
  },

In [4]:
import sys
import os

# Ensure Python can find your project modules.
# The current directory is treated as the project root. It is added only if
# missing, so re-running this cell does not pile up duplicate sys.path entries.
project_root = os.path.abspath(".")
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path tweak above. Other cells in this notebook bind
# the name `main` to different modules, so it is re-imported here to make sure
# this cell calls the training entry point.
from utils.model_train import main

# Optionally, pick a snapshot_date manually for the run
snapshot_date = "2024-06-01"

# Call the training entry point with the ML config file
main(snapshot_date=snapshot_date, config_path="config/ML_config.yaml")

2025-10-28 19:21:27,325 [INFO] 📂 Found 11 aligned feature/label pairs.
2025-10-28 19:21:28,712 [INFO] ✅ Loaded 4393 rows (2023-04-01 → 2024-02-01)
2025-10-28 19:21:28,725 [INFO] 📊 Split summary: train=2901, val=489, test=485, oot=518
2025-10-28 19:21:30,544 [INFO] 🏆 Best params={'subsample': 0.6, 'reg_lambda': 1.5, 'reg_alpha': 1, 'n_estimators': 50, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}, CV AUC=0.7889
2025-10-28 19:21:30,619 [INFO] 📊 Metrics: {
  "train": {
    "auc": 0.8857288505146855,
    "gini": 0.7714577010293711,
    "ks": 0.6058738296460575,
    "f1": 0.5489795918367347,
    "accuracy": 0.8476387452602551,
    "precision": 0.7865497076023392,
    "recall": 0.4216300940438871
  },
  "val": {
    "auc": 0.7696641626159966,
    "gini": 0.5393283252319931,
    "ks": 0.4524524966858153,
    "f1": 0.4175824175824176,
    "accuracy": 0.7832310838445807,
    "precision": 0.6551724137931034,
    "recall": 0.3064516129032258
 

# Model Inference Pipeline

In [3]:
import os
import sys

# Put the current working directory (used as the project root) on the
# import path so the project's `utils` package can be imported.
sys.path.append(os.path.abspath("."))

from utils.model_inference import main

# Snapshot date whose records should be scored.
snapshot_date = "2024-06-01"

# Score that snapshot via the inference entry point.
main(
    snapshot_date=snapshot_date,
)

2025-10-27 19:55:51,180 [INFO] ✅ Loaded deployed model from /app/model_store/deployed_model
2025-10-27 19:55:51,182 [INFO] Model version: credit_model_2024-01-01 | trained: 2025-10-26 23:32:09
2025-10-27 19:55:51,293 [INFO] 📦 Loaded feature data: 498 records to score.
2025-10-27 19:55:51,296 [INFO] Categorical imputer trained on: ['Occupation', 'Credit_Mix', 'Payment_Behaviour', 'Payment_of_Min_Amount']
2025-10-27 19:55:51,296 [INFO] Categorical columns in current batch: ['Occupation', 'Credit_Mix', 'Payment_Behaviour', 'Payment_of_Min_Amount']
2025-10-27 19:55:51,312 [INFO] 🔧 Preprocessed features with 117 columns.
2025-10-27 19:55:51,356 [INFO] 💾 Saved predictions to /app/datamart/gold/model_predictions/2024_06_01.parquet
2025-10-27 19:55:51,357 [INFO] 🎉 Inference completed successfully for snapshot=2024-06-01




In [4]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from scipy.stats import ks_2samp

# Ad-hoc evaluation: join one month of stored predictions with a later
# label snapshot on Customer_ID and print classification metrics.

# Paths (prediction snapshot vs. the label snapshot used as ground truth)
pred_path  = "datamart/gold/model_predictions/2024_06_01.parquet"
label_path = "datamart/gold/label_store/2024_10_01.parquet"

# Probability cut-off used to turn scores into hard 0/1 predictions.
decision_threshold = 0.5

# Load
preds = pd.read_parquet(pred_path)
labels = pd.read_parquet(label_path)

# Sanity check columns
print("Predictions columns:", preds.columns.tolist())
print("Labels columns:", labels.columns.tolist())

# Merge on the customer identifier. Any other column present in both frames
# (e.g. snapshot_date) is kept twice with pandas' default _x/_y suffixes.
df_eval = preds.merge(labels, on="Customer_ID", how="inner")
print(f"Merged records: {len(df_eval)}")

# Fail early with a clear message instead of a cryptic metric error when the
# two snapshots share no customers.
if df_eval.empty:
    raise ValueError(
        f"No overlapping Customer_ID between {pred_path} and {label_path}; nothing to evaluate."
    )

# Ground truth column — adjust if the label store uses a different column name
y_true = df_eval["label"]
y_prob = df_eval["prediction_score"]
y_pred = (y_prob > decision_threshold).astype(int)

# --- Compute metrics ---
auc = roc_auc_score(y_true, y_prob)
gini = 2 * auc - 1
# KS: maximum distance between the score distributions of the two classes
ks = ks_2samp(y_prob[y_true == 1], y_prob[y_true == 0]).statistic
# zero_division=0 is passed to all three threshold metrics so an empty
# denominator is handled the same way everywhere (score 0, no warning).
f1 = f1_score(y_true, y_pred, zero_division=0)
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)

print("\n📊 Evaluation Metrics")
print(f"AUC       : {auc:.4f}")
print(f"Gini      : {gini:.4f}")
print(f"KS        : {ks:.4f}")
print(f"F1 Score  : {f1:.4f}")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")

Predictions columns: ['Customer_ID', 'prediction_score', 'model_version', 'snapshot_date']
Labels columns: ['Customer_ID', 'loan_start_date', 'label', 'label_def', 'snapshot_date']
Merged records: 498

📊 Evaluation Metrics
AUC       : 0.8103
Gini      : 0.6206
KS        : 0.5295
F1 Score  : 0.4468
Accuracy  : 0.7912
Precision : 0.6667
Recall    : 0.3360


# Model Monitoring Pipeline

In [3]:
import os
import sys

# Put the current working directory (used as the project root) on the
# import path so the project's `utils` package can be imported.
sys.path.append(os.path.abspath("."))

from utils.model_monitoring import main

# Snapshot date to monitor. In the logged run below this is reported as the
# label snapshot, with predictions taken from an earlier inference month.
snapshot_date = "2024-10-01"

# Run the monitoring entry point for that snapshot.
main(
    snapshot_date=snapshot_date,
)

2025-10-28 19:37:18,477 [INFO] 🧮 Monitoring alignment check:
2025-10-28 19:37:18,479 [INFO]   → Label snapshot (ground truth available): 2024-10-01
2025-10-28 19:37:18,480 [INFO]   → Prediction snapshot (inference month): 2024-06-01
2025-10-28 19:37:18,480 [INFO]   → Evaluating model performance for predictions made in 2024-06-01 using labels from 2024-10-01
2025-10-28 19:37:18,594 [INFO] 📊 Performance metrics:
2025-10-28 19:37:18,595 [INFO]   auc: 0.8103
2025-10-28 19:37:18,595 [INFO]   gini: 0.6206
2025-10-28 19:37:18,595 [INFO]   ks: 0.5295
2025-10-28 19:37:18,596 [INFO]   f1: 0.4468
2025-10-28 19:37:18,597 [INFO]   accuracy: 0.7912
2025-10-28 19:37:18,597 [INFO]   precision: 0.6667
2025-10-28 19:37:18,599 [INFO]   recall: 0.3360
2025-10-28 19:37:18,607 [INFO] 📘 PSI reference snapshot (from deployed model): 2024_01_01
2025-10-28 19:37:18,609 [INFO] 🧮 PSI alignment check:
2025-10-28 19:37:18,609 [INFO]   → Baseline (training end): 2024_01_01
2025-10-28 19:37:18,610 [INFO]   → Current

  fig.tight_layout()
  plt.savefig(out_path, dpi=200)
