# Initialisation

In [1]:
import os
from pathlib import Path
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col, sum as spark_sum, isnan, when
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

In [2]:
# Build (or reuse) a local SparkSession named "dev" that runs on all local cores.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Only surface ERROR-level Spark logs so the notebook output stays readable.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/21 13:48:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Set Up

In [3]:
# Root folders of the three datamart layers, relative to the working directory.
bronze_dir, silver_dir, gold_dir = map(
    Path, ("datamart/bronze", "datamart/silver", "datamart/gold")
)

In [5]:
# Bronze-layer feature CSVs to inspect, in the order they are reported below.
csv_files = [
    bronze_dir / filename
    for filename in (
        "features_credit_history.csv",
        "features_demographic.csv",
        "features_financial.csv",
        "features_loan_terms.csv",
    )
]

In [6]:
# For every bronze CSV: print the inferred schema, then preview `snapshot_date`.
for file in csv_files:
    df = spark.read.csv(str(file), header=True, inferSchema=True)
    print(f"\n=== {file.name} ===")
    df.printSchema()

    if "snapshot_date" not in df.columns:
        print("⚠️ 'snapshot_date' column not found.")
        continue

    snapshot_col = df.select("snapshot_date")
    snapshot_col.printSchema()  # confirm the inferred data type
    snapshot_col.show(5, truncate=False)  # preview the first 5 values

                                                                                


=== features_credit_history.csv ===
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- mort_acc: double (nullable = true)
 |-- num_tl_op_past_12m: double (nullable = true)
 |-- earliest_cr_line: string (nullable = true)
 |-- last_credit_pull_d: string (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- inq_last_12m: double (nullable = true)
 |-- inq_fi: double (nullable = true)
 |-- mths_since_last_delinq: double (nullable = true)
 |-- mths_since_last_record: double (nullable = true)
 |-- mths_since_last_major_derog: double (nullable = true)
 |-- mths_since_recent_bc_dlq: double (nullable = true)
 |-- mths_since_recent_inq: double (nullable = true)
 |-- mths_since_recent_revol_delinq: double (nullable = true)
 |-- mths_since_rcnt_il: double (nullable = true)
 |-- mths_since_recent_bc: double (nullable = true)
 |-- acc_now_delinq: double (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- pub_rec: double (nullabl

                                                                                


=== features_demographic.csv ===
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: string (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- sec_app_earliest_cr_line: string (nullable = true)
 |-- sec_app_inq_last_6mths: double (nullable = true)
 |-- sec_app_mort_acc: double (nullable = true)
 |-- sec_app_open_acc: double (nullable = true)
 |-- sec_app_revol_util: double (nullable = true)
 |-- sec_app_open_act_il: double (nullable = true)
 |-- sec_app_num_rev_accts: double (nullable = true)

root
 |-- snapshot_date: date (nullable = true)

+---------

                                                                                


=== features_financial.csv ===
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- dti: double (nullable = true)
 |-- dti_joint: double (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- revol_bal_joint: double (nullable = true)
 |-- revol_util: double (nullable = true)
 |-- total_rev_hi_lim: double (nullable = true)
 |-- tot_coll_amt: double (nullable = true)
 |-- tot_cur_bal: double (nullable = true)
 |-- avg_cur_bal: double (nullable = true)
 |-- all_util: double (nullable = true)
 |-- max_bal_bc: double (nullable = true)
 |-- il_util: double (nullable = true)
 |-- bc_util: double (nullable = true)
 |-- total_bal_il: double (nullable = true)
 |-- total_bal_ex_mort: double (nullable = true)
 |-- total_bc_limit: double (nullable = true)
 |-- total_il_high_credit_limit: double (nullable = true)
 |-- tot_hi_cred_lim: double (nullable = true)
 |-- open_acc: double (nullable = true)
 |-- total_acc: double (nullable = true)
 |-- open

[Stage 10:>                                                       (0 + 16) / 16]


=== features_loan_terms.csv ===
root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- funded_amnt: integer (nullable = true)
 |-- funded_amnt_inv: double (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- initial_list_status: string (nullable = true)
 |-- out_prncp: string (nullable = true)
 |-- out_prncp_inv: string (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- total_pymnt_inv: string (nullable = true)
 |-- total_rec_p

                                                                                

# Config

In [8]:
# Inclusive processing window, as "YYYY-MM-DD" strings; expanded into one
# entry per day by generate_date_list.
start_date_str = "2022-01-01"
end_date_str = "2022-01-30"

# Single reference snapshot date.
# NOTE(review): not referenced by any other cell visible in this notebook - confirm it is still needed.
snapshot_date_str = "2022-01-01"

In [9]:
def generate_date_list(start_date_str, end_date_str):
    """Return every calendar day from start to end, both endpoints included.

    Args:
        start_date_str: First day of the window, formatted as "YYYY-MM-DD".
        end_date_str: Last day of the window, formatted as "YYYY-MM-DD".

    Returns:
        A list of ``datetime.date`` objects in ascending order. The list is
        empty when the end date is earlier than the start date.

    Raises:
        ValueError: If either string does not match the "%Y-%m-%d" format.
    """
    first_day = datetime.strptime(start_date_str, "%Y-%m-%d").date()
    last_day = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # A negative span yields an empty range, hence an empty list.
    span_days = (last_day - first_day).days
    return [first_day + timedelta(days=offset) for offset in range(span_days + 1)]

In [10]:
# Expand the configured window into one datetime.date per day (both endpoints included).
date_list = generate_date_list(start_date_str, end_date_str)

In [11]:
# "YYYY-MM-DD" string form of every day in the window.
date_list_str = [day.isoformat() for day in date_list]

# Silver Level Processing

In [15]:
from utils.silver_processing import silver_processing

# Run the silver stage for every snapshot date in the window.
# data_window is the list of "YYYY-MM-DD" snapshot-date strings to process.
# Per the original author's note, silver_processing.py splits the data by month
# and backfills it accordingly.
# NOTE(review): the sampled silver paths in the schema check below are dated per
# day (e.g. demographic_2022-01-05), not per month - confirm the actual partitioning.
silver_processing(bronze_dir, silver_dir, data_window=date_list_str)



Silver-level processing complete.


                                                                                

## Silver Table Schemas

In [15]:
# Spot-check the silver layer: for each table, print the schema and a few
# member_id values from one randomly chosen parquet part file.
# Folders are derived from `silver_dir` so the check follows the configured datamart location.
folders = [
    silver_dir / table
    for table in ("demographic", "financial", "credit_history", "loan_terms")
]

for folder in folders:
    # rglob matches every partition on disk, not only the dates in the current
    # data window. Sorted so that a seeded `random` gives a repeatable pick.
    parquet_files = sorted(folder.rglob("*.parquet"))
    if not parquet_files:
        # random.choice raises IndexError on an empty sequence; report and move on.
        print(f"⚠️ No parquet files found under {folder}")
        continue

    sample_file = random.choice(parquet_files)
    print(f"📂 Sampled file: {sample_file}")
    df = spark.read.parquet(str(sample_file))
    df.printSchema()
    df.select("member_id").show(5, truncate=False)

📂 Sampled file: datamart/silver/demographic/demographic_2022-01-05/part-00000-2049cd1c-6a0a-4b0e-85a3-780ce44f683d-c000.snappy.parquet
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- application_type: string (nullable = true)

+-----------+
|member_id  |
+-----------+
|MEM_1000898|
|MEM_1001193|
|MEM_100162 |
|MEM_1002017|
|MEM_1002605|
+-----------+
only showing top 5 rows

📂 Sampled file: datamart/silver/financial/financial_2022-01-21/part-00000-ba8197c6-b9f8-4521-b14c-23dd55ddcc8f-c000.snappy.parquet
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- dti: double (nullable = true)
 |-- revol_bal: i

# Gold Level Processing

## Table Processing

In [14]:
from utils.gold_processing import gold_processing

# Run the gold stage for the same snapshot-date strings used by the silver stage.
# NOTE(review): presumably reads from silver_dir and writes to gold_dir; the
# implementation lives in utils/gold_processing.py and is not visible here.
gold_processing(silver_dir, gold_dir, data_window=date_list_str)

                                                                                

Gold-level processing complete.


### Gold Table Schemas

In [16]:
# Spot-check the gold layer: for each table, print the schema of one randomly
# chosen parquet part file.
# Folders are derived from `gold_dir` so the check follows the configured datamart location.
folders = [
    gold_dir / table
    for table in ("demographic", "credit_history", "financial", "loan_terms")
]

for folder in folders:
    # rglob matches every partition on disk, not only the dates in the current
    # data window, so a sampled file may come from a date outside `date_list_str`.
    # Sorted so that a seeded `random` gives a repeatable pick.
    parquet_files = sorted(folder.rglob("*.parquet"))
    if not parquet_files:
        # random.choice raises IndexError on an empty sequence; report and move on.
        print(f"⚠️ No parquet files found under {folder}")
        continue

    sample_file = random.choice(parquet_files)
    print(f"📂 Sampled file: {sample_file}")
    df = spark.read.parquet(str(sample_file))
    df.printSchema()

📂 Sampled file: datamart/gold/demographic/demographic_2022-01-22/part-00000-3daccbf5-ea6c-4253-a6a9-7da5882c4001-c000.snappy.parquet
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- home_ownership_ohe: vector (nullable = true)
 |-- verification_status_ohe: vector (nullable = true)
 |-- application_type_ohe: vector (nullable = true)

📂 Sampled file: datamart/gold/credit_history/credit_history_2022-02-24/part-00000-8d0dc212-7bc0-4ee5-a955-341e62360d31-c000.snappy.parquet
root
 |-- member_id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- mort_acc: double (nullable = true)
 |-- num_tl_op_past_12m: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- inq_last_12m: double (nullable = true)
 |-- inq_fi: double (nullable = true)
 |-- mths_since_last_delinq: double (nullable = true)
 |-- mths_since_recent_inq: double (nullable = true)
 |-- mths_since_rcnt_il: 

## Label Store

In [12]:
from utils.gold_label_store import create_gold_label_store

# Build the label store for the snapshot-date strings in the window.
# NOTE(review): presumably derives labels from silver_dir and writes under
# gold_dir; the implementation is in utils/gold_label_store.py, not visible here.
create_gold_label_store(silver_dir, gold_dir, data_window=date_list_str)

                                                                                

Gold label store written to: datamart/gold/label_store


In [13]:
# Read the label store back and show its schema.
# The path is built from `gold_dir` rather than hard-coded, so it follows the configured gold location.
label_df = spark.read.parquet(str(gold_dir / "label_store"))
label_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- snapshot_date: date (nullable = true)
 |-- grade: string (nullable = true)



## Feature Store

In [10]:
from utils.gold_feature_store import create_feature_store

# Build the feature store and keep the returned object in `df`.
# NOTE(review): this call passes `date_list` (datetime.date objects), whereas the
# silver, gold and label-store calls pass `date_list_str` (strings). Confirm which
# type create_feature_store expects; the helper is not visible in this notebook.
df = create_feature_store(silver_dir, gold_dir, data_window=date_list)

                                                                                

['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'term_0', 'term_1', 'term_2', 'loan_status_0', 'loan_status_1', 'loan_status_2', 'loan_status_3', 'loan_status_4', 'loan_status_5', 'loan_status_6', 'loan_status_7', 'loan_status_8', 'pymnt_plan_0', 'pymnt_plan_1', 'pymnt_plan_2', 'purpose_0', 'purpose_1', 'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'purpose_14', 'purpose_15', 'purpose_16', 'purpose_17', 'purpose_18', 'purpose_19', 'purpose_20', 'purpose_21', 'purpose_22', 'purpose_23', 'purpose_24', 'purpose_25', 'purpose_26', 'purpose_27', 'initial_list_status_0', 'initial_list_status_1', 'initial_list_status_2', 'disbursement_method_0', 'disbursement_method_1', 'disbursement_method_2', 'debt_settlement_flag_0', 'debt_settlement_flag_1', 'debt_settlement_flag_2', 'emp_length', 'annual_inc', 'emp_title_ohe_0', 'emp_title_ohe_1', 'emp_title_ohe_2',

                                                                                

Gold label store written to: datamart/gold/feature_store


In [13]:
# Read the feature store back and show its schema.
# The path is built from `gold_dir` rather than hard-coded, so it follows the configured gold location.
features_df = spark.read.parquet(str(gold_dir / "feature_store"))
features_df.printSchema()

root
 |-- loan_amnt: float (nullable = true)
 |-- funded_amnt: float (nullable = true)
 |-- funded_amnt_inv: float (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- term_0: double (nullable = true)
 |-- term_1: double (nullable = true)
 |-- term_2: double (nullable = true)
 |-- loan_status_0: double (nullable = true)
 |-- loan_status_1: double (nullable = true)
 |-- loan_status_2: double (nullable = true)
 |-- loan_status_3: double (nullable = true)
 |-- loan_status_4: double (nullable = true)
 |-- loan_status_5: double (nullable = true)
 |-- loan_status_6: double (nullable = true)
 |-- loan_status_7: double (nullable = true)
 |-- loan_status_8: double (nullable = true)
 |-- pymnt_plan_0: double (nullable = true)
 |-- pymnt_plan_1: double (nullable = true)
 |-- pymnt_plan_2: double (nullable = true)
 |-- purpose_0: double (nullable = true)
 |-- purpose_1: double (nullable = true)
 |-- purpose_2: double (nullable = true)
 |-- purpose