#Label

In [1]:
# ─── Jupyter Cell: Load a Silver Loan-Daily Parquet by Date ──────────────────
import os, glob
from pyspark.sql import SparkSession

# 1) Spark session
spark = SparkSession.builder.appName("PreviewSilverLoanDaily").getOrCreate()

# 2) Base directory that holds silver loan-daily parquet folders
loan_silver_base = "/app/datamart/silver/loan_daily"

# 3) Pick the date you want, e.g. "2023_02_01"
target_date = "2023_02_01"           # <— change as needed

folder_name = f"silver_loan_daily_{target_date}.parquet"
parquet_dir = os.path.join(loan_silver_base, folder_name)

# 4) Find the actual parquet file inside Spark’s output folder
parquet_files = glob.glob(os.path.join(parquet_dir, "*.parquet"))
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in {parquet_dir}")

sample_file = parquet_files[0]
print(f"Loading: {sample_file}")

# 5) Read into Spark DataFrame and show head
df_loan = spark.read.parquet(sample_file)
df_loan.show(10, truncate=False)

# (Optional) pandas preview
# import pandas as pd
# pd.set_option("display.max_columns", None)
# display(df_loan.limit(5).toPandas())


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 14:55:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Loading: /app/datamart/silver/loan_daily/silver_loan_daily_2023_02_01.parquet/part-00000-5c80abdf-24ac-41a9-a55f-9f8a26a9f2fb-c000.snappy.parquet
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+
|loan_id              |Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|mob|installments_missed|first_missed_date|dpd|
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+---+-------------------+-----------------+---+
|CUS_0x1037_2023_01_01|CUS_0x1037 |2023-01-01     |10    |1              |10000.0 |1000.0 |1000.0  |0.0        |9000.0 |2023-02-01   |1  |0                  |NULL             |0  |
|CUS_0x1069_2023_01_01|CUS_0x1069 |2023-01-01     |10    |1              |10000.0 |1000.0 |1000.0  |0.0        |9000.0 |2023-02-01

In [2]:
# ─── Jupyter Cell: Count total rows in the loan‐daily DataFrame ──────────────
total_rows = df_loan.count()
print(f"Total rows in silver_loan_daily_{target_date}: {total_rows}")


Total rows in silver_loan_daily_2023_02_01: 1031


In [3]:
# ─── Jupyter Cell: Total row-count across ALL Silver loan-daily Parquets ──────
import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TotalRowsLoanDaily").getOrCreate()

loan_silver_base = "/app/datamart/silver/loan_daily"

# Identify every dated folder: silver_loan_daily_YYYY_MM_DD.parquet
folders = [
    d for d in os.listdir(loan_silver_base)
    if d.startswith("silver_loan_daily_") and d.endswith(".parquet")
]

grand_total = 0
per_date = {}

for folder in folders:
    path = os.path.join(loan_silver_base, folder)
    df = spark.read.parquet(path)          # Spark reads the whole folder
    cnt = df.count()
    per_date[folder] = cnt
    grand_total += cnt
    print(f"{folder}: {cnt} rows")

print(f"\n🧮 Grand total across {len(folders)} loan-daily snapshots: {grand_total}")


25/05/22 14:56:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


silver_loan_daily_2023_01_01.parquet: 530 rows
silver_loan_daily_2023_02_01.parquet: 1031 rows
silver_loan_daily_2023_03_01.parquet: 1537 rows
silver_loan_daily_2023_04_01.parquet: 2047 rows
silver_loan_daily_2023_05_01.parquet: 2568 rows
silver_loan_daily_2023_06_01.parquet: 3085 rows
silver_loan_daily_2023_07_01.parquet: 3556 rows
silver_loan_daily_2023_08_01.parquet: 4037 rows
silver_loan_daily_2023_09_01.parquet: 4491 rows
silver_loan_daily_2023_10_01.parquet: 4978 rows
silver_loan_daily_2023_11_01.parquet: 5469 rows
silver_loan_daily_2023_12_01.parquet: 5428 rows
silver_loan_daily_2024_01_01.parquet: 5412 rows
silver_loan_daily_2024_02_01.parquet: 5424 rows
silver_loan_daily_2024_03_01.parquet: 5425 rows
silver_loan_daily_2024_04_01.parquet: 5417 rows
silver_loan_daily_2024_05_01.parquet: 5391 rows
silver_loan_daily_2024_06_01.parquet: 5418 rows
silver_loan_daily_2024_07_01.parquet: 5442 rows
silver_loan_daily_2024_08_01.parquet: 5531 rows
silver_loan_daily_2024_09_01.parquet: 553

In [4]:
# ─── Jupyter Cell: Row-counts for Silver attributes, financials, clickstream ──
import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TotalRowsAllSilverDomains").getOrCreate()

silver_domains = {
    "attributes":  "/app/datamart/silver/attributes",
    "financials":  "/app/datamart/silver/financials",
    "clickstream": "/app/datamart/silver/clickstream",
}

grand_total_all = 0

for domain, base_dir in silver_domains.items():
    print(f"\n📂 Domain: {domain}")
    
    # Pattern: silver_<domain>_YYYY_MM_DD.parquet
    pattern_prefix = f"silver_{domain}_"
    
    folders = [
        d for d in os.listdir(base_dir)
        if d.startswith(pattern_prefix) and d.endswith(".parquet")
    ]
    
    domain_total = 0
    for folder in sorted(folders):
        path = os.path.join(base_dir, folder)
        cnt = spark.read.parquet(path).count()
        domain_total += cnt
        grand_total_all += cnt
        print(f"  {folder}: {cnt} rows")
    
    print(f"  ➡️  Grand total ({domain}): {domain_total}")

print(f"\n🧮 Grand total across ALL three domains: {grand_total_all}")


25/05/22 15:02:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.



📂 Domain: attributes
  silver_attributes_2023_01_01.parquet: 530 rows
  silver_attributes_2023_02_01.parquet: 501 rows
  silver_attributes_2023_03_01.parquet: 506 rows
  silver_attributes_2023_04_01.parquet: 510 rows
  silver_attributes_2023_05_01.parquet: 521 rows
  silver_attributes_2023_06_01.parquet: 517 rows
  silver_attributes_2023_07_01.parquet: 471 rows
  silver_attributes_2023_08_01.parquet: 481 rows
  silver_attributes_2023_09_01.parquet: 454 rows
  silver_attributes_2023_10_01.parquet: 487 rows
  silver_attributes_2023_11_01.parquet: 491 rows
  silver_attributes_2023_12_01.parquet: 489 rows
  silver_attributes_2024_01_01.parquet: 485 rows
  silver_attributes_2024_02_01.parquet: 518 rows
  silver_attributes_2024_03_01.parquet: 511 rows
  silver_attributes_2024_04_01.parquet: 513 rows
  silver_attributes_2024_05_01.parquet: 491 rows
  silver_attributes_2024_06_01.parquet: 498 rows
  silver_attributes_2024_07_01.parquet: 505 rows
  silver_attributes_2024_08_01.parquet: 543 row

In [5]:
# ─── Jupyter Cell: Preview a Sample Silver *financials* Parquet ──────────────
import os, glob, pandas as pd
from pyspark.sql import SparkSession

# 1) Spark session (reuse if already running)
spark = SparkSession.builder.appName("PreviewSilverFinancialsSample").getOrCreate()

# 2) Locate Silver-financials directory
silver_fin_base = "/app/datamart/silver/financials"

# 3) Grab one dated folder (e.g. the most recent) ─ change index if desired
fin_folders = sorted(
    d for d in os.listdir(silver_fin_base)
    if d.startswith("silver_financials_") and d.endswith(".parquet")
)
sample_dir = os.path.join(silver_fin_base, fin_folders[-1])   # pick latest

# 4) Pick the first part-*.parquet file Spark wrote
parquet_file = glob.glob(os.path.join(sample_dir, "*.parquet"))[0]

# 5) Read with Spark and convert small sample to pandas
df_fin = spark.read.parquet(parquet_file)
sample_pd = df_fin.limit(5).toPandas()

# 6) Show all columns
pd.set_option("display.max_columns", None)
display(sample_pd)


25/05/22 15:03:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Unnamed: 0,Customer_ID,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,snapshot_date
0,CUS_0x103e,98690.8,8262.233333333334,4,6,9,1_,Student Loan,6,17,10.76,4.0,Good,706.96,26.860663456485163,26 Years and 11 Months,No,55.004407569291885,913.4813186573292,Low_spent_Small_value_payments,147.7376071067124,2024-12-01
1,CUS_0x1195,30429.91,2808.8258333333333,4,6,16,2,"Auto Loan, and Auto Loan",22,17,1.67,1549.0,Standard,362.48,33.349050386579805,28 Years and 11 Months,No,29.91407646402904,82.87878577514347,Low_spent_Large_value_payments,438.0897210941608,2024-12-01
2,CUS_0x1197,92300.01,7437.6675,2,4,11,3,"Credit-Builder Loan, Not Specified, and Credit...",27,9,18.96,2.0,_,755.17,26.98978682085289,18 Years and 11 Months,Yes,49236.0,220.8621525417414,Low_spent_Large_value_payments,581.1567885447394,2024-12-01
3,CUS_0x11e2,44986.55,3689.879166666667,6,5,11,1,Credit-Builder Loan,0,4,10.26,3.0,Good,753.21,25.58628563739529,20 Years and 0 Months,No,23.26713547724025,43.20363344633164,High_spent_Large_value_payments,542.5171477430948,2024-12-01
4,CUS_0x11ec,14867.69,1005.9741666666664,9,9,18,6,"Debt Consolidation Loan, Student Loan, Persona...",39,15,18.5,8.0,Standard,2344.06,24.34438751567912,17 Years and 2 Months,Yes,55.45960424165637,100.14574834721886,Low_spent_Medium_value_payments,224.99206407779144,2024-12-01


In [None]:
# Set the option to display full column width
# Using None tells pandas to not truncate the column content
pd.set_option('display.max_colwidth', None)

# Display the DataFrame or the specific column
print("Displaying 'sample_pd' with full column values:")
print(sample_pd)

# If you only want to see the specific column 'Type_of_Loan'
print("\nDisplaying only the 'Type_of_Loan' column with full values:")
print(sample_pd[['Type_of_Loan']])

# If you want to see the unique non-truncated values from the column
print("\nUnique non-truncated values from 'Type_of_Loan':")
for loan_type in sample_pd['Type_of_Loan'].unique():
    print(loan_type)

# Reset the option to default if you don't want it to affect subsequent displays
pd.reset_option('display.max_colwidth')

print("\nDisplay after resetting options (may show truncation again):")
print(sample_pd)    

Displaying 'sample_pd' with full column values:
  Customer_ID Annual_Income Monthly_Inhand_Salary Num_Bank_Accounts  \
0  CUS_0x103e       98690.8     8262.233333333334                 4   
1  CUS_0x1195      30429.91    2808.8258333333333                 4   
2  CUS_0x1197      92300.01             7437.6675                 2   
3  CUS_0x11e2      44986.55     3689.879166666667                 6   
4  CUS_0x11ec      14867.69    1005.9741666666664                 9   

  Num_Credit_Card  Interest_Rate Num_of_Loan  \
0               6              9          1_   
1               6             16           2   
2               4             11           3   
3               5             11           1   
4               9             18           6   

                                                                                                        Type_of_Loan  \
0                                                                                                       Student Loan

In [12]:
# ─── Jupyter Cell: Row counts per *gold_label_store* snapshot and grand total ─
import os
from pyspark.sql import SparkSession

# 1) Spark session
spark = SparkSession.builder.appName("RowCountsLabelStore").getOrCreate()

# 2) Directory with gold_label_store_<date>.parquet folders
label_base = "/app/datamart/gold/label_store"

# 3) Gather every dated parquet folder
folders = sorted(
    d for d in os.listdir(label_base)
    if d.startswith("gold_label_store_") and d.endswith(".parquet")
)

if not folders:
    raise FileNotFoundError(f"No gold_label_store_*.parquet folders in {label_base}")

grand_total = 0

print("Per-snapshot row counts:")
for folder in folders:
    path = os.path.join(label_base, folder)
    cnt = spark.read.parquet(path).count()
    grand_total += cnt
    print(f"  {folder}: {cnt:,} rows")

print(f"\n🧮  Grand total across {len(folders)} snapshots: {grand_total:,} rows")


25/05/22 15:18:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Per-snapshot row counts:
  gold_label_store_2023_01_01.parquet: 0 rows
  gold_label_store_2023_02_01.parquet: 0 rows
  gold_label_store_2023_03_01.parquet: 0 rows
  gold_label_store_2023_04_01.parquet: 0 rows
  gold_label_store_2023_05_01.parquet: 0 rows
  gold_label_store_2023_06_01.parquet: 0 rows
  gold_label_store_2023_07_01.parquet: 530 rows
  gold_label_store_2023_08_01.parquet: 501 rows
  gold_label_store_2023_09_01.parquet: 506 rows
  gold_label_store_2023_10_01.parquet: 510 rows
  gold_label_store_2023_11_01.parquet: 521 rows
  gold_label_store_2023_12_01.parquet: 517 rows
  gold_label_store_2024_01_01.parquet: 471 rows
  gold_label_store_2024_02_01.parquet: 481 rows
  gold_label_store_2024_03_01.parquet: 454 rows
  gold_label_store_2024_04_01.parquet: 487 rows
  gold_label_store_2024_05_01.parquet: 491 rows
  gold_label_store_2024_06_01.parquet: 489 rows
  gold_label_store_2024_07_01.parquet: 485 rows
  gold_label_store_2024_08_01.parquet: 518 rows
  gold_label_store_2024_09_

In [1]:
# utils/gold_feature_financial.py
import os, re
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import IntegerType, FloatType

# ------------------------------------------------------------------ #
#  Financial helpers                                                 #
# ------------------------------------------------------------------ #
def _one_hot_loans(df: DataFrame) -> DataFrame:
    df = df.withColumn(
        "loan_list",
        F.split(F.regexp_replace("Type_of_Loan", r"\s+and\s+", ","), ",\s*")
    )
    loans = (
        df.select(F.explode("loan_list").alias("raw"))
          .select(F.trim("raw").alias("loan"))
          .filter("loan != ''")
          .distinct()
          .rdd.flatMap(lambda r: r)
          .collect()
    )
    for loan in loans:
        safe = "loan_" + re.sub(r"[^A-Za-z0-9]", "_", loan).lower()
        df = df.withColumn(safe, F.array_contains("loan_list", loan).cast("int"))
    return df.drop("loan_list")

def _credit_history_to_months(df: DataFrame) -> DataFrame:
    return (
        df.withColumn("years_part",
            F.regexp_extract("Credit_History_Age", r"(\d+)\s*Years", 1).cast(IntegerType())
        )
        .withColumn("months_part",
            F.regexp_extract("Credit_History_Age", r"(\d+)\s*Months", 1).cast(IntegerType())
        )
        .fillna({"years_part": 0, "months_part": 0})
        .withColumn("Credit_History_Months",
            F.col("years_part")*12 + F.col("months_part")
        )
        .drop("years_part", "months_part")
    )

def _prepare_financials(df_fin: DataFrame) -> DataFrame:
    df_fin = _one_hot_loans(df_fin)
    df_fin = _credit_history_to_months(df_fin)
    return df_fin

# ------------------------------------------------------------------ #
#  Click-stream helper                                               #
# ------------------------------------------------------------------ #
def _dedup_click(df_cs: DataFrame) -> DataFrame:
    aggs = [F.mean(F.col(f"fe_{i}")).alias(f"fe_{i}") for i in range(1, 21)]
    return df_cs.groupBy("Customer_ID", "snapshot_date").agg(*aggs)

# ------------------------------------------------------------------ #
#  Build Gold for a single date                                      #
# ------------------------------------------------------------------ #
def build_gold_for_date(
    spark: SparkSession,
    silver_base: str,
    gold_root: str,
    date_str: str,
) -> DataFrame:
    """
    Build Gold feature store anchored on Financials for one snapshot date.
    Parquet is written to:  <gold_root>/feature_store/gold_feature_financial_<date>.parquet
    """
    fin_path  = f"{silver_base}/financials/silver_financials_{date_str}.parquet"
    attr_path = f"{silver_base}/attributes/silver_attributes_{date_str}.parquet"
    cs_path   = f"{silver_base}/clickstream/silver_clickstream_{date_str}.parquet"

    # 1. Load mandatory Financials
    fin = _prepare_financials(spark.read.parquet(fin_path))

    # 2. Optional left-join tables
    attrs  = spark.read.parquet(attr_path) if os.path.exists(attr_path) else None
    cs_raw = spark.read.parquet(cs_path)   if os.path.exists(cs_path)   else None
    clicks = _dedup_click(cs_raw) if cs_raw else None

    gold = fin
    if attrs:
        gold = gold.join(attrs, ["Customer_ID", "snapshot_date"], "left")
    if clicks:
        gold = gold.join(clicks, ["Customer_ID", "snapshot_date"], "left")

    # 3. Impute defaults
    fe_cols = [f"fe_{i}" for i in range(1, 21)]
    gold = gold.fillna(0, subset=fe_cols) \
               .fillna({"Age": 0, "Credit_History_Months": 0})

    # 4. Example engineered metric
    gold = gold.withColumn(
        "Debt_to_Income_Ratio",
        F.when(
            F.col("Annual_Income") > 0,
            F.col("Outstanding_Debt") / F.col("Annual_Income")
        ).otherwise(F.lit(None).cast(FloatType()))
    )

    # 5. Write to feature_store directory
    out_dir = f"{gold_root}/feature_store/gold_feature_financial_{date_str}.parquet"
    gold.repartition(1).write.mode("overwrite").parquet(out_dir)
    print(f"✅  Gold financial feature store written → {out_dir}")
    return gold




  F.split(F.regexp_replace("Type_of_Loan", r"\s+and\s+", ","), ",\s*")


In [None]:
# ─── Jupyter Orchestrator Cell (Financial-centric Gold build) ────────────────
import os, re, sys, pandas as pd
from pyspark.sql import SparkSession

# Spark session
spark = SparkSession.builder.appName("GoldFinancialOrchestrator").getOrCreate()

# utils import
sys.path.append("utils")
# from gold_feature_financial import build_gold_for_date

silver_base = "/app/datamart/silver"
gold_root   = "/app/datamart/gold"

# Collect snapshot dates present in Silver Financials
fin_dates = {
    re.search(r"(\d{4}_\d{2}_\d{2})", f).group(1)
    for f in os.listdir(f"{silver_base}/financials")
    if f.startswith("silver_financials_") and f.endswith(".parquet")
}

print("Financial snapshot dates:", sorted(fin_dates))

# Build Gold for each date
for d in sorted(fin_dates):
    build_gold_for_date(
        spark=spark,
        silver_base=silver_base,
        gold_root=gold_root,
        date_str=d
    )

# ---- Preview latest snapshot ----
latest_folder = sorted(
    f for f in os.listdir(f"{gold_root}/feature_store")
    if f.startswith("gold_feature_financial_") and f.endswith(".parquet")
)[-1]

sample_path = f"{gold_root}/feature_store/{latest_folder}"
df_preview  = spark.read.parquet(sample_path)

row_cnt = df_preview.count()
print(f"\nRows in latest Gold snapshot ({latest_folder}): {row_cnt}")

pd.set_option("display.max_columns", None)
# display(df_preview.limit(5).toPandas())

spark.stop()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 15:42:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Financial snapshot dates: ['2023_01_01', '2023_02_01', '2023_03_01', '2023_04_01', '2023_05_01', '2023_06_01', '2023_07_01', '2023_08_01', '2023_09_01', '2023_10_01', '2023_11_01', '2023_12_01', '2024_01_01', '2024_02_01', '2024_03_01', '2024_04_01', '2024_05_01', '2024_06_01', '2024_07_01', '2024_08_01', '2024_09_01', '2024_10_01', '2024_11_01', '2024_12_01']


25/05/22 15:42:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_01_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_02_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_03_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_04_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_05_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_06_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_07_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/feature_store/gold_feature_financial_2023_08_01.parquet
✅  Gold financial feature store written → /app/datamart/gold/fea

Unnamed: 0,Customer_ID,snapshot_date,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,loan_home_equity_loan,loan_payday_loan,loan_personal_loan,loan_debt_consolidation_loan,loan_mortgage_loan,loan_student_loan,loan_credit_builder_loan,loan_auto_loan,loan_not_specified,Credit_History_Months,Name,Age,SSN,Occupation,fe_1,fe_2,fe_3,fe_4,fe_5,fe_6,fe_7,fe_8,fe_9,fe_10,fe_11,fe_12,fe_13,fe_14,fe_15,fe_16,fe_17,fe_18,fe_19,fe_20,Debt_to_Income_Ratio
0,CUS_0x103e,2024-12-01,98690.8,8262.233333333334,4,6,9,1_,Student Loan,6,17,10.76,4.0,Good,706.96,26.860663456485163,26 Years and 11 Months,No,55.004407569291885,913.4813186573292,Low_spent_Small_value_payments,147.7376071067124,0,0,0,0,0,1,0,0,0,323,Tim Kellyf,40,155-72-8070,Scientist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007163
1,CUS_0x1195,2024-12-01,30429.91,2808.8258333333333,4,6,16,2,"Auto Loan, and Auto Loan",22,17,1.67,1549.0,Standard,362.48,33.349050386579805,28 Years and 11 Months,No,29.91407646402904,82.87878577514347,Low_spent_Large_value_payments,438.0897210941608,0,0,0,0,0,0,0,1,0,347,Alexk,31,822-48-3629,Manager,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011912
2,CUS_0x1197,2024-12-01,92300.01,7437.6675,2,4,11,3,"Credit-Builder Loan, Not Specified, and Credit...",27,9,18.96,2.0,_,755.17,26.98978682085289,18 Years and 11 Months,Yes,49236.0,220.8621525417414,Low_spent_Large_value_payments,581.1567885447394,0,0,0,0,0,0,1,0,1,227,Nayako,28,799-23-8283,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008182
3,CUS_0x11e2,2024-12-01,44986.55,3689.879166666667,6,5,11,1,Credit-Builder Loan,0,4,10.26,3.0,Good,753.21,25.58628563739529,20 Years and 0 Months,No,23.26713547724025,43.20363344633164,High_spent_Large_value_payments,542.5171477430948,0,0,0,0,0,0,1,0,0,240,Valetkevitchr,34,809-04-1419,Musician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016743
4,CUS_0x11ec,2024-12-01,14867.69,1005.9741666666664,9,9,18,6,"Debt Consolidation Loan, Student Loan, Persona...",39,15,18.5,8.0,Standard,2344.06,24.34438751567912,17 Years and 2 Months,Yes,55.45960424165637,100.14574834721886,Low_spent_Medium_value_payments,224.99206407779144,0,0,1,1,0,1,1,1,0,206,William Schombergh,34,417-74-2163,Journalist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157661


In [3]:
# ─── Jupyter Cell: Preview & row-count for the Gold feature store ─────────────
import os, glob, pandas as pd
from pyspark.sql import SparkSession

# 1) Spark session (reuse if running)
spark = SparkSession.builder.appName("PreviewGoldFeatureStore").getOrCreate()

# 2) Gold directory that now holds gold_feature_store_<YYYY_MM_DD>.parquet folders
gold_base = "/app/datamart/gold"

# 3) Gather all date folders
gold_folders = sorted(
    d for d in os.listdir(gold_base)
    if d.startswith("gold_feature_store_") and d.endswith(".parquet")
)

if not gold_folders:
    raise FileNotFoundError(f"No gold_feature_store_*.parquet folders in {gold_base}")

# 4) Show total rows across ALL dates
df_gold_all = spark.read.parquet(*[os.path.join(gold_base, d) for d in gold_folders])
total_rows = df_gold_all.count()
print(f"🧮  Total rows across all Gold feature snapshots: {total_rows}\n")

# 5) Take the most recent snapshot and display a sample
sample_folder = gold_folders[-1]                 # latest; change index if desired
sample_path   = os.path.join(gold_base, sample_folder)
sample_file   = glob.glob(os.path.join(sample_path, "*.parquet"))[0]

df_sample = spark.read.parquet(sample_file)

# Convert small sample to pandas and show all columns
pd.set_option("display.max_columns", None)
display(df_sample.limit(5).toPandas())


FileNotFoundError: No gold_feature_store_*.parquet folders in /app/datamart/gold

In [None]:
# utils/gold_feature_store.py
import os
import re
from typing import List

from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

# ------------------------------------------------------------------ #
#  Helpers
# ------------------------------------------------------------------ #
def _date_from_fname(fname: str) -> str:
    """Extract YYYY_MM_DD from any silver_<domain>_<YYYY_MM_DD>.parquet string."""
    m = re.search(r"(\d{4}_\d{2}_\d{2})", fname)
    if not m:
        raise ValueError(f"Cannot parse date in {fname}")
    return m.group(1)

def _deduplicate_clickstream(df: DataFrame) -> DataFrame:
    """
    Handle many duplicate clickstream rows per Customer_ID & snapshot_date
    by taking the mean of each fe_i feature.
    """
    agg_exprs = [
        F.mean(F.col(f"fe_{i}")).alias(f"fe_{i}") for i in range(1, 21)
    ]
    return (
        df.groupBy("Customer_ID", "snapshot_date")
          .agg(*agg_exprs)
    )

def _engineer_features(df: DataFrame) -> DataFrame:
    """Example engineered features."""
    return (
        df
        .withColumn(
            "Debt_to_Income_Ratio",
            F.when(
                F.col("Annual_Income") > 0,
                F.col("Outstanding_Debt") / F.col("Annual_Income")
            ).otherwise(F.lit(None).cast(FloatType()))
        )
        .withColumn(
            "Monthly_Savings",
            F.col("Monthly_Inhand_Salary")
            - F.col("Total_EMI_per_month")
            - F.col("Amount_invested_monthly")
        )
    )

# ------------------------------------------------------------------ #
#  Public API
# ------------------------------------------------------------------ #
def build_gold_for_date(
    spark: SparkSession,
    silver_base: str,
    gold_base: str,
    date_str: str,
) -> DataFrame:
    """
    Build Gold feature set for one snapshot date (YYYY_MM_DD).
    Writes to gold_base/gold_features_<YYYY_MM_DD>.parquet
    and returns the resulting DataFrame.
    """
    # 1. Silver paths for this date
    attr_path = os.path.join(
        silver_base, "attributes", f"silver_attributes_{date_str}.parquet"
    )
    fin_path = os.path.join(
        silver_base, "financials", f"silver_financials_{date_str}.parquet"
    )
    cs_path = os.path.join(
        silver_base, "clickstream", f"silver_clickstream_{date_str}.parquet"
    )

    # 2. Load
    attrs = spark.read.parquet(attr_path)
    fins  = spark.read.parquet(fin_path)
    clicks_raw = spark.read.parquet(cs_path)

    # 3. Deduplicate click-stream
    clicks = _deduplicate_clickstream(clicks_raw)

    # 4. Join  (Attributes ⊕ Financials ⊕ Clickstream)
    df = (
        attrs.join(fins, ["Customer_ID", "snapshot_date"], how="inner")
             .join(clicks, ["Customer_ID", "snapshot_date"], how="left")
    )

    # 5. Feature engineering
    df = _engineer_features(df)

    # 6. Write Gold Parquet
    out_dir = os.path.join(gold_base, f"gold_features_{date_str}.parquet")
    df.repartition(1).write.mode("overwrite").parquet(out_dir)
    print(f"✅  Gold features written → {out_dir}")

    return df


In [None]:
# ─── Jupyter Cell: Bronze→Silver already done; build Gold now ────────────────
import os, re, sys
from pyspark.sql import SparkSession

# 1) Spark session
spark = SparkSession.builder.appName("BuildGoldFeatureStore").getOrCreate()

# 2) Import helper
sys.path.append("utils")
from gold_feature_store import build_gold_for_date

# 3) Locate silver folders to discover common snapshot dates
silver_base = "/app/datamart/silver"
gold_base   = "/app/datamart/gold"

attr_dates = {
    re.search(r"(\d{4}_\d{2}_\d{2})", d).group(1)
    for d in os.listdir(os.path.join(silver_base, "attributes"))
    if d.startswith("silver_attributes_")
}
fin_dates = {
    re.search(r"(\d{4}_\d{2}_\d{2})", d).group(1)
    for d in os.listdir(os.path.join(silver_base, "financials"))
    if d.startswith("silver_financials_")
}
click_dates = {
    re.search(r"(\d{4}_\d{2}_\d{2})", d).group(1)
    for d in os.listdir(os.path.join(silver_base, "clickstream"))
    if d.startswith("silver_clickstream_")
}

common_dates = sorted(attr_dates & fin_dates & click_dates)
print("Common snapshot dates:", common_dates)

# 4) Build Gold tables for each common date
for date_str in common_dates:
    build_gold_for_date(
        spark=spark,
        silver_base=silver_base,
        gold_base=gold_base,
        date_str=date_str
    )

spark.stop()
