# 01_attributes_feature_store.ipynb  
**Context:** You’re a data scientist at a bank preparing a feature store for a loan‐default prediction model.  
We’ll use the **Medallion Architecture**, which refines the data through three layers:  
- **Bronze**: raw landing (CSV)  
- **Silver**: cleaned & typed (Parquet)  
- **Gold**: ML-ready features (Parquet)  

**Data:** `features_attributes.csv` holds per‐customer demographics at each snapshot.  
**Goal:** build the Bronze → Silver → Gold tables for these attributes.


In [14]:
# Cell 1: Imports & Spark setup
import os, glob
from datetime import datetime
import pandas as pd
import numpy as np

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col, to_date, when
from pyspark.sql.types import IntegerType, StringType

# Initialize Spark: configure a local-mode builder first, then create (or reuse) the session
_session_builder = pyspark.sql.SparkSession.builder.appName("AttrFeatureStore").master("local[*]")
spark = _session_builder.getOrCreate()

# Only log messages at ERROR level so cell outputs stay readable
spark.sparkContext.setLogLevel("ERROR")

# Paths: the raw input file plus one output directory per medallion layer
RAW_CSV = "data/features_attributes.csv"
BRONZE_DIR = "datamart/bronze/attributes"  # raw CSV slices
SILVER_DIR = "datamart/silver/attributes"  # cleaned Parquet
GOLD_DIR = "datamart/gold/attributes"      # ML-ready Parquet

# Create every layer directory up front; exist_ok=True makes re-running this cell harmless
for d in [BRONZE_DIR, SILVER_DIR, GOLD_DIR]:
    os.makedirs(d, exist_ok=True)


In [16]:
# Cell 2: Generate monthly snapshot dates
start, end = "2023-01-01","2024-12-01"


def gen_month_starts(s,e):
    """Return the first day of every month from s's month through e, inclusive.

    Both arguments are "YYYY-MM-DD" strings. The start is snapped back to the
    first day of its month; a month is emitted while its first day is on or
    before `e`. The result is a list of "YYYY-MM-DD" strings (empty when the
    start month lies after `e`).
    """
    month_start = datetime.strptime(s, "%Y-%m-%d").replace(day=1)
    limit = datetime.strptime(e, "%Y-%m-%d")

    months = []
    while month_start <= limit:
        months.append(month_start.strftime("%Y-%m-%d"))
        if month_start.month == 12:
            # December rolls over into January of the following year
            month_start = month_start.replace(year=month_start.year + 1, month=1)
        else:
            month_start = month_start.replace(month=month_start.month + 1)
    return months


dates = gen_month_starts(start,end)
print("Snapshots:", dates[:3], "...", dates[-3:])


Snapshots: ['2023-01-01', '2023-02-01', '2023-03-01'] ... ['2024-10-01', '2024-11-01', '2024-12-01']


In [17]:
# Cell 3: BRONZE – raw CSV slices (for audit & replay)
# Read the raw file once, outside the loop: the source is identical for every
# snapshot, so re-reading it (and re-inferring its schema) per date is wasted work.
raw_attributes = spark.read.csv(RAW_CSV, header=True, inferSchema=True)

for d in dates:
    # Keep only the rows that belong to this snapshot date
    df = raw_attributes.filter(col("snapshot_date")==d)
    out = f"{BRONZE_DIR}/bronze_attributes_{d.replace('-','_')}.csv"
    # Collect once and reuse the pandas frame for both the write and the row
    # count; calling df.count() here would launch a second Spark job for the
    # same slice.
    slice_pdf = df.toPandas()
    slice_pdf.to_csv(out, index=False)
    print(f"{d}: {len(slice_pdf)} rows → {out}")


2023-01-01: 530 rows → datamart/bronze/attributes/bronze_attributes_2023_01_01.csv
2023-02-01: 501 rows → datamart/bronze/attributes/bronze_attributes_2023_02_01.csv
2023-03-01: 506 rows → datamart/bronze/attributes/bronze_attributes_2023_03_01.csv
2023-04-01: 510 rows → datamart/bronze/attributes/bronze_attributes_2023_04_01.csv
2023-05-01: 521 rows → datamart/bronze/attributes/bronze_attributes_2023_05_01.csv
2023-06-01: 517 rows → datamart/bronze/attributes/bronze_attributes_2023_06_01.csv
2023-07-01: 471 rows → datamart/bronze/attributes/bronze_attributes_2023_07_01.csv
2023-08-01: 481 rows → datamart/bronze/attributes/bronze_attributes_2023_08_01.csv
2023-09-01: 454 rows → datamart/bronze/attributes/bronze_attributes_2023_09_01.csv
2023-10-01: 487 rows → datamart/bronze/attributes/bronze_attributes_2023_10_01.csv
2023-11-01: 491 rows → datamart/bronze/attributes/bronze_attributes_2023_11_01.csv
2023-12-01: 489 rows → datamart/bronze/attributes/bronze_attributes_2023_12_01.csv
2024

In [18]:
# Cell 4: Inspect Bronze slice
# Load the first snapshot's Bronze CSV back with pandas as a quick sanity check.
# NOTE(review): Bronze still contains the Name and SSN columns that the Silver
# step drops as PII, so this preview renders them – clear the output before sharing.
bronze_sample_path = f"{BRONZE_DIR}/bronze_attributes_2023_01_01.csv"
b = pd.read_csv(bronze_sample_path)
print("Bronze shape:", b.shape)
display(b.head())


Bronze shape: (530, 6)


Unnamed: 0,Customer_ID,Name,Age,SSN,Occupation,snapshot_date
0,CUS_0x1037,Matthewm,45,230-22-9583,Accountant,2023-01-01
1,CUS_0x1069,Andreas Cremero,32,761-27-5143,Accountant,2023-01-01
2,CUS_0x114a,Valetkevitchu,43,133-89-5234,Developer,2023-01-01
3,CUS_0x1184,Cohenq,49,963-76-2464,Lawyer,2023-01-01
4,CUS_0x1297,Edwardsz,46,#F%$D@*&8,Manager,2023-01-01


In [19]:
# Cell 5: SILVER – clean & type (drop PII, enforce schema)
# Inclusive age range accepted for borrowers. Rows outside it are dropped, and
# so are rows whose Age is NULL, because a NULL comparison never passes a filter.
MIN_AGE, MAX_AGE = 18, 100

for d in dates:
    tag = d.replace('-','_')   # snapshot date as used in the file names
    silver = spark.read.csv(
        f"{BRONZE_DIR}/bronze_attributes_{tag}.csv",
        header=True, inferSchema=True)

    # 1) Parse date, cast Age
    silver = (silver
              .withColumn("snapshot_date", to_date(col("snapshot_date"),"yyyy-MM-dd"))
              .withColumn("Age", col("Age").cast(IntegerType())))
    # 2) Drop PII: Name, SSN
    silver = silver.drop("Name","SSN")
    # 3) Filter invalid Ages (only adult borrowers)
    silver = silver.filter(col("Age").between(MIN_AGE, MAX_AGE))
    # 4) (Optional) Impute missing Occupation: NULL becomes the literal "Unknown"
    silver = silver.withColumn("Occupation",
        F.coalesce(col("Occupation"), F.lit("Unknown")))

    # Write Parquet
    out = f"{SILVER_DIR}/silver_attributes_{tag}.parquet"
    silver.write.mode("overwrite").parquet(out)
    # Count the rows from the Parquet that was just written rather than calling
    # silver.count(): the latter re-runs the whole CSV read + transform lineage,
    # whereas this reports what actually landed on disk.
    n_rows = spark.read.parquet(out).count()
    print(f"{d}: SILVER → {n_rows} rows → {out}")


2023-01-01: SILVER → 466 rows → datamart/silver/attributes/silver_attributes_2023_01_01.parquet
2023-02-01: SILVER → 442 rows → datamart/silver/attributes/silver_attributes_2023_02_01.parquet
2023-03-01: SILVER → 438 rows → datamart/silver/attributes/silver_attributes_2023_03_01.parquet
2023-04-01: SILVER → 436 rows → datamart/silver/attributes/silver_attributes_2023_04_01.parquet
2023-05-01: SILVER → 466 rows → datamart/silver/attributes/silver_attributes_2023_05_01.parquet
2023-06-01: SILVER → 457 rows → datamart/silver/attributes/silver_attributes_2023_06_01.parquet
2023-07-01: SILVER → 433 rows → datamart/silver/attributes/silver_attributes_2023_07_01.parquet
2023-08-01: SILVER → 425 rows → datamart/silver/attributes/silver_attributes_2023_08_01.parquet
2023-09-01: SILVER → 386 rows → datamart/silver/attributes/silver_attributes_2023_09_01.parquet
2023-10-01: SILVER → 422 rows → datamart/silver/attributes/silver_attributes_2023_10_01.parquet
2023-11-01: SILVER → 423 rows → datamart

In [20]:
# Cell 6: Inspect Silver schema & sample
# Read the first snapshot's Silver Parquet back, print its column types and
# preview the first five rows as a pandas frame.
silver_sample_path = f"{SILVER_DIR}/silver_attributes_2023_01_01.parquet"
sv = spark.read.parquet(silver_sample_path)
sv.printSchema()
display(sv.limit(5).toPandas())


root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- snapshot_date: date (nullable = true)



Unnamed: 0,Customer_ID,Age,Occupation,snapshot_date
0,CUS_0x1037,45,Accountant,2023-01-01
1,CUS_0x1069,32,Accountant,2023-01-01
2,CUS_0x114a,43,Developer,2023-01-01
3,CUS_0x1184,49,Lawyer,2023-01-01
4,CUS_0x1297,46,Manager,2023-01-01
