# Bronze to Silver Transformation 

`This notebook transforms raw Bronze data into cleaned and enriched Silver tables.`\
`It focuses on data validation, cleaning, and structural preparation for analytics.`

_Remark_ :
`Cloud storage access configuration has been intentionally removed from this notebook.`\
`Credentials and sensitive parameters are managed securely outside the repository and are not exposed in version-controlled code.`
 

In [0]:
dim_product_df = spark.read.format("delta").load(f"{bronze_path}/dim_product")
dim_customer_df = spark.read.format("delta").load(f"{bronze_path}/dim_customer")
fact_loan_df = spark.read.format("delta").load(f"{bronze_path}/fact_loan")
default_events_df = spark.read.format("delta").load(f"{bronze_path}/default_events")
cashflows_df = spark.read.format("delta").load(f"{bronze_path}/cashflows")
print("dim_customer :", dim_customer_df.count())
print("dim_product  :", dim_product_df.count())
print("fact_loan    :", fact_loan_df.count())
print("default_events :", default_events_df.count())

dim_customer : 100000
dim_product  : 6
fact_loan    : 135924
default_events : 8355


In [0]:
bronze_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net/Bronze"
silver_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net/Silver"

print("Bronze base:", bronze_base)
print("Silver base:", silver_base)


`Schema harmonization is performed to ensure consistent data types across tables.`\
`Key numerical and date columns are explicitly cast to their expected types to avoid downstream inconsistencies during joins and analytics.`


In [0]:

fact_loan_df = (
    fact_loan_df
    .withColumn("maturity_months", F.col("maturity_months").cast("int"))
    .withColumn("principal_amount", F.col("principal_amount").cast("double"))
    .withColumn("interest_rate_annual", F.col("interest_rate_annual").cast("double"))
)

dim_customer_df = (
    dim_customer_df
    .withColumn("age", F.col("age").cast("int"))
    .withColumn("annual_income", F.col("annual_income").cast("double"))
    .withColumn("credit_score", F.col("credit_score").cast("int"))
    .withColumn("risk_score_raw", F.col("risk_score_raw").cast("double"))
)

default_events_df = (
    default_events_df
    .withColumn("default_flag", F.col("default_flag").cast("int"))
)



`The main enrichment step consists in joining loan data with customer and product information to obtain a consolidated loan-level dataset.`


In [0]:
loan_enriched = (
    fact_loan_df.alias("l")
    .join(
        dim_customer_df.alias("c"),
        on="customer_id",
        how="left"
    )
    .join(
        dim_product_df.alias("p"),
        on="product_id",
        how="left"
    )
    .join(
        default_events_df.select(
            "loan_id",
            "default_flag",
            "default_date",
            "months_since_origination_at_default",
            "default_reason"
        ).alias("d"),
        on="loan_id",
        how="left"
    )
)

print("Nb de lignes loan_enriched (avant dérivées) :", loan_enriched.count())
display(loan_enriched.limit)


Nb de lignes loan_enriched (avant dérivées) : 135924


<bound method DataFrame.limit of DataFrame[loan_id: bigint, product_id: string, customer_id: int, origination_date: date, principal_amount: double, maturity_months: int, interest_rate_annual: double, age: int, annual_income: double, employment_status: string, nb_past_loans: int, has_previous_defaults: int, avg_past_arrears: double, credit_score: int, risk_score_raw: double, product_name: string, product_type: string, regulatory_portfolio: string, ifrs9_segment: string, interest_rate_type: string, base_margin_bp: int, typical_maturity_min_months: int, typical_maturity_max_months: int, typical_amount_min: double, typical_amount_max: double, collateral_type: string, secured_flag: int, base_lgd_level: double, lgd_sensitivity_to_macro: string, ead_profile: string, ccf_baseline: double, default_flag: int, default_date: date, months_since_origination_at_default: int, default_reason: string]>

In [0]:
# The enriched datasets are written to the Silver layer
       silver_base = f"abfss://{container}@{storage_account}.dfs.core.windows.net/Silver"
silver_loan_path = f"{silver_base}/loan_enriched"   
print(silver_loan_path)



In [0]:
display(loan_enriched.limit)


<bound method DataFrame.limit of DataFrame[loan_id: bigint, product_id: string, customer_id: int, origination_date: date, principal_amount: double, maturity_months: int, interest_rate_annual: double, age: int, annual_income: double, employment_status: string, nb_past_loans: int, has_previous_defaults: int, avg_past_arrears: double, credit_score: int, risk_score_raw: double, product_name: string, product_type: string, regulatory_portfolio: string, ifrs9_segment: string, interest_rate_type: string, base_margin_bp: int, typical_maturity_min_months: int, typical_maturity_max_months: int, typical_amount_min: double, typical_amount_max: double, collateral_type: string, secured_flag: int, base_lgd_level: double, lgd_sensitivity_to_macro: string, ead_profile: string, ccf_baseline: double, default_flag: int, default_date: date, months_since_origination_at_default: int, default_reason: string]>

In [0]:
(
    loan_enriched
    .repartition(4)      
    .write
    .format("delta")
    .mode("overwrite")   
    .save(silver_loan_path)
)

print(" Silver écrit dans :", silver_loan_path)


In [0]:
loan_enriched_silver = spark.read.format("delta").load(silver_loan_path)

print("Nb de lignes relues :", loan_enriched_silver.count())
display(loan_enriched_silver.limit(10))


Nb de lignes relues : 135924


loan_id,product_id,customer_id,origination_date,principal_amount,maturity_months,interest_rate_annual,age,annual_income,employment_status,nb_past_loans,has_previous_defaults,avg_past_arrears,credit_score,risk_score_raw,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline,default_flag,default_date,months_since_origination_at_default,default_reason
8589961901,CONSO_PERSO,44952,2021-11-17,26970.37402517656,37,0.0497942491991489,21,45057.86837163758,CDD,2,0,0.5823640433544514,721,-0.1479091275240266,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0,,,,
8589941872,CONSO_PERSO,30298,2023-09-30,20657.53435379078,18,0.0471712088537269,56,82359.76886963609,CDI,1,0,3.5084179852647948,879,-2.03628234931354,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0,,,,
8589955992,CONSO_AUTO,40618,2019-04-19,3839.0406702274495,62,0.0405236630402615,38,45276.664880487006,CDD,2,0,2.5735857714711123,740,-0.3368852571262646,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0,,,,
8589963180,CONSO_PERSO,45916,2024-11-15,8601.656794730112,25,0.0424884268472333,34,63915.20107414192,CDI,5,0,4.615313666963663,640,0.4302454068606589,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0,,,,
8589936753,CONSO_AUTO,26562,2020-03-29,3820.04038173793,46,0.039262648279205,31,21400.2783605274,CDD,2,0,0.4503146476941938,561,1.1265989180823464,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0,,,,
8589937364,CONSO_PERSO,27015,2020-06-24,19963.927862778848,57,0.0432091529588,38,28312.67825632152,CDI,3,0,3.965457258964911,719,-0.1545664145248255,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0,1.0,2020-09-24,3.0,DELINQUENCY
8589944806,CONSO_AUTO,32431,2024-09-30,18557.1397410996,46,0.0424902987394063,36,32868.53091173815,CDI,0,0,2.4061074365656627,735,-0.6750576666155405,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0,,,,
8589962566,CONSO_AUTO,45462,2020-09-16,29083.61446349642,26,0.0400947978775467,54,22335.39213692309,CHOMEUR,1,0,3.278503798959478,536,1.4826279237294242,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0,,,,
8589945715,CONSO_AUTO,33113,2021-06-17,41611.89974553743,27,0.035104618324804,32,35595.03540649403,CHOMEUR,1,0,3.827392847695936,604,0.8837181485463116,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0,1.0,2023-01-17,19.0,BANKRUPTCY
8589951491,CONSO_PERSO,37317,2024-12-25,5037.201298259659,37,0.0410318135891932,60,40590.42106010997,CDI,2,0,0.4972560384633401,673,-0.0290876260702426,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0,,,,


`The Silver cash flow table is built at a monthly payment granularity, with one row per payment.`\
`It provides a time-dynamic view of cash flows and is essential for advanced credit risk analysis, including EAD over time, LGD estimation, payment behavior modeling, and pre- and post-default analysis.`\
`The table contains detailed payment-level information such as loan identifiers, payment dates, monthly indexes, payment decomposition, remaining balances, and optional default indicators.`\
`Additional time-based features including vintage, elapsed time since origination, and post-payment default flags make this dataset particularly well suited for IFRS 9 analysis.`


In [0]:
cashflows_df = (
    cashflows_df
    .withColumn("payment_date", F.col("payment_date").cast("date"))
    .withColumn("payment_amount", F.col("payment_amount").cast("double"))
    .withColumn("principal_paid", F.col("principal_paid").cast("double"))
    .withColumn("interest_paid", F.col("interest_paid").cast("double"))
    .withColumn("remaining_balance", F.col("remaining_balance").cast("double"))
)


In [0]:
cashflows_enriched = (
    cashflows_df.alias("cf")
    .join(
        fact_loan_df.select("loan_id", "origination_date", "maturity_months").alias("l"),
        on="loan_id",
        how="left"
    )
)


In [0]:
# adding information of default 

default_df = spark.read.format("delta").load(f"{bronze_base}/default_events")

cashflows_enriched = (
    cashflows_enriched
    .join(
        default_df.select(
            "loan_id",
            "default_date",
            "default_flag"
        ).alias("d"),
        on="loan_id",
        how="left"
    )
)


In [0]:
cashflows_enriched = cashflows_enriched.withColumn(
    "months_since_origination",
    F.floor(F.months_between(F.col("payment_date"), F.col("origination_date"))).cast("int")
)

`For each payment, a flag is used to indicate whether the payment occurs before or after the default event.`\
`This distinction allows a clear separation between pre-default and post-default cash flows for risk and IFRS 9 analyses.`


In [0]:

cashflows_enriched = cashflows_enriched.withColumn(
    "is_before_default",
    F.when(F.col("default_date").isNull(), None)
     .when(F.col("payment_date") <= F.col("default_date"), 1)
     .otherwise(0)
)

cashflows_enriched = cashflows_enriched.withColumn(
    "is_after_default",
    F.when(F.col("default_date").isNull(), None)
     .when(F.col("payment_date") > F.col("default_date"), 1)
     .otherwise(0)
)


`At each payment date, Exposure at Default (EAD) is approximated by the remaining outstanding balance.`\
`This definition provides a simple and consistent time-dependent exposure measure for downstream risk analysis.`


In [0]:

cashflows_enriched = cashflows_enriched.withColumn(
    "ead_at_payment",
    F.col("remaining_balance")
)


In [0]:
cashflows_enriched = (
    cashflows_enriched
    .withColumn("payment_year", F.year("payment_date"))
    .withColumn("payment_month", F.month("payment_date"))
    .withColumn("payment_quarter", F.quarter("payment_date"))
)


In [0]:
cashflows_silver = cashflows_enriched.select(
    "loan_id",
    "payment_date",
    "month_index",
    "payment_amount",
    "principal_paid",
    "interest_paid",
    "remaining_balance",
    
    "origination_date",
    "maturity_months",
    "default_date",
    "default_flag",
    "months_since_origination",
    "is_before_default",
    "is_after_default",
    "ead_at_payment",
    "payment_year",
    "payment_month",
    "payment_quarter"
)


In [0]:
# we save the data in delta format in the silver container 

cashflows_silver_path = f"{silver_base}/cashflows_silver"

(
    cashflows_silver
    .repartition(4)
    .write
    .format("delta")
    .mode("overwrite")
    .save(cashflows_silver_path)
)

print(" cashflows_silver écrit dans :", cashflows_silver_path)


In [0]:
# Verification
check = spark.read.format("delta").load(cashflows_silver_path)
print("Nb lignes :", check.count())
display(check.limit(10))


Nb lignes : 13242051


loan_id,payment_date,month_index,payment_amount,principal_paid,interest_paid,remaining_balance,origination_date,maturity_months,default_date,default_flag,months_since_origination,is_before_default,is_after_default,ead_at_payment,payment_year,payment_month,payment_quarter
8589937317,2022-01-28,6,851.7101973905255,729.7111622408254,121.99903514970012,37817.73289121519,2021-07-28,54,,,6,,,37817.73289121519,2022,1,1
8589939436,2033-03-10,139,3412.4295766132004,3251.851413692876,160.57816292032487,66549.6537851428,2021-08-10,159,,,139,,,66549.6537851428,2033,3,1
8589959974,2020-07-08,5,2472.7126210938814,1227.9233553214317,1244.78926577245,506191.17268285993,2020-02-08,290,,,5,,,506191.17268285993,2020,7,3
8589953647,2035-08-02,198,3798.328442155904,3778.900305787667,19.4281363682366,7567.502226225288,2019-02-02,200,,,198,,,7567.502226225288,2035,8,3
8589968463,2027-03-30,64,2324.498977924027,1350.9342239789958,973.5647539450308,417492.3645507181,2021-11-30,297,,,64,,,417492.3645507181,2027,3,1
8589964645,2038-12-03,190,3338.219703941168,2739.529918794027,598.6897851471408,254138.47277945065,2023-02-03,274,,,190,,,254138.47277945065,2038,12,4
8589959226,2052-05-24,339,2141.628957625017,2035.3979265141068,106.23103111091008,39620.106759963805,2024-02-24,358,,,339,,,39620.106759963805,2052,5,2
8589952308,2020-12-09,23,1098.222005233584,1065.9596208375162,32.26238439606771,10803.983325042562,2019-01-09,33,,,23,,,10803.983325042562,2020,12,4
8589951945,2031-02-15,135,456.6707045436378,335.30006756632304,121.3706369773148,56119.13039134737,2019-11-15,278,,,135,,,56119.13039134737,2031,2,1
8589965518,2020-10-21,18,2566.9932385203606,1911.090997070517,655.9022414498436,310883.46642893495,2019-04-21,158,,,18,,,310883.46642893495,2020,10,4
