# Data Generation

`This notebook is executed in a cloud-based analytics environment.`\
`Data is read from and written to cloud storage using secure access mechanisms.`\
`Sensitive configuration details are intentionally omitted and replaced with sanitized placeholders.`

In [0]:
# NOTE: credentials are not stored in the repository.
# they are provided at runtime via environment variables or Databricks secrets.

storage_account = "<STORAGE_ACCOUNT_NAME>"
container = "<CONTAINER_NAME>"

tenant_id = dbutils.secrets.get(scope="kv-scope", key="tenant-id")
client_id = dbutils.secrets.get(scope="kv-scope", key="client-id")
client_secret = dbutils.secrets.get(scope="kv-scope", key="client-secret")

spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", client_secret)
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
)

dbutils.fs.ls(f"abfss://{container}@{storage_account}.dfs.core.windows.net")


`Data is read from and written to cloud storage using secure access mechanisms.`

#### DIM_PRODUCT

`This section generates the DIM_PRODUCT table, which defines a controlled set of retail credit products.`\
`The table is created from a predefined catalog (hard-coded reference data) to ensure stability, consistency, and reproducibility across runs.`\
`It is written to the Bronze layer in Delta format.`

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

bronze_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/Bronze"

`Each row represents one product with:`\
` - identifiers and labels (product_id, product_name, product_type)`\
`  - regulatory / IFRS9 segmentation (regulatory_portfolio, ifrs9_segment)`\
`  - typical pricing and contractual ranges (margin, maturity, amount ranges)`\
`  - collateral and risk behavior assumptions (secured_flag, base LGD, CCF, etc.)`

  `Note: This is intentionally hard-coded because it is "dimension/reference" data, not an observational dataset.`

In [0]:
data_dim_product = [

    ("CONSO_PERSO",  "Crédit consommation personnel",  "CONSO",   "Retail unsecured", "Retail – unsecured",
     "FIXE",          350,              12,            60,         1000.0,    30000.0,
     "NONE",          0,              0.85,            "LOW",                  "AMORTIZING", 0.0),

    ("CONSO_AUTO",   "Crédit auto",                    "CONSO",   "Retail unsecured", "Retail – unsecured",
     "FIXE",          300,              24,            84,         3000.0,    50000.0,
     "VEHICLE",       1,              0.60,            "MEDIUM",               "AMORTIZING", 0.0),

    ("IMMO_RESID",   "Prêt immo résidence principale", "IMMO",    "Retail mortgage",  "Retail – secured",
     "FIXE",          180,              120,           360,        50000.0,   600000.0,
     "REAL_ESTATE",   1,              0.30,            "HIGH",                 "AMORTIZING", 0.0),

    ("IMMO_INVEST",  "Prêt immo investissement",       "IMMO",    "Retail mortgage",  "Retail – secured",
     "FIXE",          220,              120,           300,        70000.0,   800000.0,
     "REAL_ESTATE",   1,              0.35,            "HIGH",                 "AMORTIZING", 0.0),

    ("REVOLVING_STD","Crédit renouvelable",            "REVOLVING","Retail unsecured", "Retail – revolving",
     "VARIABLE",      600,              6,             60,         500.0,     10000.0,
     "NONE",          0,              0.90,            "MEDIUM",               "REVOLVING",  0.75),

    ("CREDIT_CARTE", "Ligne de crédit carte",          "REVOLVING","Retail unsecured", "Retail – revolving",
     "VARIABLE",      650,              6,             60,         500.0,     15000.0,
     "NONE",          0,              0.92,            "MEDIUM",               "REVOLVING",  0.85),
]


In [0]:
# We define the schema 

schema = StructType([
    StructField("product_id", StringType(), False),
    StructField("product_name", StringType(), False),
    StructField("product_type", StringType(), False),
    StructField("regulatory_portfolio", StringType(), False),
    StructField("ifrs9_segment", StringType(), False),

    StructField("interest_rate_type", StringType(), False),
    StructField("base_margin_bp", IntegerType(), False),

    StructField("typical_maturity_min_months", IntegerType(), False),
    StructField("typical_maturity_max_months", IntegerType(), False),
    StructField("typical_amount_min", DoubleType(), False),
    StructField("typical_amount_max", DoubleType(), False),

    StructField("collateral_type", StringType(), False),
    StructField("secured_flag", IntegerType(), False),
    StructField("base_lgd_level", DoubleType(), False),
    StructField("lgd_sensitivity_to_macro", StringType(), False),
    StructField("ead_profile", StringType(), False),
    StructField("ccf_baseline", DoubleType(), False),
])


In [0]:
dim_product_df = spark.createDataFrame(data_dim_product, schema=schema)
display(dim_product_df)


product_id,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline
CONSO_PERSO,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0
CONSO_AUTO,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0
IMMO_RESID,Prêt immo résidence principale,IMMO,Retail mortgage,Retail – secured,FIXE,180,120,360,50000.0,600000.0,REAL_ESTATE,1,0.3,HIGH,AMORTIZING,0.0
IMMO_INVEST,Prêt immo investissement,IMMO,Retail mortgage,Retail – secured,FIXE,220,120,300,70000.0,800000.0,REAL_ESTATE,1,0.35,HIGH,AMORTIZING,0.0
REVOLVING_STD,Crédit renouvelable,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,600,6,60,500.0,10000.0,NONE,0,0.9,MEDIUM,REVOLVING,0.75
CREDIT_CARTE,Ligne de crédit carte,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,650,6,60,500.0,15000.0,NONE,0,0.92,MEDIUM,REVOLVING,0.85


In [0]:
# We write the data in a delta format in the bronze layer 
# Bronze keeps the raw generated reference data before any enrichment/transformation.

dim_product_path = f"{bronze_path}/dim_product"

(
    dim_product_df
    .write
    .format("delta")
    .mode("overwrite")
    .save(dim_product_path)
)
 

In [0]:
df_check = spark.read.format("delta").load(dim_product_path)
display(df_check)


product_id,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline
CONSO_PERSO,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0
CONSO_AUTO,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0
IMMO_RESID,Prêt immo résidence principale,IMMO,Retail mortgage,Retail – secured,FIXE,180,120,360,50000.0,600000.0,REAL_ESTATE,1,0.3,HIGH,AMORTIZING,0.0
IMMO_INVEST,Prêt immo investissement,IMMO,Retail mortgage,Retail – secured,FIXE,220,120,300,70000.0,800000.0,REAL_ESTATE,1,0.35,HIGH,AMORTIZING,0.0
REVOLVING_STD,Crédit renouvelable,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,600,6,60,500.0,10000.0,NONE,0,0.9,MEDIUM,REVOLVING,0.75
CREDIT_CARTE,Ligne de crédit carte,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,650,6,60,500.0,15000.0,NONE,0,0.92,MEDIUM,REVOLVING,0.85


In [0]:
%sql
SELECT *
FROM delta.`abfss://ifrsdatalake@databankuniverse.dfs.core.windows.net/Bronze/dim_product`;



product_id,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline
CONSO_PERSO,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0
CONSO_AUTO,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0
IMMO_RESID,Prêt immo résidence principale,IMMO,Retail mortgage,Retail – secured,FIXE,180,120,360,50000.0,600000.0,REAL_ESTATE,1,0.3,HIGH,AMORTIZING,0.0
IMMO_INVEST,Prêt immo investissement,IMMO,Retail mortgage,Retail – secured,FIXE,220,120,300,70000.0,800000.0,REAL_ESTATE,1,0.35,HIGH,AMORTIZING,0.0
REVOLVING_STD,Crédit renouvelable,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,600,6,60,500.0,10000.0,NONE,0,0.9,MEDIUM,REVOLVING,0.75
CREDIT_CARTE,Ligne de crédit carte,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,650,6,60,500.0,15000.0,NONE,0,0.92,MEDIUM,REVOLVING,0.85


#### DIM_CUSTOMER

`We generate a realistic customer population using a latent risk factor.`\
`This latent score drives correlated characteristics (income, credit history, arrears, credit score),
so risk is coherent rather than purely random.`


In [0]:
import numpy as np
N_Customers = 100000 # the number of customers 

`To generate a realistic customer population, we introduce a latent risk variable at the individual level.`\
`This latent variable represents an unobserved creditworthiness factor that cannot be directly measured in real life, but that implicitly drives many observable customer characteristics.`\
`It is modeled as a continuous variable following a standard normal distribution, with one value per customer.`\
`Rather than generating customer attributes independently, this latent risk score acts as a common underlying driver:
higher latent risk increases the likelihood of lower income levels, unstable employment, past defaults, higher arrears, and lower observed credit scores.`\
`Observable variables are therefore conditionally generated based on the latent risk level, ensuring that risk is coherent and structured rather than purely random.
The latent risk variable itself is not meant to be used in downstream analysis and is removed before persisting the final customer dimension.`


In [0]:
# latent variable generation
risk_scores = np.random.normal(loc=0, scale=1, size=N_Customers)

In [0]:
#Age, independant 
ages = np.random.randint(21, 70, size=N_Customers)

In [0]:
# the income is correlated with the risk score

income = []
for r in risk_scores:
    if r < -1:
        income.append(np.random.uniform(40000, 120000))
    elif r <= 1:
        income.append(np.random.uniform(25000, 70000))
    else:
        income.append(np.random.uniform(15000, 40000))


In [0]:
# employment statut

employment = []
for r in risk_scores:
    u = np.random.rand()
    if r < -0.8:
        # good profile
        if u < 0.80: employment.append("CDI")
        elif u < 0.95: employment.append("CDD")
        else: employment.append("CHOMEUR")
    elif r <= 0.8:
        # intermediate
        if u < 0.50: employment.append("CDI")
        elif u < 0.85: employment.append("CDD")
        else: employment.append("CHOMEUR")
    else:
        # bad
        if u < 0.25: employment.append("CDI")
        elif u < 0.65: employment.append("CDD")
        else: employment.append("CHOMEUR")


`A Poisson distribution is used to model the number of past loans because it naturally represents count data (non-negative integers) and captures the idea of discrete credit events over time.`

`The distribution is truncated to enforce realistic bounds:
very large numbers of past loans are unlikely in a retail credit context and would introduce unrealistic outliers.
Truncation ensures numerical stability, prevents extreme values, and keeps the generated population consistent with typical retail lending behavior.`


In [0]:
past_loans = []
for r in risk_scores:
    if r < -1:
        past_loans.append(int(np.clip(np.random.poisson(3), 1, 8)))
    elif r <= 1:
        past_loans.append(int(np.clip(np.random.poisson(2), 0, 6)))
    else:
        past_loans.append(int(np.clip(np.random.poisson(1), 0, 4)))


In [0]:
#  passed defaults
prev_defaults = []
for r in risk_scores:
    u = np.random.rand()
    if r < -1:
        prev_defaults.append(1 if u < 0.01 else 0)
    elif r <= 0.5:
        prev_defaults.append(1 if u < 0.04 else 0)
    else:
        prev_defaults.append(1 if u < 0.12 else 0)


In [0]:
# Average arrears (in days), dependant on default status and risk level
arrears = []
for r, d in zip(risk_scores, prev_defaults):
    if d == 0:
        arrears.append(np.random.uniform(0, 5))
    else:
        if r <= 0:
            arrears.append(np.random.uniform(5, 30))
        else:
            arrears.append(np.random.uniform(30, 90))


In [0]:
# Credit Score (observed)
credit_score = []
for r in risk_scores:
    score = 700 - 100*r + np.random.normal(0, 20)
    credit_score.append(int(np.clip(score, 300, 900)))


In [0]:

data_customer_dim = list(zip(
    range(1, N_Customers+1),
    ages,
    income,
    employment,
    past_loans,
    prev_defaults,
    arrears,
    credit_score,
    risk_scores
))

schema = StructType([
    StructField("customer_id", IntegerType(), False),
    StructField("age", IntegerType(), False),
    StructField("annual_income", DoubleType(), False),
    StructField("employment_status", StringType(), False),
    StructField("nb_past_loans", IntegerType(), False),
    StructField("has_previous_defaults", IntegerType(), False),
    StructField("avg_past_arrears", DoubleType(), False),
    StructField("credit_score", IntegerType(), False),
    StructField("risk_score_raw", DoubleType(), False)  # Temporary
])

dim_customer_df = spark.createDataFrame(data_customer_dim, schema)
display(dim_customer_df)


In [0]:
dim_customer_path = f"{bronze_path}/dim_customer"
dim_customer_df.write.format("delta").mode("overwrite").save(dim_customer_path)


#### FACT_LOAN

#### FACT LOAN 

`This step generates the FACT_LOAN table, which represents all loans granted to the synthetic customer population.`\
`The loan portfolio is generated using a customer-centric approach, where loans emerge as outcomes of individual customer profiles rather than independent random events.`\
`The number of loans per customer is modeled as a bounded random process to reflect realistic borrowing behavior in a retail credit context.`\
`Loan characteristics are conditionally generated based on predefined product definitions, ensuring that amounts, maturities, and pricing remain coherent with product-specific constraints.`\
`Interest rates are derived from product-level base margins with controlled variability, while origination dates are drawn within a fixed historical window to simulate portfolio seasoning.`\
`This methodology ensures that loan attributes are internally consistent, structurally realistic, and aligned with typical retail banking practices.`

In [0]:
dim_customer_path = f"{bronze_path}/dim_customer"
dim_product_path  = f"{bronze_path}/dim_product"

dim_customer_df = spark.read.format("delta").load(dim_customer_path)
dim_product_df  = spark.read.format("delta").load(dim_product_path)

print(dim_customer_df.count(), "clients")
display(dim_product_df)


100000 clients


product_id,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline
CONSO_PERSO,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0
CONSO_AUTO,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0
IMMO_RESID,Prêt immo résidence principale,IMMO,Retail mortgage,Retail – secured,FIXE,180,120,360,50000.0,600000.0,REAL_ESTATE,1,0.3,HIGH,AMORTIZING,0.0
IMMO_INVEST,Prêt immo investissement,IMMO,Retail mortgage,Retail – secured,FIXE,220,120,300,70000.0,800000.0,REAL_ESTATE,1,0.35,HIGH,AMORTIZING,0.0
REVOLVING_STD,Crédit renouvelable,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,600,6,60,500.0,10000.0,NONE,0,0.9,MEDIUM,REVOLVING,0.75
CREDIT_CARTE,Ligne de crédit carte,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,650,6,60,500.0,15000.0,NONE,0,0.92,MEDIUM,REVOLVING,0.85


`Here, a simple discrete distribution is used to assign the number of loans per customer:`\

`50% of customers are assigned one loan.`\
`25% of customers are assigned two loans.`\
`10% of customers are assigned three loans.`\
`15% of customers are assigned no loan.`


In [0]:
customers_with_n = (
    dim_customer_df
    .withColumn(
        "n_loans",
        F.when(F.rand() < 0.15, F.lit(0))                  
         .when(F.rand() < 0.50, F.lit(1))                  
         .when(F.rand() < 0.80, F.lit(2))                  
         .otherwise(F.lit(3))                             
    )
)

display(customers_with_n.groupBy("n_loans").count())


n_loans,count
1,42471
3,8665
2,33729
0,15135


On garde uniquement ceux qui ont au moins 1 prêt, et on crée une ligne par prêt :

In [0]:
customers_with_loans = customers_with_n.filter(F.col("n_loans") > 0)

fact_base = (
    customers_with_loans
    .withColumn("loan_seq", F.expr("sequence(1, n_loans)"))
    .withColumn("loan_instance", F.explode("loan_seq"))
    .drop("loan_seq")
)

display(fact_base.limit(5))
print("Nombre de prêts générés (avant produits) :", fact_base.count())


customer_id,age,annual_income,employment_status,nb_past_loans,has_previous_defaults,avg_past_arrears,credit_score,risk_score_raw,n_loans,loan_instance
25001,25,92460.70100627582,CDI,6,0,2.7039542174603644,830,-1.2347980905201157,1,1
25002,51,50709.673807448526,CDD,1,0,0.0534552267466803,757,-0.623542341014754,2,1
25002,51,50709.673807448526,CDD,1,0,0.0534552267466803,757,-0.623542341014754,2,2
25003,22,21438.171674564703,CHOMEUR,3,0,1.0648888889885373,528,1.7286979429711435,1,1
25005,48,50688.02695479481,CDI,5,0,1.4735721927121204,660,0.5847164088078997,2,1


Nombre de prêts générés (avant produits) : 135924


`A random draw is used with arbitrary but realistic weights.`


In [0]:
p = F.rand()
fact_with_product = (
    fact_base
    .withColumn(
        "product_id",
        F.when(p < 0.35, F.lit("CONSO_PERSO"))      
         .when(p < 0.55, F.lit("CONSO_AUTO"))       
         .when(p < 0.75, F.lit("IMMO_RESID"))       
         .when(p < 0.85, F.lit("IMMO_INVEST"))     
         .when(p < 0.95, F.lit("REVOLVING_STD"))    
         .otherwise(F.lit("CREDIT_CARTE"))          
    )
)
display(fact_with_product.groupBy("product_id").count())

product_id,count
CONSO_AUTO,48646
CREDIT_CARTE,71
IMMO_RESID,29553
IMMO_INVEST,8377
CONSO_PERSO,47843
REVOLVING_STD,1434


`A random draw is used with arbitrary but realistic weights.`


In [0]:
fact_joined = (
    fact_with_product
    .join(dim_product_df, on="product_id", how="left")
)

display(fact_joined.limit(5))


product_id,customer_id,age,annual_income,employment_status,nb_past_loans,has_previous_defaults,avg_past_arrears,credit_score,risk_score_raw,n_loans,loan_instance,product_name,product_type,regulatory_portfolio,ifrs9_segment,interest_rate_type,base_margin_bp,typical_maturity_min_months,typical_maturity_max_months,typical_amount_min,typical_amount_max,collateral_type,secured_flag,base_lgd_level,lgd_sensitivity_to_macro,ead_profile,ccf_baseline
REVOLVING_STD,25001,25,92460.70100627582,CDI,6,0,2.7039542174603644,830,-1.2347980905201157,1,1,Crédit renouvelable,REVOLVING,Retail unsecured,Retail – revolving,VARIABLE,600,6,60,500.0,10000.0,NONE,0,0.9,MEDIUM,REVOLVING,0.75
CONSO_PERSO,25002,51,50709.673807448526,CDD,1,0,0.0534552267466803,757,-0.623542341014754,2,1,Crédit consommation personnel,CONSO,Retail unsecured,Retail – unsecured,FIXE,350,12,60,1000.0,30000.0,NONE,0,0.85,LOW,AMORTIZING,0.0
CONSO_AUTO,25002,51,50709.673807448526,CDD,1,0,0.0534552267466803,757,-0.623542341014754,2,2,Crédit auto,CONSO,Retail unsecured,Retail – unsecured,FIXE,300,24,84,3000.0,50000.0,VEHICLE,1,0.6,MEDIUM,AMORTIZING,0.0
IMMO_RESID,25003,22,21438.171674564703,CHOMEUR,3,0,1.0648888889885373,528,1.7286979429711435,1,1,Prêt immo résidence principale,IMMO,Retail mortgage,Retail – secured,FIXE,180,120,360,50000.0,600000.0,REAL_ESTATE,1,0.3,HIGH,AMORTIZING,0.0
IMMO_INVEST,25005,48,50688.02695479481,CDI,5,0,1.4735721927121204,660,0.5847164088078997,2,1,Prêt immo investissement,IMMO,Retail mortgage,Retail – secured,FIXE,220,120,300,70000.0,800000.0,REAL_ESTATE,1,0.35,HIGH,AMORTIZING,0.0


%md
`Each loan is generated with knowledge of its product-specific constraints, including typical amount ranges, maturity ranges, base margins, and contractual structure (amortizing or revolving).`\
`Loan amount (principal_amount), maturity, interest rate, and origination date are generated in a coherent manner within these constraints.`\
`In particular, the loan amount is drawn within the typical minimum and maximum range defined for each product.`


In [0]:
fact_enriched = (
    fact_joined
    .withColumn(
        "principal_amount",
        F.expr("typical_amount_min + rand() * (typical_amount_max - typical_amount_min)")
    )
)


In [0]:
fact_enriched = (
    fact_enriched
    .withColumn(
        "maturity_months",
        F.expr(
            "cast(typical_maturity_min_months + " 
            "floor(rand() * (typical_maturity_max_months - typical_maturity_min_months + 1)) as int)"
        )
    )
)


`The annual interest rate is generated as the combination of an approximate risk-free rate, the product base margin, and a small random noise term.`

In [0]:
fact_enriched = (
    fact_enriched
    .withColumn(
        "interest_rate_annual",
        0.01 + F.col("base_margin_bp") / 10000.0 + (F.rand() - 0.5) * 0.01
    )
)


In [0]:
fact_enriched = (
    fact_enriched
    .withColumn(
        "origination_date",
        F.expr("date_add(to_date('2018-01-01'), cast(rand() * 2557 as int))")
        # 2557 ≈ 7 years * 365
    )
)


`A unique loan identifier is generated for each loan.`


In [0]:
fact_enriched = fact_enriched.withColumn(
    "loan_id",
    F.monotonically_increasing_id()
)


`Columns are cleaned and only relevant fields are retained before writing the data to the Bronze layer.`







In [0]:
fact_loan_df = (
    fact_enriched.select(
        "loan_id",
        "customer_id",
        "product_id",
        "origination_date",
        "principal_amount",
        "maturity_months",
        "interest_rate_annual"
    )
)

display(fact_loan_df.limit(5))
print("Nombre final de prêts :", fact_loan_df.count())


loan_id,customer_id,product_id,origination_date,principal_amount,maturity_months,interest_rate_annual
0,25001,REVOLVING_STD,2020-11-24,3055.599406327775,47,0.0692788945505874
1,25002,CONSO_PERSO,2021-04-24,2528.691304500111,30,0.0432589008520836
2,25002,CONSO_AUTO,2024-09-14,5048.96963433808,65,0.035625398861127
3,25003,IMMO_RESID,2019-04-19,498928.70978710585,145,0.0312960801908493
4,25005,IMMO_INVEST,2022-11-20,766582.1987441902,169,0.0324395718964925


Nombre final de prêts : 135924


In [0]:
fact_loan_path = f"{bronze_path}/fact_loan"

(
    fact_loan_df
    .write
    .format("delta")
    .mode("overwrite")
    .save(fact_loan_path)
)


In [0]:
spark.read.format("delta").load(fact_loan_path).show(5)


+-------+-----------+-------------+----------------+------------------+---------------+--------------------+
|loan_id|customer_id|   product_id|origination_date|  principal_amount|maturity_months|interest_rate_annual|
+-------+-----------+-------------+----------------+------------------+---------------+--------------------+
|      0|          1|REVOLVING_STD|      2020-11-24| 3055.599406327775|             47| 0.06927889455058749|
|      1|          2|  CONSO_PERSO|      2021-04-24| 2528.691304500111|             30|  0.0432589008520836|
|      2|          2|   CONSO_AUTO|      2024-09-14|  5048.96963433808|             65|0.035625398861127075|
|      3|          3|   IMMO_RESID|      2019-04-19|498928.70978710585|            145| 0.03129608019084937|
|      4|          5|  IMMO_INVEST|      2022-11-20| 766582.1987441902|            169|0.032439571896492596|
+-------+-----------+-------------+----------------+------------------+---------------+--------------------+
only showing top 5 

#### Default_events


`At this stage, loan-level default and recovery information is generated to simulate credit events occurring over the life of the portfolio.`\
`Default events are assigned in a controlled manner, with approximately 3% to 8% of loans entering default, consistent with typical retail credit risk levels.`\
`Corresponding recovery information is generated for defaulted loans to reflect post-default outcomes.`


In [0]:
dim_customer_df = spark.read.format("delta").load(dim_customer_path)
fact_loan_df    = spark.read.format("delta").load(fact_loan_path)

print("Clients :", dim_customer_df.count())
print("Prêts    :", fact_loan_df.count())

Clients : 100000
Prêts    : 135924



`Customer-level risk information, such as credit score and previous default history, is joined to each loan to ensure that default behavior remains consistent with borrower risk profiles.`


In [0]:

loans_with_customer = (
    fact_loan_df.alias("l")
    .join(
        dim_customer_df.select(
            "customer_id",
            "credit_score",
            "has_previous_defaults"
        ).alias("c"),
        on="customer_id",
        how="left"
    )
)

display(loans_with_customer.limit(5))


customer_id,loan_id,product_id,origination_date,principal_amount,maturity_months,interest_rate_annual,credit_score,has_previous_defaults
1,0,REVOLVING_STD,2020-11-24,3055.599406327775,47,0.0692788945505874,484,0
2,1,CONSO_PERSO,2021-04-24,2528.691304500111,30,0.0432589008520836,569,1
2,2,CONSO_AUTO,2024-09-14,5048.96963433808,65,0.035625398861127,569,1
3,3,IMMO_RESID,2019-04-19,498928.70978710585,145,0.0312960801908493,810,0
5,4,IMMO_INVEST,2022-11-20,766582.1987441902,169,0.0324395718964925,640,0


%md
`A loan-level probability of default is constructed by combining product risk and borrower risk characteristics.`\
`A base default probability is first assigned according to the loan product type, reflecting structural differences in risk across retail products.`\
`This base probability is adjusted using multiplicative risk factors derived from the borrower’s credit score and previous default history.`\
`Lower credit scores and past default events increase the probability of default, while stronger borrower profiles reduce it.`\
`The resulting default probability is capped at an upper bound to avoid unrealistic extreme values and ensure numerical stability.`


In [0]:

df = loans_with_customer
df = df.withColumn(
    "base_prob",
    F.when(F.col("product_id").isin("CONSO_PERSO", "CONSO_AUTO"), F.lit(0.08))   # 8% conso
     .when(F.col("product_id").isin("IMMO_RESID", "IMMO_INVEST"), F.lit(0.03))   # 3% immo
     .when(F.col("product_id").isin("REVOLVING_STD", "CREDIT_CARTE"), F.lit(0.12)) # 12% revolving
     .otherwise(F.lit(0.05))                                                     # par défaut
)


df = df.withColumn(
    "factor_score",
    F.when(F.col("credit_score") < 550,  F.lit(1.8))   # très risqué
     .when(F.col("credit_score") < 650, F.lit(1.2))   # moyen
     .otherwise(F.lit(0.7))                           # bon client
)


df = df.withColumn(
    "factor_prev_default",
    F.when(F.col("has_previous_defaults") == 1, F.lit(1.4))
     .otherwise(F.lit(1.0))
)

df = df.withColumn(
    "default_prob_raw",
    F.col("base_prob") * F.col("factor_score") * F.col("factor_prev_default")
)

df = df.withColumn(
    "default_prob",
    F.when(F.col("default_prob_raw") > 0.5, F.lit(0.5))
     .otherwise(F.col("default_prob_raw"))
)

display(df.select("product_id","credit_score","has_previous_defaults","base_prob","default_prob").limit(10))


product_id,credit_score,has_previous_defaults,base_prob,default_prob
REVOLVING_STD,830,0,0.12,0.0839999999999999
CONSO_AUTO,830,0,0.08,0.0559999999999999
IMMO_RESID,757,0,0.03,0.0209999999999999
IMMO_RESID,528,0,0.03,0.054
CONSO_AUTO,528,0,0.08,0.144
IMMO_INVEST,684,0,0.03,0.0209999999999999
IMMO_RESID,684,0,0.03,0.0209999999999999
CONSO_PERSO,660,0,0.08,0.0559999999999999
CONSO_AUTO,676,0,0.08,0.0559999999999999
CONSO_PERSO,699,0,0.08,0.0559999999999999


%md
`Default events are simulated by drawing a uniform random variable at the loan level.`\
`A loan is classified as defaulted when the random draw falls below its previously computed probability of default.`


In [0]:

df = df.withColumn("u", F.rand())

df = df.withColumn(
    "default_flag",
    F.when(F.col("u") < F.col("default_prob"), F.lit(1)).otherwise(F.lit(0))
)

defaults_df = df.filter(F.col("default_flag") == 1)

print("Nombre de prêts en défaut simulés :", defaults_df.count())
display(defaults_df.select("loan_id","product_id","credit_score","default_prob").limit(10))


Nombre de prêts en défaut simulés : 8359


loan_id,product_id,credit_score,default_prob
8589934593,CONSO_AUTO,830,0.0559999999999999
8589934597,IMMO_INVEST,684,0.0209999999999999
8589934614,IMMO_RESID,684,0.0209999999999999
8589934661,CONSO_PERSO,581,0.1344
8589934679,CONSO_AUTO,608,0.096
8589934680,CONSO_PERSO,608,0.096
8589934717,CONSO_PERSO,578,0.1344
8589934721,IMMO_INVEST,743,0.0209999999999999
8589934751,CONSO_AUTO,516,0.144
8589934752,IMMO_RESID,516,0.054


`Given the loan origination date and maturity, the timing of default is simulated along the loan life.`\
`The default event is assigned to a random month between loan inception and approximately 80% of the loan duration, to avoid systematic defaults at maturity.`


In [0]:

defaults_df = defaults_df.withColumn(
    "months_since_origination_at_default",
    F.expr(
        "cast(1 + floor(rand() * (maturity_months * 0.8)) as int)"
    )
)

defaults_df = defaults_df.withColumn(
    "default_date",
    F.add_months(
        F.col("origination_date"),
        F.col("months_since_origination_at_default")
    )
)


`A default reason is generated for each defaulted loan, selected from a predefined set of typical credit events such as delinquency, bankruptcy, or restructuring.`







In [0]:

r = F.rand()

defaults_df = defaults_df.withColumn(
    "default_reason",
    F.when(r < 0.6,  F.lit("DELINQUENCY"))
     .when(r < 0.85, F.lit("BANKRUPTCY"))
     .otherwise(F.lit("RESTRUCTURING"))
)



In [0]:
default_events_df = defaults_df.select(
    "loan_id",
    "default_flag",
    "default_date",
    "months_since_origination_at_default",
    "default_reason"
)

display(default_events_df.limit(10))
print("Nb de lignes dans default_events :", default_events_df.count())


loan_id,default_flag,default_date,months_since_origination_at_default,default_reason
8589934593,1,2024-08-26,2,RESTRUCTURING
8589934597,1,2023-12-24,15,BANKRUPTCY
8589934614,1,2031-01-20,95,BANKRUPTCY
8589934661,1,2021-04-09,7,RESTRUCTURING
8589934679,1,2021-05-04,12,DELINQUENCY
8589934680,1,2021-06-05,12,BANKRUPTCY
8589934717,1,2025-11-27,17,DELINQUENCY
8589934721,1,2040-04-04,215,DELINQUENCY
8589934751,1,2023-10-09,23,BANKRUPTCY
8589934752,1,2041-05-20,224,DELINQUENCY


Nb de lignes dans default_events : 8359


In [0]:
# we save the dataframe


default_events_path = f"{bronze_path}/default_events"

(
    default_events_df
    .write
    .format("delta")
    .mode("overwrite")
    .save(default_events_path)
)

# Vérification
spark.read.format("delta").load(default_events_path).show(5)


+-------+------------+------------+-----------------------------------+--------------+
|loan_id|default_flag|default_date|months_since_origination_at_default|default_reason|
+-------+------------+------------+-----------------------------------+--------------+
|      1|           1|  2021-06-24|                                  2| RESTRUCTURING|
|      5|           1|  2023-04-10|                                 13|    BANKRUPTCY|
|     22|           1|  2027-01-29|                                 58|    BANKRUPTCY|
|     50|           1|  2019-12-27|                                  7| RESTRUCTURING|
|     76|           1|  2026-10-08|                                 23|   DELINQUENCY|
+-------+------------+------------+-----------------------------------+--------------+
only showing top 5 rows



#### Cashflows


%md
`To generate theoretical cash flows, we first convert the annual interest rate into a monthly rate:  r_m = r_annual / 12.`\
`For amortizing loans, the periodic payment is computed using the standard annuity formula:`\
   ` Payment = P × r_m / (1 − (1 + r_m)^(−n)), where P is the principal amount and n is the maturity in months.`\
`This payment amount is then used as the basis for building the monthly cash flow schedule.`


In [0]:

loans = fact_loan_df.withColumn(
    "monthly_rate",
    F.col("interest_rate_annual") / F.lit(12.0)
)


loans = loans.withColumn(
    "payment_amount",
    F.col("principal_amount") *
    (
        F.col("monthly_rate") /
        (F.lit(1.0) - F.pow(F.lit(1.0) + F.col("monthly_rate"), -F.col("maturity_months")))
    )
)

display(loans.select("loan_id", "principal_amount", "interest_rate_annual", "monthly_rate", "maturity_months", "payment_amount").limit(5))


loan_id,principal_amount,interest_rate_annual,monthly_rate,maturity_months,payment_amount
8589934592,7188.928685747126,0.0721183703701215,0.0060098641975101,40,202.72707128176225
8589934593,18693.453234676177,0.0419278205718587,0.0034939850476548,32,618.4550536774628
8589934594,139633.9529836643,0.0292432040913524,0.0024369336742793,325,622.506232141315
8589934595,448008.81147192506,0.0285616333159287,0.0023801361096607,163,3319.2984749809434
8589934596,6488.8009123465545,0.0430082835892724,0.0035840236324393,74,99.9840233820114


%md
`Using the computed monthly payment, a theoretical cash flow schedule is constructed for each loan.`\
`At each period, the payment is decomposed into interest and principal components based on the outstanding balance and the monthly interest rate.`\
`The outstanding principal is progressively reduced over time, ensuring consistency between payments, maturity, and total repaid amount.`


In [0]:
loans_expanded = (
    loans
    .withColumn("month_index", F.expr("sequence(1, maturity_months)"))
    .withColumn("month_index", F.explode("month_index"))
)

display(loans_expanded.limit(5))
print("Nb de lignes (prêt x mois) :", loans_expanded.count())


loan_id,customer_id,product_id,origination_date,principal_amount,maturity_months,interest_rate_annual,monthly_rate,payment_amount,month_index
8589934592,25001,REVOLVING_STD,2024-05-21,7188.928685747126,40,0.0721183703701215,0.0060098641975101,202.72707128176225,1
8589934592,25001,REVOLVING_STD,2024-05-21,7188.928685747126,40,0.0721183703701215,0.0060098641975101,202.72707128176225,2
8589934592,25001,REVOLVING_STD,2024-05-21,7188.928685747126,40,0.0721183703701215,0.0060098641975101,202.72707128176225,3
8589934592,25001,REVOLVING_STD,2024-05-21,7188.928685747126,40,0.0721183703701215,0.0060098641975101,202.72707128176225,4
8589934592,25001,REVOLVING_STD,2024-05-21,7188.928685747126,40,0.0721183703701215,0.0060098641975101,202.72707128176225,5


Nb de lignes (prêt x mois) : 13242051


`Monthly cash flows are computed by approximating the remaining outstanding balance at each period using the annuity formula.`\
`For each month, the outstanding balance is used to derive the interest component as the product of the balance and the monthly interest rate.`\
`The principal component of the payment is obtained as the difference between the total payment and the interest paid.`\
`Payment dates are generated by shifting the origination date by the corresponding month index.`


In [0]:
loans_cf = loans_expanded.withColumn(
    "power_n",
    F.pow(F.lit(1.0) + F.col("monthly_rate"), F.col("maturity_months"))
).withColumn(
    "power_k",
    F.pow(F.lit(1.0) + F.col("monthly_rate"), F.col("month_index"))
)

loans_cf = loans_cf.withColumn(
    "remaining_balance",
    F.col("principal_amount") *
    (F.col("power_n") - F.col("power_k")) /
    (F.col("power_n") - F.lit(1.0))
)

loans_cf = loans_cf.withColumn(
    "interest_paid",
    F.col("remaining_balance") * F.col("monthly_rate")
)


loans_cf = loans_cf.withColumn(
    "principal_paid",
    F.col("payment_amount") - F.col("interest_paid")
)

loans_cf = loans_cf.withColumn(
    "payment_date",
    F.add_months(F.col("origination_date"), F.col("month_index"))
)


In [0]:
cashflows_df = loans_cf.select(
    "loan_id",
    "payment_date",
    "month_index",
    "payment_amount",
    "principal_paid",
    "interest_paid",
    "remaining_balance"
)

display(cashflows_df.limit(10))
print("Nb total de lignes cashflows :", cashflows_df.count())


loan_id,payment_date,month_index,payment_amount,principal_paid,interest_paid,remaining_balance
8589934592,2024-06-21,1,202.72707128176225,160.48129523406325,42.245776047699,7029.406099592288
8589934592,2024-07-21,2,202.72707128176225,161.44576602466051,41.28130525710174,6868.924804358225
8589934592,2024-08-21,3,202.72707128176225,162.41603315373172,40.31103812803053,6707.479038333563
8589934592,2024-09-21,4,202.72707128176225,163.39213145648395,39.33493982527831,6545.063005179832
8589934592,2024-10-21,5,202.72707128176225,164.37409597747913,38.35297530428311,6381.670873723348
8589934592,2024-11-21,6,202.72707128176225,165.3619619718923,37.36510930986995,6217.296777745865
8589934592,2024-12-21,7,202.72707128176225,166.3557649067772,36.37130637498504,6051.934815773972
8589934592,2025-01-21,8,202.72707128176225,167.35554046233986,35.3715308194224,5885.579050867197
8589934592,2025-02-21,9,202.72707128176225,168.36132453321943,34.36574674854281,5718.223510404855
8589934592,2025-03-21,10,202.72707128176225,169.37315322977702,33.35391805198525,5549.862185871638


Nb total de lignes cashflows : 13242051


In [0]:
# we write the data in the container

cashflows_path = f"{bronze_path}/cashflows"

(
    cashflows_df
    .write
    .format("delta")
    .mode("overwrite")
    .save(cashflows_path)
)

spark.read.format("delta").load(cashflows_path).show(5)


+-------+------------+-----------+-----------------+------------------+------------------+------------------+
|loan_id|payment_date|month_index|   payment_amount|    principal_paid|     interest_paid| remaining_balance|
+-------+------------+-----------+-----------------+------------------+------------------+------------------+
|      0|  2020-12-24|          1|74.41785452325144|57.104930238311496|17.312924284939935| 2998.822264226176|
|      0|  2021-01-24|          2|74.41785452325144|57.434610775003044| 16.98324374824839| 2941.717333987866|
|      0|  2021-02-24|          3|74.41785452325144| 57.76619463695599|16.651659886295445|2884.2827232128643|
|      0|  2021-03-24|          4|74.41785452325144|58.099692812526186|16.318161710725253|  2826.51652857591|
|      0|  2021-04-24|          5|74.41785452325144|58.435116353507894|15.982738169743543|2768.4168357633835|
+-------+------------+-----------+-----------------+------------------+------------------+------------------+
only showi