#Bronze Layer - Raw Data Ingestion

This notebook handles the ingestion of raw customer churn data into the Bronze layer of the medallion architecture. The dataset is a CSV file containing customer demographic, financial, and engagement details from a European bank. It defines and applies an explicit schema for data consistency and type safety, performs column renaming to ensure naming consistency, and writes the raw data to a Delta table (`bronze_customers`) for further processing in the Silver layer. 



In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

file_path = "/Volumes/workspace/default/volumes/Customer-Churn-Records.csv"

# Define the schema for the dataset
schema = StructType([
    StructField("RowNumber", IntegerType(), True),
    StructField("CustomerId", IntegerType(), True),
    StructField("Surname", StringType(), True),
    StructField("CreditScore", IntegerType(), True),
    StructField("Geography", StringType(), True),
    StructField("Gender", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Tenure", IntegerType(), True),
    StructField("Balance", FloatType(), True),
    StructField("NumOfProducts", IntegerType(), True),
    StructField("HasCrCard", IntegerType(), True),
    StructField("IsActiveMember", IntegerType(), True),
    StructField("EstimatedSalary", FloatType(), True),
    StructField("Exited", IntegerType(), True),
    StructField("Complain", IntegerType(), True),
    StructField("Satisfaction Score", IntegerType(), True),
    StructField("Card Type", StringType(), True),
    StructField("Point Earned", IntegerType(), True),
])

# Read the CSV file 
df_bronze = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(schema)
    .load(file_path)
    .withColumn("time_ingested", current_timestamp())
)

# Rename the columns in the dataset to ensure consistency and remove spaces to ensure compatibility with Delta Lake
df_bronze = (df_bronze
             .withColumnRenamed("RowNumber", "row_number")
             .withColumnRenamed("CustomerId", "customer_id")
             .withColumnRenamed("Surname", "surname")
             .withColumnRenamed("CreditScore", "credit_score")
             .withColumnRenamed("Geography", "geography")
             .withColumnRenamed("Gender", "gender")
             .withColumnRenamed("Age", "age")
             .withColumnRenamed("Tenure", "tenure")
             .withColumnRenamed("NumOfProducts", "num_of_products")
             .withColumnRenamed("HasCrCard", "has_cr_card")
             .withColumnRenamed("IsActiveMember", "is_active_member")
             .withColumnRenamed("EstimatedSalary", "estimated_salary")
             .withColumnRenamed("Exited", "exited")
             .withColumnRenamed("Complain", "complain")
             .withColumnRenamed("Satisfaction Score", "satisfaction_score")
             .withColumnRenamed("Card Type", "card_type")
             .withColumnRenamed("Point Earned", "point_earned")
             )

# Save the bronze table as a delta table
df_bronze.write.format("delta").mode("overwrite").saveAsTable("bronze_customers")


In [0]:
# Display the first 10 records of the table
display(df_bronze.limit(10))

row_number,customer_id,surname,credit_score,geography,gender,age,tenure,Balance,num_of_products,has_cr_card,is_active_member,estimated_salary,exited,complain,satisfaction_score,card_type,point_earned,time_ingested
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464,2025-07-30T18:16:01.307Z
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456,2025-07-30T18:16:01.307Z
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377,2025-07-30T18:16:01.307Z
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350,2025-07-30T18:16:01.307Z
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425,2025-07-30T18:16:01.307Z
6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.7,1,1,5,DIAMOND,484,2025-07-30T18:16:01.307Z
7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0,0,2,SILVER,206,2025-07-30T18:16:01.307Z
8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1,1,2,DIAMOND,282,2025-07-30T18:16:01.307Z
9,15792365,He,501,France,Male,44,4,142051.06,2,0,1,74940.5,0,0,3,GOLD,251,2025-07-30T18:16:01.307Z
10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0,0,3,GOLD,342,2025-07-30T18:16:01.307Z
