In [1]:
"""
Bronze Layer: Raw Data Ingestion
Purpose: Load CSV file into Bronze layer with zero transformations of source data (only technical metadata columns are added)
Requirements: Preserve original schema, validate row count and basic integrity
"""

from pyspark.sql.functions import input_file_name, current_timestamp, col, trim

# Step 1: Read the raw CSV exactly as delivered.
# The first line is treated as the header; schema inference is disabled,
# so every column is loaded as a string.
df_raw = (
    spark.read
    .options(header="true", inferSchema="false", encoding="UTF-8")
    .csv("Files/bronze/house-price-timeseries.csv")
)

# Step 2: Append technical metadata columns (source file, ingestion time,
# batch id). Source columns are left untouched.
_metadata_columns = {
    "_source_file": input_file_name(),
    "_ingestion_time": current_timestamp(),
    "_batch_id": current_timestamp().cast("string"),
}
df_bronze = df_raw
for _meta_name, _meta_expr in _metadata_columns.items():
    # withColumn is applied in dict insertion order, matching the column order above.
    df_bronze = df_bronze.withColumn(_meta_name, _meta_expr)

# Step 3: Validate row count and basic integrity (no truncation)
initial_row_count = df_bronze.count()
print(f"Total rows loaded: {initial_row_count}")

# Check critical columns for null/blank values to detect truncation.
critical_columns = ["property_id", "price", "month"]
_present_columns = [c for c in critical_columns if c in df_bronze.columns]

# Count null/blank values for every present critical column in a single Spark
# job instead of launching one full scan per column.
_null_counts = {}
if _present_columns:
    # Each flag is 1 when the value is NULL or blank after trimming, else 0.
    _blank_flags = [
        (col(c).isNull() | (trim(col(c)) == "")).cast("int").alias(c)
        for c in _present_columns
    ]
    _totals = (
        df_bronze.select(*_blank_flags)
        .groupBy()
        .sum(*_present_columns)
        .first()
    )
    # Results are read positionally (same order as _present_columns).
    # SUM over zero rows yields NULL, which is reported as 0.
    _null_counts = {
        c: (_totals[i] or 0) for i, c in enumerate(_present_columns)
    }

for column in critical_columns:
    if column in _null_counts:
        null_count = _null_counts[column]
        print(f"{column} null/empty count: {null_count}")
    else:
        # Surface a missing column instead of silently skipping the check,
        # so a vacuously "clean" validation is visible in the cell output.
        print(f"WARNING: critical column '{column}' not found; null/empty check skipped")

# Step 4: Persist the Bronze DataFrame as a Delta table, overwriting any
# existing contents of the table with the same name.
bronze_table_name = "houses_price_bronze"
df_bronze.write.saveAsTable(bronze_table_name, format="delta", mode="overwrite")

# Summary for the notebook output; the row count is the one measured in Step 3.
print(f"Bronze table saved: {bronze_table_name} with {initial_row_count} rows")
print("All columns preserved as string type, no transformations applied")

StatementMeta(, 85978354-5040-4a55-a44e-3b70e36861e7, 3, Finished, Available, Finished)

Total rows loaded: 19620
property_id null/empty count: 0
price null/empty count: 0
month null/empty count: 0
Bronze table saved: houses_price_bronze with 19620 rows
All columns preserved as string type, no transformations applied
