In [0]:
%sql
-- Remove any existing silver table so the write cell below recreates it from scratch.
-- IF EXISTS keeps this cell safe to run when the table has not been created yet.
DROP TABLE IF EXISTS silver_census_population;


In [0]:
from pyspark.sql.functions import col, upper, trim, initcap

# Build the silver census table: read bronze, standardize the text columns,
# drop rows that fail the quality checks, and persist the result as Delta.

# Accepted range for the `year` column (inclusive on both ends).
MIN_YEAR, MAX_YEAR = 1990, 2100

bronze_df = spark.read.table("bronze_census_population")

silver_df = (
    bronze_df
    # -----------------------
    # Standardization
    # -----------------------
    # Done BEFORE the quality checks so the checks validate the cleaned values
    # (e.g. a whitespace-only region becomes "" here and is rejected below).
    .withColumn("region", upper(trim(col("region"))))
    .withColumn("gender", initcap(trim(col("gender"))))
    .withColumn("age_group", trim(col("age_group")))
    .withColumn("ethnic_group", initcap(trim(col("ethnic_group"))))

    # -----------------------
    # Data quality checks
    # -----------------------
    # Region must be present and non-empty after trimming.
    .filter(col("region").isNotNull())
    .filter(col("region") != "")
    # A predicate that evaluates to NULL also drops the row, so rows with a
    # missing year / population / rate are removed by the checks below.
    .filter(col("year").between(MIN_YEAR, MAX_YEAR))
    .filter(col("population") > 0)
    .filter(col("literacy_rate").between(0, 100))
    .filter(col("employment_rate").between(0, 100))
)

# Replace the contents of the silver table with the freshly cleaned data.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver_census_population")
)


In [0]:
# Sanity check on the persisted silver table: print its schema, then a 5-row sample.
silver_written = spark.read.table("silver_census_population")
silver_written.printSchema()
silver_written.show(5)


root
 |-- region: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- ethnic_group: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- literacy_rate: double (nullable = true)
 |-- employed: integer (nullable = true)
 |-- employment_rate: double (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_path: string (nullable = true)

+-------+----+---------+------+------------+----------+-------------+--------+---------------+--------------------+--------------------+
| region|year|age_group|gender|ethnic_group|population|literacy_rate|employed|employment_rate| ingestion_timestamp|         source_path|
+-------+----+---------+------+------------+----------+-------------+--------+---------------+--------------------+--------------------+
|   WEST|2023|    15-24|  Male|     Group A|     23291|        57.13|    5684|           24.4|2026-01-02 08:06:.