In [0]:
from pyspark.sql import functions as F

# Root locations of the Silver (input) and Gold (output) layers.
SILVER_BASE = "/Volumes/workspace/default/prueba/etl_project/silver"
GOLD_BASE = "/Volumes/workspace/default/prueba/etl_project/gold"


# Read the two Silver Delta tables used by the rest of the notebook.
population_silver = spark.read.load(f"{SILVER_BASE}/population", format="delta")
co2_silver = spark.read.load(f"{SILVER_BASE}/co2", format="delta")


In [0]:
# Inner-join population and CO2 rows that share the same country and year.
# Both sides keep their own "country" and "year" columns in the result.
same_country = population_silver["country"] == co2_silver["country"]
same_year = population_silver["year"] == co2_silver["year"]

gold_df = population_silver.join(co2_silver, on=same_country & same_year, how="inner")

In [0]:
# Derive CO2 per 1000 people from the joined "co2" and "population" columns.
# The division is only evaluated where population is non-zero; rows with a
# zero or NULL population get NULL, so the cell does not fail with a
# divide-by-zero error when ANSI SQL mode is enabled.
population_col = F.col("population")
gold_df = gold_df.withColumn(
    "co2_per_1000",
    F.when(population_col != 0, (F.col("co2") / population_col) * 1000),
)

In [0]:
# Keep only the Gold output columns. "country" and "year" exist on both sides
# of the join, so they are taken explicitly from the population DataFrame.
key_columns = [population_silver["country"], population_silver["year"]]
metric_columns = ["population", "co2", "co2_per_capita", "co2_per_1000"]

gold_df = gold_df.select(*key_columns, *metric_columns)

# Preview the 10 rows with the highest co2_per_1000 values.
top_by_co2_per_1000 = gold_df.sort(F.col("co2_per_1000").desc())
top_by_co2_per_1000.show(n=10)

In [0]:
# Persist the Gold DataFrame in Delta format, overwriting whatever is already
# stored at the target path.
gold_df.write.format("delta").mode("overwrite").save(f"{GOLD_BASE}/population_co2_metrics")

# Confirmation message. The leading emoji had been stored as mojibake
# (UTF-8 bytes decoded as cp1252); it is restored to the intended check mark.
print("✅ Gold layer created")

In [0]:
# Print the extended query plans (logical and physical) for the Gold DataFrame.
gold_df.explain(extended=True)