### Data ingestion to Bronze 

In [0]:
from pyspark.sql import functions as F

# Root of the raw weather CSV files for the Central Park station. Beneath it the
# files are laid out Hive-style in year=<YYYY>/month=<M>/ directories.
raw_root = "/Volumes/project/default/my_data_volume/weather/station=central_park/"

# raw_root already ends with "/", so strip it before appending the partition
# glob. Concatenating directly yields ".../station=central_park//year=*/..."
# which relies on the filesystem layer normalising the doubled slash.
partition_glob = f"{raw_root.rstrip('/')}/year=*/month=*"

raw = (
    spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      # basePath tells partition discovery where the partitioned layout starts,
      # so year/month are kept as columns even though we load through a glob.
      .option("basePath", raw_root)
      .load(partition_glob)
)

# Preview a few rows to confirm the header and the partition columns parsed.
display(raw.limit(10))

date,tmin_c,tmax_c,tmin_f,tmax_f,prcp_mm,prcp_in,snow_mm,snow_in,snwd_mm,snwd_in,year,month
2022-01-01,10.0,13.3,50.0,55.94,20.1,0.7913385826771655,0.0,0.0,0.0,0.0,2022,1
2022-01-02,2.8,15.0,37.04,59.0,1.0,0.0393700787401574,0.0,0.0,0.0,0.0,2022,1
2022-01-03,-5.5,2.8,22.1,37.04,0.0,0.0,0.0,0.0,0.0,0.0,2022,1
2022-01-04,-7.1,1.1,19.22,33.98,0.0,0.0,0.0,0.0,0.0,0.0,2022,1
2022-01-05,-0.5,8.3,31.1,46.94,5.8,0.2283464566929134,0.0,0.0,0.0,0.0,2022,1
2022-01-06,1.1,5.0,33.98,41.0,0.0,0.0,0.0,0.0,0.0,0.0,2022,1
2022-01-07,-3.8,1.1,25.16,33.98,9.7,0.3818897637795275,147.0,5.78740157480315,150.0,5.905511811023622,2022,1
2022-01-08,-6.6,-1.0,20.12,30.2,0.0,0.0,0.0,0.0,80.0,3.1496062992125986,2022,1
2022-01-09,-1.6,4.4,29.12,39.92,2.5,0.0984251968503937,0.0,0.0,80.0,3.1496062992125986,2022,1
2022-01-10,-4.3,4.4,24.26,39.92,0.0,0.0,0.0,0.0,0.0,0.0,2022,1


In [0]:
from pyspark.sql import functions as F

# Bronze = raw rows plus ingestion metadata. year/month are cast explicitly so
# the table's partition columns have a fixed int type regardless of what
# schema inference produced.
bronze = (
    raw
    .withColumn("year", F.col("year").cast("int"))
    .withColumn("month", F.col("month").cast("int"))
    .withColumn("ingest_ts", F.current_timestamp())
)

# Replace the table with a single atomic overwrite instead of DROP TABLE + write.
# Dropping first throws away the Delta history and leaves a window in which the
# table does not exist for concurrent readers; overwriteSchema covers the case
# the DROP was guarding against (schema or partitioning changed between runs).
(bronze.write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema", "true")
  .partitionBy("year", "month")
  .saveAsTable("project.taxi_bronze.weather_raw")
)


In [0]:
%sql
-- Sanity check: how many weather records landed in each year/month partition.
-- Rows in this table are daily weather observations (not taxi trips), so the
-- count is labelled accordingly.
SELECT year, month, COUNT(*) AS daily_records
FROM project.taxi_bronze.weather_raw
GROUP BY year, month
ORDER BY year, month;


year,month,trips
2022,1,31
2022,2,28


In [0]:
# Silver: typed and validated daily weather, built from the Bronze table.
weather_bronze = spark.table("project.taxi_bronze.weather_raw")

# Silver column name -> Bronze source column; each is cast to double.
# Order matters only for the resulting column order of the table.
double_columns = {
    "min_temp_c": "tmin_c",
    "max_temp_c": "tmax_c",
    "rain_mm": "prcp_mm",
    "snow_mm": "snow_mm",
}

typed = weather_bronze.withColumn("date", F.to_date("date"))
for silver_name, bronze_name in double_columns.items():
    typed = typed.withColumn(silver_name, F.col(bronze_name).cast("double"))

# ---- WEATHER VALIDITY ----
# A row is kept only when every rule evaluates to true; a NULL in any of the
# checked columns makes the predicate NULL, so that row is dropped as well.
is_valid_reading = (
    F.col("date").isNotNull()
    & F.col("min_temp_c").between(-50, 50)
    & F.col("max_temp_c").between(-50, 60)
    & (F.col("rain_mm") >= 0)
    & (F.col("snow_mm") >= 0)
)

weather_silver = (
    typed
    .where(is_valid_reading)
    # Boolean convenience flags derived from the validated measurements.
    .withColumn("is_rain", F.col("rain_mm") > 0)
    .withColumn("is_snow", F.col("snow_mm") > 0)
)

# Persist the Silver table, replacing any previous contents.
(
    weather_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("project.taxi_silver.weather_daily")
)
