In [0]:
from pyspark.sql.functions import * 
from delta.tables import DeltaTable 

###### Table names

In [0]:
# Fully qualified table names (catalog.schema.table) used by this notebook.

# Source: Silver-layer trip records.
SILVER_TRIPS = "databricks_catalog.silver.NYC_Taxi_Trips"

# Gold-layer dimensions the fact rows are keyed against.
DIM_LOC_TBL = "databricks_catalog.gold.dim_location_scd2"
DIM_PAY_TBL = "databricks_catalog.gold.dim_payment_type_scd1"
DIM_RATE_TBL = "databricks_catalog.gold.dim_rate_code_scd1"

# Target: Gold-layer fact table written by the last cell.
GOLD_FACT = "databricks_catalog.gold.fact_trip"

#### Read Silver trips

In [0]:
# Load the Silver trips table; the fact build further down starts from this DataFrame.
t = spark.table(SILVER_TRIPS)

In [0]:
# Fail fast if Silver lacks the columns the fact build keys on: the trip
# identifier, both location IDs (joined to the location dimension) and both
# timestamps (compared against the dimension's effective_from/effective_to).
required = ["trip_key", "PULocationID", "DOLocationID", "tpep_pickup_datetime", "tpep_dropoff_datetime"]
missing = [c for c in required if c not in t.columns]
if missing:
    raise ValueError(f"Silver trips missing columns: {missing}")

# Ensure trip_date exists (the Gold fact table is partitioned by it); when
# Silver does not provide it, derive it from the pickup timestamp.
# `to_date` comes from the `pyspark.sql.functions` star import. No `F` alias is
# imported in this notebook, so `F.to_date` would raise NameError on this path.
if "trip_date" not in t.columns:
    t = t.withColumn("trip_date", to_date("tpep_pickup_datetime"))

#### Read location dimension (SCD2)

In [0]:
# Location dimension lookup for the validity-window joins.
# Every version row is kept (no filtering happens here); only the columns the
# joins need are projected.
loc = spark.table(DIM_LOC_TBL)

# Rows with a NULL effective_to get a far-future end so the
# `timestamp < effective_to` comparison in the join still matches them.
far_future_ts = to_timestamp(lit("2999-12-31"))

loc_curr = loc.select(
    col("locationid").alias("location_id"),
    col("location_sk").alias("location_sk"),
    col("effective_from").alias("effective_from"),
    coalesce(col("effective_to"), far_future_ts).alias("effective_to"),
)

#### Read payment type dimension (SCD1)

In [0]:
# Payment-type lookup: natural key (payment_type) -> surrogate key (payment_sk).
pay = spark.table(DIM_PAY_TBL)
pay_cols = [col(name).alias(name) for name in ("payment_type", "payment_sk")]
pay_lkp = pay.select(*pay_cols)

##### Read rate code dimension (SCD1)

In [0]:
# Rate-code lookup, renamed to the names the fact build joins on:
# RatecodeID -> ratecode_id (natural key), rate_sk -> ratecode_sk (surrogate key).
rate = spark.table(DIM_RATE_TBL)
rate_renames = {"RatecodeID": "ratecode_id", "rate_sk": "ratecode_sk"}
rate_lkp = rate.select(*[col(src).alias(dst) for src, dst in rate_renames.items()])

##### Build fact (joins: pickup location SCD2 + drop-off location SCD2 + payment type + rate code)

In [0]:
# Alias every input once so join predicates and the projection can use
# qualified column names. The location lookup is aliased twice because it is
# joined separately for pickup ("pu") and drop-off ("do").
trips = t.alias("t")
pu_loc = loc_curr.alias("pu")
do_loc = loc_curr.alias("do")
pay_dim = pay_lkp.alias("p")
rate_dim = rate_lkp.alias("r")

# Location row whose [effective_from, effective_to) window contains the pickup time.
pu_on = (
    (col("t.PULocationID") == col("pu.location_id"))
    & (col("t.tpep_pickup_datetime") >= col("pu.effective_from"))
    & (col("t.tpep_pickup_datetime") < col("pu.effective_to"))
)

# Location row whose [effective_from, effective_to) window contains the drop-off time.
do_on = (
    (col("t.DOLocationID") == col("do.location_id"))
    & (col("t.tpep_dropoff_datetime") >= col("do.effective_from"))
    & (col("t.tpep_dropoff_datetime") < col("do.effective_to"))
)

# Plain equality lookups for the two SCD1 dimensions.
pay_on = col("t.payment_type") == col("p.payment_type")
rate_on = col("t.RatecodeID") == col("r.ratecode_id")

# (qualified dimension key, output column). All joins are left joins, so an
# unmatched trip has a NULL key here; it is replaced by -1.
sk_columns = [
    ("pu.location_sk", "pu_location_sk"),
    ("do.location_sk", "do_location_sk"),
    ("p.payment_sk", "payment_sk"),
    ("r.ratecode_sk", "ratecode_sk"),
]

# Measures copied from Silver unchanged, in output order.
measure_names = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
]

fact = (
    trips
    .join(pu_loc, pu_on, "left")
    .join(do_loc, do_on, "left")
    .join(pay_dim, pay_on, "left")
    .join(rate_dim, rate_on, "left")
    .select(
        # Key and partition column
        col("t.trip_key"),
        col("t.trip_date"),
        # Timestamps
        col("t.tpep_pickup_datetime").alias("pickup_ts"),
        col("t.tpep_dropoff_datetime").alias("dropoff_ts"),
        # Dimension surrogate keys (default -1 if not found)
        *[coalesce(col(src), lit(-1)).alias(dst) for src, dst in sk_columns],
        # Measures
        *[col(f"t.{name}") for name in measure_names],
        # Audit
        current_timestamp().alias("load_ts"),
    )
)


In [0]:
# Keep a single row per trip_key so the merge in the write cell sees at most one
# source row per target row. NOTE(review): which of several duplicate rows
# survives is not specified here — confirm that is acceptable.
fact = fact.drop_duplicates(["trip_key"])

#### Write Gold fact (MERGE upsert on trip_key)

In [0]:
# Persist `fact` to the Gold fact table.
if not spark.catalog.tableExists(GOLD_FACT):
    # First load: create the table as Delta, partitioned by trip_date.
    gold_writer = (
        fact.write
        .format("delta")
        .mode("overwrite")
        .partitionBy("trip_date")  # recommended
    )
    gold_writer.saveAsTable(GOLD_FACT)
else:
    # Table already exists: merge on trip_key. Matched rows are overwritten
    # with the source values (including load_ts); unmatched rows are inserted.
    dt = DeltaTable.forName(spark, GOLD_FACT)
    upsert = (
        dt.alias("trg")
        .merge(fact.alias("src"), "trg.trip_key = src.trip_key")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    upsert.execute()