In [0]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import DoubleType, TimestampType

In [0]:
# Catalog identifiers used to qualify the silver table name
catalog_name = "nyc_taxi_catalog"
schema = "nyc_taxi_lakehouse"
silver_table_name = "silver_fhv_tripdata"

# Storage locations of the bronze (input) and silver (output) Delta tables
bronze_table_path = "gs://databricks-glk-dbx-ext-storage/nyc_taxi_lakehouse/bronze/bronze_fhv_tripdata"
silver_table_path = "gs://databricks-glk-dbx-ext-storage/nyc_taxi_lakehouse/silver/silver_fhv_tripdata"

In [0]:
# Select the working catalog first, then the schema inside it
for context_statement in (f"USE CATALOG {catalog_name}", f"USE SCHEMA {schema}"):
    spark.sql(context_statement)

In [0]:
# Load the bronze Delta table from its storage path
bronze_reader = spark.read.format("delta")
df_bronze = bronze_reader.load(bronze_table_path)
print("Bronze table read successfully")

# Show the schema, then the row count (count() runs a Spark job over the table)
df_bronze.printSchema()
bronze_row_count = df_bronze.count()
print(f"Total number of bronze table : {bronze_row_count}")

In [0]:
# Remove duplicate rows: key on trip_id when the column exists,
# otherwise compare rows across every column
if "trip_id" in df_bronze.columns:
    dedup_cols = ["trip_id"]
else:
    dedup_cols = df_bronze.columns

df_clean = df_bronze.dropDuplicates(dedup_cols)
dedup_row_count = df_clean.count()
print(f"Total rows after deduplication: {dedup_row_count}")

In [0]:
# Typecasting columns
# Target Spark type per column; names missing from the DataFrame are skipped.
typecast_mapping = {
    "pickup_datetime": TimestampType(),
    "dropoff_datetime": TimestampType(),
    "request_datetime": TimestampType(),
    "on_scene_datetime": TimestampType(),
    "base_passenger_fare": DoubleType(),
    "tolls": DoubleType(),
    "sales_tax": DoubleType(),
    "congestion_surcharge": DoubleType(),
    "tips": DoubleType(),
    "driver_pay": DoubleType(),
    "trip_miles": DoubleType(),
    "trip_time": DoubleType(),
}

# Apply every cast in ONE projection. Calling withColumn once per column in a
# loop adds a projection to the plan on each call, which the PySpark docs warn
# can produce large plans; a single select keeps the same column names and the
# same column order.
casted_columns = []
for name in df_clean.columns:
    # Backticks make Spark treat the whole name literally (e.g. names with dots).
    column = col(f"`{name}`")
    if name in typecast_mapping:
        column = column.cast(typecast_mapping[name]).alias(name)
    casted_columns.append(column)
df_clean = df_clean.select(*casted_columns)

# Null handling: drop rows missing a critical timestamp.
# NOTE(review): no other invalid-value filtering (e.g. dropoff before pickup)
# is performed here — confirm whether that is intended.
critical_cols = [c for c in ["pickup_datetime", "dropoff_datetime"] if c in df_clean.columns]
if critical_cols:
    df_clean = df_clean.dropna(subset=critical_cols)

print(f"Total rows after null handling: {df_clean.count()}")

In [0]:
# Persist the cleaned DataFrame as the silver Delta table at its storage path,
# overwriting whatever was written there before
silver_writer = (
    df_clean.write
    .mode("overwrite")
    .format("delta")
    .option("mergeSchema", "true")
)
silver_writer.save(silver_table_path)

# Register the table in the catalog: drop any existing entry, then create a
# table definition that points at the location just written
drop_statement = f"drop table if exists {catalog_name}.{schema}.{silver_table_name}"
create_statement = f"""
          CREATE TABLE {catalog_name}.{schema}.{silver_table_name}
          USING DELTA
          LOCATION '{silver_table_path}'
          """
for registration_statement in (drop_statement, create_statement):
    spark.sql(registration_statement)
print("Silver table created successfully and registered in catalog.")

In [0]:
# Preview the first five rows of the registered silver table
silver_preview = spark.sql(f"select * from {catalog_name}.{schema}.{silver_table_name}").limit(5)
silver_preview.display()