In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

#### Data Reading

In [0]:
# Read the Parquet data stored under the bronze container's yellow_tripdata path.
df = spark.read.parquet(
    'abfss://bronze@databrickseteprojectsa.dfs.core.windows.net/yellow_tripdata'
)

In [0]:
# Remove the `_rescued_data` column from the frame before any further processing.
df = df.drop("_rescued_data")

###### Parse & Standardize Timestamps

In [0]:
# Convert the pick-up and drop-off columns to timestamps, overwriting each column
# under its existing name.
df = (
    df
    .withColumn('tpep_pickup_datetime', to_timestamp(col('tpep_pickup_datetime')))
    .withColumn('tpep_dropoff_datetime', to_timestamp(col('tpep_dropoff_datetime')))
)

##### Fix Data Types

In [0]:
# Cast columns to their intended types, keeping the same column names:
# passenger_count, RatecodeID and payment_type become integers; trip_distance a double.
df = (
    df
    .withColumn('passenger_count', col('passenger_count').cast('integer'))
    .withColumn('trip_distance', col('trip_distance').cast('double'))
    .withColumn('RatecodeID', col('RatecodeID').cast('integer'))
    .withColumn('payment_type', col('payment_type').cast('integer'))
)


##### Remove Bad / Invalid Rows

In [0]:
# Keep only rows where distance, fare and total are all strictly positive and the
# drop-off happens strictly after the pick-up.
df = df.where(
    (col("trip_distance") > 0)
    & (col('fare_amount') > 0)
    & (col('total_amount') > 0)
    & (col('tpep_pickup_datetime') < col('tpep_dropoff_datetime'))
)

#### Drop Rows with NULLs in Critical Columns

In [0]:
# Columns that must be populated for a row to be kept.
critical_columns = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "fare_amount",
    "PULocationID",
    "DOLocationID",
]

# Drop every row that has a NULL in at least one of the columns listed above.
df = df.na.drop(subset=critical_columns)

##### Deduplication

In [0]:
# Columns whose combined values are treated as identifying one trip.
dedup = [
    'VendorID',
    'PULocationID',
    'DOLocationID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
]

# Keep a single row for each distinct combination of those columns.
df = df.dropDuplicates(dedup)

##### Standardized Derived Columns

In [0]:
# Add three derived columns:
#   trip_date             - date part of the pick-up timestamp
#   trip_duration_minutes - (drop-off epoch seconds - pick-up epoch seconds) / 60
#   load_date             - current date at the time the job runs
df = (
    df
    .withColumn('trip_date', to_date(col('tpep_pickup_datetime')))
    .withColumn(
        'trip_duration_minutes',
        (
            unix_timestamp(col('tpep_dropoff_datetime'))
            - unix_timestamp(col('tpep_pickup_datetime'))
        ) / 60,
    )
    .withColumn('load_date', current_date())
)

##### Create a surrogate key for each unique trip

In [0]:
# Build the surrogate key: each column named in `dedup` is cast to string, the values
# are joined with '||', and the result is hashed with SHA-256 into `trip_key`.
# Rows that end up with the same trip_key are then collapsed to one.
df_silver = (
    df
    .withColumn(
        'trip_key',
        sha2(concat_ws('||', *(col(name).cast("string") for name in dedup)), 256),
    )
    .dropDuplicates(['trip_key'])
)

#### Incremental MERGE

In [0]:
from delta.tables import DeltaTable

In [0]:
# Load df_silver into the silver Delta table.
# - Table already exists: MERGE on trip_key and insert only the source rows that have
#   no matching key in the target; rows that do match are left untouched.
# - Table does not exist: create it from df_silver, partitioned by trip_date.
if spark.catalog.tableExists('databricks_catalog.silver.NYC_Taxi_Trips'):
    dt = DeltaTable.forName(spark, 'databricks_catalog.silver.NYC_Taxi_Trips')

    (
        dt.alias('trg')
        # The condition is a single SQL expression string. Writing it as a Python `==`
        # between two string literals would evaluate to False before merge() ever
        # sees it, so the comparison has to live inside the string.
        .merge(df_silver.alias('src'), 'src.trip_key = trg.trip_key')
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    (
        df_silver.write
        .format('delta')
        .mode('overwrite')
        .partitionBy('trip_date')
        .saveAsTable('databricks_catalog.silver.NYC_Taxi_Trips')
    )