In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum as fsum, to_date, lit


# Obtain the SparkSession for this notebook; getOrCreate() reuses an
# already-active session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [0]:
# Notebook parameters: one text widget per setting, each with a default value.
dbutils.widgets.text("catalog", "new_york_taxi")
dbutils.widgets.text("bronze_schema", "bronze")
dbutils.widgets.text("silver_schema", "silver")

# Read the current widget values into the constants used to build table names.
CATALOG, BRONZE_SCHEMA, SILVER_SCHEMA = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "bronze_schema", "silver_schema")
)

In [0]:
# 1. Load the raw green-taxi trips from the bronze layer.
green_raw = spark.table(
    f"{CATALOG}.{BRONZE_SCHEMA}.green_tripdata"
)

In [0]:
# 2. Standardize the green trips
# Project the bronze columns onto snake_case silver names with explicit types,
# and tag every row with its service type.
# NOTE(review): the two timestamp columns are renamed but not cast, so they keep
# whatever type bronze stores them as — confirm that is a timestamp type.
green_projected = green_raw.select(
    lit("green").alias("service_type"),
    col("VendorID").cast("string").alias("vendor_id"),
    col("lpep_pickup_datetime").alias("pickup_ts"),
    col("lpep_dropoff_datetime").alias("dropoff_ts"),
    col("PULocationID").cast("int").alias("pickup_location_id"),
    col("DOLocationID").cast("int").alias("dropoff_location_id"),
    col("passenger_count").cast("int").alias("passenger_count"),
    col("trip_distance").cast("double").alias("trip_distance"),
    col("fare_amount").cast("double").alias("fare_amount"),
    col("tip_amount").cast("double").alias("tip_amount"),
    col("total_amount").cast("double").alias("total_amount"),
    col("payment_type").cast("string").alias("payment_type"),
)

# Row-quality rules: both timestamps must be present and the distance must be
# non-negative (a null distance fails the comparison and is dropped as well).
has_pickup_ts = col("pickup_ts").isNotNull()
has_dropoff_ts = col("dropoff_ts").isNotNull()
has_valid_distance = col("trip_distance") >= 0

green_silver = (
    green_projected
    .where(has_pickup_ts & has_dropoff_ts & has_valid_distance)
    # Calendar date of the pickup, derived from pickup_ts.
    .withColumn("trip_date", to_date(col("pickup_ts")))
)

In [0]:
# 3. Persist the standardized trips as the silver table, replacing any
# existing contents (write mode "overwrite").
(
    green_silver.write
    .mode("overwrite")
    .saveAsTable(f"{CATALOG}.{SILVER_SCHEMA}.trips_green")
)