## Bronze Load - Yellow Taxi

In [0]:
# Restart the notebook's Python interpreter before any other cell runs.
# NOTE(review): presumably so freshly installed/updated libraries are picked
# up by the cells that follow — confirm.
dbutils.library.restartPython()


In [0]:
import sys

sys.path.insert(0, "../lib/")

from utils import import_schema, export_schema, table_exists

def cast_yellow_taxi_data(df):
    """Cast the yellow-taxi trip columns to their target Spark types.

    Every column named in the table below is replaced, in order, by the same
    column cast to the paired type. Columns that are not listed are passed
    through unchanged.

    Args:
        df: Spark DataFrame of raw yellow-taxi trips; it is expected to
            contain each column named below.

    Returns:
        A DataFrame with the listed columns re-typed.
    """
    # (column name, target type) pairs, applied top to bottom.
    target_types = [
        ("VendorID", ByteType()),
        ("passenger_count", ByteType()),
        ("trip_distance", "double"),
        ("RatecodeID", ByteType()),
        ("PULocationID", ShortType()),
        ("DOLocationID", ShortType()),
        ("payment_type", ByteType()),
        ("fare_amount", DecimalType(10, 2)),
        ("extra", DecimalType(7, 2)),
        ("mta_tax", DecimalType(5, 2)),
        ("tip_amount", DecimalType(12, 2)),
        ("tolls_amount", DecimalType(7, 2)),
        ("improvement_surcharge", DecimalType(3, 2)),
        ("total_amount", DecimalType(10, 2)),
        ("congestion_surcharge", DecimalType(4, 2)),
        ("airport_fee", DecimalType(3, 2)),
    ]

    casted = df
    for column_name, target_type in target_types:
        casted = casted.withColumn(column_name, col(column_name).cast(target_type))
    return casted

def cast_green_taxi_data(df):
    """Cast the green-taxi trip columns to their target Spark types.

    Every column named in the table below is replaced, in order, by the same
    column cast to the paired type. Columns that are not listed are passed
    through unchanged.

    Args:
        df: Spark DataFrame of raw green-taxi trips; it is expected to
            contain each column named below.

    Returns:
        A DataFrame with the listed columns re-typed.
    """
    # (column name, target type) pairs, applied top to bottom.
    target_types = [
        ("VendorID", ByteType()),
        ("passenger_count", ByteType()),
        ("trip_distance", "double"),
        ("RatecodeID", ByteType()),
        ("PULocationID", ShortType()),
        ("DOLocationID", ShortType()),
        ("payment_type", ByteType()),
        ("fare_amount", DecimalType(10, 2)),
        ("extra", DecimalType(7, 2)),
        ("mta_tax", DecimalType(5, 2)),
        ("tip_amount", DecimalType(12, 2)),
        ("tolls_amount", DecimalType(7, 2)),
        ("improvement_surcharge", DecimalType(3, 2)),
        ("total_amount", DecimalType(10, 2)),
        ("trip_type", ByteType()),
        ("congestion_surcharge", DecimalType(4, 2)),
        ("ehail_fee", DecimalType(3, 2)),
    ]

    casted = df
    for column_name, target_type in target_types:
        casted = casted.withColumn(column_name, col(column_name).cast(target_type))
    return casted

In [0]:
# Turn off Spark's vectorized Parquet reader for this session.
# NOTE(review): presumably required because the raw monthly files disagree on
# the physical type of some columns (the cast_* helpers normalise them after
# the read) — confirm before removing.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")


In [0]:
from pyspark.sql.functions import col, year, month
from pyspark.sql.types import DecimalType, ByteType, ShortType

# Bronze target table and raw source folder for the yellow-taxi load.
catalog = "bronze"
schema = "taxi"
table_name = "yellow_taxi_partitioned"
path = f"/Volumes/raw/{schema}/yellow_taxi"

# Load one monthly Parquet file at a time (2011-01 through 2021-12), cast it,
# derive the year/month partition columns from the pickup timestamp and write
# it to the partitioned Delta table.
for y in range(2011, 2022):
    for m in range(1, 13):
        file_path = f"{y}/yellow_tripdata_{y}-{m:02d}.parquet"
        df = spark.read.format('parquet').load(f"{path}/{file_path}")

        pickup_ts = col("tpep_pickup_datetime")
        df_casted = (
            cast_yellow_taxi_data(df)
            .withColumn("year", year(pickup_ts))
            .withColumn("month", month(pickup_ts))
        )

        # Create the table with the first file; append every later one.
        if table_exists(spark, catalog, schema, table_name):
            write_mode = "append"
        else:
            write_mode = "overwrite"

        (
            df_casted.write.format("delta")
            .mode(write_mode)
            .partitionBy("year", "month")
            .saveAsTable(f"{catalog}.{schema}.{table_name}")
        )

## Bronze Load - Green Taxi

In [0]:

# Preview a single raw green-taxi file (January 2014) to inspect its columns.
table_name = "green_taxi"
schema = 'taxi'
# Built from the components above; resolves to
# /Volumes/raw/taxi/green_taxi/2014/green_tripdata_2014-01.parquet
path = f"/Volumes/raw/{schema}/{table_name}/2014/green_tripdata_2014-01.parquet"

spark.read.format('parquet').load(path).display()


In [0]:
# `year` and `month` are imported here so the cell is self-contained and so a
# re-run restores them if an earlier cell rebound those names.
from pyspark.sql.functions import col, year, month
from pyspark.sql.types import DecimalType, ByteType, ShortType

# Bronze target table and raw source folder for the green-taxi load.
# `schema` is set by an earlier cell of this notebook.
catalog = 'bronze'
table_name = "green_taxi_partitioned"
path = f"/Volumes/raw/{schema}/green_taxi"

# Load one monthly Parquet file at a time (2014-01 through 2021-12), cast it,
# derive the year/month partition columns from the pickup timestamp and write
# it to the partitioned Delta table.
#
# The loop variables are deliberately `y`/`m`, not `year`/`month`: reusing
# those names would rebind the pyspark `year()`/`month()` functions to ints
# and the calls below would fail with "'int' object is not callable".
for y in range(2014, 2022):
    for m in range(1, 13):
        file_path = f"{y}/green_tripdata_{y}-{m:02d}.parquet"
        df = spark.read.format('parquet').load(f"{path}/{file_path}")

        pickup_ts = col("lpep_pickup_datetime")
        df_casted = (
            cast_green_taxi_data(df)
            .withColumn("year", year(pickup_ts))
            .withColumn("month", month(pickup_ts))
        )

        # Create the table with the first file; append every later one.
        if table_exists(spark, catalog, schema, table_name):
            write_mode = "append"
        else:
            write_mode = "overwrite"

        (
            df_casted.write.format("delta")
            .mode(write_mode)
            .partitionBy("year", "month")
            .saveAsTable(f"{catalog}.{schema}.{table_name}")
        )


In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import DecimalType, ByteType, ShortType

# Bronze target and raw source folder for the un-partitioned green-taxi load.
# `schema` is set by an earlier cell of this notebook.
catalog = 'bronze'
table_name = "green_taxi"
path = f"/Volumes/raw/{schema}/{table_name}"

# The data is written to "<table_name>_t", so that is the table whose
# existence decides between creating and appending. Checking `table_name`
# itself would overwrite the target on every iteration whenever `green_taxi`
# is absent, leaving only the last year's rows.
target_table = f"{table_name}_t"

# Read each year's folder (2014 through 2021) in one go, cast it and write it.
# The loop variable is `y`, not `year`, so the pyspark `year()` function used
# by other cells is not rebound to an int.
for y in range(2014, 2022):
    file_path = f"{y}"
    df = spark.read.format('parquet').load(f"{path}/{file_path}")

    df_casted = cast_green_taxi_data(df)

    # Create the table with the first year; append every later one.
    if table_exists(spark, catalog, schema, target_table):
        write_mode = "append"
    else:
        write_mode = "overwrite"

    (
        df_casted.write.format("delta")
        .mode(write_mode)
        .saveAsTable(f"{catalog}.{schema}.{target_table}")
    )


In [0]:
%sql
-- Trip count per calendar month of pickup for yellow taxis, oldest first.
-- NOTE(review): the load cell in this notebook writes to
-- bronze.taxi.yellow_taxi_partitioned, while this query reads
-- bronze.taxi.yellow_taxi — confirm this is the table you mean to check.
SELECT
  date_trunc('month', tpep_pickup_datetime) AS month,
  COUNT(*) AS trip_count
FROM bronze.taxi.yellow_taxi
GROUP BY month
ORDER BY month

In [0]:
%sql
-- Trip count per calendar month of pickup for green taxis, oldest first.
-- NOTE(review): the load cells in this notebook write to
-- bronze.taxi.green_taxi_partitioned and bronze.taxi.green_taxi_t, while this
-- query reads bronze.taxi.green_taxi — confirm this is the table you mean to
-- check.
SELECT
  date_trunc('month', lpep_pickup_datetime) AS month,
  COUNT(*) AS trip_count
FROM bronze.taxi.green_taxi
GROUP BY month
ORDER BY month