### Run Spark

In [39]:
import pyspark
from pyspark.sql import SparkSession

In [40]:
# Obtain a Spark session running in local mode (master 'local[*]') under the
# application name 'test'. getOrCreate() hands back an already-running session
# if one exists, so re-running this cell does not start a second one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

### Load Raw Files and Write Them Back Repartitioned

In [41]:
from pyspark.sql import types
from pyspark.sql import functions as F

In [45]:
# Target column types for the yellow-taxi files, listed as (name, type) pairs.
# Every column is declared nullable, so the flag is applied once in the
# comprehension instead of being repeated on each field.
yellow_schema = types.StructType(
    [
        types.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("VendorID", types.LongType()),
            ("tpep_pickup_datetime", types.TimestampType()),
            ("tpep_dropoff_datetime", types.TimestampType()),
            ("passenger_count", types.DoubleType()),
            ("trip_distance", types.DoubleType()),
            ("RatecodeID", types.DoubleType()),
            ("store_and_fwd_flag", types.StringType()),
            ("PULocationID", types.LongType()),
            ("DOLocationID", types.LongType()),
            ("payment_type", types.LongType()),
            ("fare_amount", types.DoubleType()),
            ("extra", types.DoubleType()),
            ("mta_tax", types.DoubleType()),
            ("tip_amount", types.DoubleType()),
            ("tolls_amount", types.DoubleType()),
            ("improvement_surcharge", types.DoubleType()),
            ("total_amount", types.DoubleType()),
            ("congestion_surcharge", types.DoubleType()),
            ("airport_fee", types.DoubleType()),
        )
    ]
)

# Target column types for the green-taxi files, listed as (name, type) pairs.
# Every column is declared nullable, so the flag is applied once in the
# comprehension instead of being repeated on each field.
green_schema = types.StructType(
    [
        types.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("VendorID", types.LongType()),
            ("lpep_pickup_datetime", types.TimestampType()),
            ("lpep_dropoff_datetime", types.TimestampType()),
            ("store_and_fwd_flag", types.StringType()),
            ("RatecodeID", types.DoubleType()),
            ("PULocationID", types.LongType()),
            ("DOLocationID", types.LongType()),
            ("passenger_count", types.DoubleType()),
            ("trip_distance", types.DoubleType()),
            ("fare_amount", types.DoubleType()),
            ("extra", types.DoubleType()),
            ("mta_tax", types.DoubleType()),
            ("tip_amount", types.DoubleType()),
            ("tolls_amount", types.DoubleType()),
            ("ehail_fee", types.DoubleType()),
            ("improvement_surcharge", types.DoubleType()),
            ("total_amount", types.DoubleType()),
            ("payment_type", types.LongType()),
            ("trip_type", types.IntegerType()),
            ("congestion_surcharge", types.DoubleType()),
        )
    ]
)

In [46]:
def cast_columns(df, schema):
    """Cast the columns of ``df`` to the types declared in ``schema``.

    A column is cast only when ``df`` has a column whose name matches a schema
    field exactly (the comparison is case-sensitive, so a differently
    capitalised column is left as it is). Columns of ``df`` that the schema
    does not mention are passed through untouched, schema fields that ``df``
    lacks are ignored, and the column order of ``df`` is preserved.

    This covers the INT32 -> BIGINT (LongType) widening as well as every other
    conversion: casting to the field's declared type is the same operation in
    all cases, so no per-type special case is needed.

    All casts are issued in a single ``select`` rather than one ``withColumn``
    call per field, because each ``withColumn`` adds another projection to the
    query plan.

    Args:
        df: Spark DataFrame whose columns should be converted.
        schema: ``StructType`` whose fields give the target type per column name.

    Returns:
        A DataFrame with the same column names and order as ``df``; ``df``
        itself when none of its columns appear in ``schema``.
    """
    target_types = {field.name: field.dataType for field in schema.fields}

    # Nothing to convert (also covers a DataFrame without columns).
    if not any(name in target_types for name in df.columns):
        return df

    def _column(name):
        # Backtick-quote the name so that a column containing dots is read as
        # one top-level column and not as struct-field access.
        return F.col("`" + name.replace("`", "``") + "`")

    projection = [
        _column(name).cast(target_types[name]).alias(name)
        if name in target_types
        else _column(name)
        for name in df.columns
    ]

    return df.select(*projection)

In [47]:
# Read every monthly raw file, cast its columns to the taxi type's target
# schema, and write the result under data/pq/ with the same
# <taxi_type>/<year>/<month> layout.
years = [2023, 2024]
taxi_types = ["yellow", "green"]

for year in years:

    # 2023 is processed for all twelve months; any other year stops after November.
    # NOTE(review): presumably December 2024 was not available yet -- confirm and extend.
    months = range(1, 13) if year == 2023 else range(1, 12)

    for taxi_type in taxi_types:

        schema = yellow_schema if taxi_type == 'yellow' else green_schema

        for month in months:

            print(f"Processing Data for {taxi_type}/{year}/{month}...")

            input_folderpath = f"data/raw/{taxi_type}/{year}/{month:02d}/"
            output_folderpath = f"data/pq/{taxi_type}/{year}/{month:02d}/"

            # Parquet files carry their own schema, so the CSV reader options
            # "header" and "inferSchema" do not apply here and are not set.
            df = spark.read.parquet(input_folderpath)

            # Bring the columns to one common set of types for this taxi type.
            df = cast_columns(df, schema)

            # Redistribute the rows into 4 partitions before writing;
            # "overwrite" replaces any earlier output for this month, which
            # makes the cell safe to re-run.
            df.repartition(4) \
                .write \
                .mode("overwrite") \
                .parquet(output_folderpath)

Processing Data for yellow/2023/1...


                                                                                

Processing Data for yellow/2023/2...


                                                                                

Processing Data for yellow/2023/3...


                                                                                

Processing Data for yellow/2023/4...


                                                                                

Processing Data for yellow/2023/5...


                                                                                

Processing Data for yellow/2023/6...


                                                                                

Processing Data for yellow/2023/7...


                                                                                

Processing Data for yellow/2023/8...


                                                                                

Processing Data for yellow/2023/9...


                                                                                

Processing Data for yellow/2023/10...


                                                                                

Processing Data for yellow/2023/11...


                                                                                

Processing Data for yellow/2023/12...


                                                                                

Processing Data for green/2023/1...
Processing Data for green/2023/2...
Processing Data for green/2023/3...
Processing Data for green/2023/4...
Processing Data for green/2023/5...
Processing Data for green/2023/6...
Processing Data for green/2023/7...
Processing Data for green/2023/8...
Processing Data for green/2023/9...
Processing Data for green/2023/10...
Processing Data for green/2023/11...
Processing Data for green/2023/12...
Processing Data for yellow/2024/1...


                                                                                

Processing Data for yellow/2024/2...


                                                                                

Processing Data for yellow/2024/3...


                                                                                

Processing Data for yellow/2024/4...


                                                                                

Processing Data for yellow/2024/5...


                                                                                

Processing Data for yellow/2024/6...


                                                                                

Processing Data for yellow/2024/7...


                                                                                

Processing Data for yellow/2024/8...


                                                                                

Processing Data for yellow/2024/9...


                                                                                

Processing Data for yellow/2024/10...


                                                                                

Processing Data for yellow/2024/11...


                                                                                

Processing Data for green/2024/1...
Processing Data for green/2024/2...
Processing Data for green/2024/3...
Processing Data for green/2024/4...
Processing Data for green/2024/5...
Processing Data for green/2024/6...
Processing Data for green/2024/7...
Processing Data for green/2024/8...
Processing Data for green/2024/9...
Processing Data for green/2024/10...
Processing Data for green/2024/11...
