## Concat All Months Into A Single Dataframe

Note: This notebook uses Azure Synapse Analytics with PySpark

This notebook reads the monthly parquet files for 2021 through 2024 and concatenates them into a single parquet dataset, partitioned by pickup year-month.
You only have to do this once.

In [None]:
# Storage locations -- replace the placeholders with your own paths.
base_input_path = "<ADD YOUR PATH HERE>/bronze/"
base_output_path = "<ADD YOUR PATH HERE>/bronze/"
output_file_path = f"{base_output_path}union"

# One monthly yellow-taxi parquet path per (year, month) pair for 2021 through
# 2024, in chronological order. The month is zero-padded to two digits.
input_file_paths = [
    f"{base_input_path}yellow_tripdata_{year}-{month:02d}.parquet"
    for year in range(2021, 2025)
    for month in range(1, 13)
]

In [None]:
%%pyspark
i = 0
for k in range(0,len(input_file_paths)):
    # Load the data
    full_input_path = input_file_paths[k]
    df = spark.read.load(full_input_path, format='parquet')
    
    # Change Airport_fee to airport_fee if present
    if "Airport_fee" in df.columns:
        df = df.withColumnRenamed("Airport_fee", "airport_fee")

    # Add the Year-Month column to enable better partitioning of data
    # The Year-Month from the Pickup datetime will be used.
    # The output column will be named "pu_year_month" where pu stands for pickup
    df = df.withColumn("pu_year_month", F.concat(F.date_format("tpep_pickup_datetime", "y").cast("string"),F.lit("-"),F.date_format("tpep_pickup_datetime", "M").cast("string")))

    # Union Data
    if i == 0:
        i+=1
        union_df = df.limit(0)
    union_df = union_df.unionByName(df)
    
# Clean up
del df

In [None]:
# Persist the combined data so the raw monthly files never need re-processing.
# The output is partitioned by the "pu_year_month" column, and anything already
# at the target path is overwritten.
(
    union_df.write
    .mode("overwrite")
    .partitionBy("pu_year_month")
    .parquet(output_file_path)
)