## Merging the Green Taxi dataset with the Yellow Taxi dataset

Preparation of Green Taxi dataset before merge: 

In [0]:
# Read every cleaned Green Taxi parquet part and display the row count.
green_cleaned_glob = "/CleanedData/green/cleaned_*"
clean_green = spark.read.parquet(green_cleaned_glob)
clean_green.count()

Out[2]: 62758794

In [0]:
# Show the column names and types of the cleaned Green Taxi DataFrame.
clean_green.printSchema()


root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- filename: string (nullable = true)
 |-- trip_speed: double (nullable = true)
 |-- trip_durati

No information about ehail_fee has been given in the data dictionary. It is therefore being dropped. 

In [0]:
# Collect the distinct ehail_fee values on the driver to inspect what the column holds.
ehail_fee_column = clean_green.select("ehail_fee")
unique_ehail_fee_values = ehail_fee_column.distinct().collect()

# One value per line
for row in unique_ehail_fee_values:
    print(row["ehail_fee"])


None
0.0
1.95


The schemas of the Green Taxi and Yellow Taxi datasets must not only be the same; the columns of both datasets must also appear in exactly the same order for the merge to be successful. 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit  # `col` is needed by the payment_type cast below
from pyspark.sql.types import DoubleType


output_folder = "/CombinedCleanedData/"

output_filename = output_folder + "cleaned_green_transformed.parquet"


# Drop the 'ehail_fee' column from the DataFrame
clean_green = clean_green.drop("ehail_fee")
# Add the color column so that Green Taxi rows can be processed separately in later queries.
clean_green = clean_green.withColumn("color", lit("green"))
# The Green schema has no airport_fee; add it as a null double to match the Yellow Taxi schema.
clean_green = clean_green.withColumn("airport_fee", lit(None).cast(DoubleType()))
# payment_type is double in the Green schema but long in the Yellow schema; align the types.
clean_green = clean_green.withColumn("payment_type", col("payment_type").cast("long"))

# Persist the transformed Green Taxi data as an intermediate parquet dataset.
clean_green.write.parquet(output_filename, mode="overwrite")

Transforming both datasets so that their schemas match. Both DataFrames are saved in the CombinedCleanedData folder, as shown below. 

Comparing the schema with that of Yellow Taxi dataset. 

In [0]:
# Read every cleaned Yellow Taxi parquet part and display the row count.
yellow_cleaned_glob = "/CleanedData/yellow/cleaned_*"
clean_yellow = spark.read.parquet(yellow_cleaned_glob)
clean_yellow.count()

Out[6]: 646503433

In [0]:
# Show the column names and types of the cleaned Yellow Taxi DataFrame.
clean_yellow.printSchema()


root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- filename: string (nullable = true)
 |-- trip_speed: double (nullable = true)
 |-- trip_duration: long (nullable = true)



In [0]:
from pyspark.sql.functions import lit

# Destination of the transformed Yellow Taxi data.
output_folder = "/CombinedCleanedData/"
output_filename = output_folder + "cleaned_yellow_transformed.parquet"

# Tag every row as "yellow" and add a null double trip_type column.
clean_yellow = (
    clean_yellow
    .withColumn("color", lit("yellow"))
    .withColumn("trip_type", lit(None).cast(DoubleType()))
)

# Save the transformed Yellow Taxi dataset.
clean_yellow.write.parquet(output_filename, mode="overwrite")

In [0]:
# Show the Green Taxi schema again after the transformations.
clean_green.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- filename: string (nullable = true)
 |-- trip_speed: double (nullable = true)
 |-- trip_duration: long (nullable = true)
 |-- color: 

In [0]:
# Reload the transformed Yellow Taxi data from parquet and display the row count.
yellow_transformed_path = "/CombinedCleanedData/cleaned_yellow_transformed.parquet"
clean_yellow = spark.read.parquet(yellow_transformed_path)
clean_yellow.count()

Out[5]: 646503433

In [0]:
from pyspark.sql.functions import col

# Destination of the reordered Green Taxi data.
output_folder = "/CombinedCleanedData/"
output_filename = output_folder + "cleaned_green_transformed.parquet"

# Target layout: the Yellow Taxi column order, followed by color and trip_type.
desired_column_order = [
    "VendorID",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
    "filename",
    "trip_speed",
    "trip_duration",
    "color",
    "trip_type",
]

# Project the Green Taxi columns into that order and persist the result.
clean_green = clean_green.select(desired_column_order)
clean_green.write.parquet(output_filename, mode="overwrite")


In [0]:
# Cast payment_type to long and rewrite the transformed Green Taxi parquet dataset.
payment_type_as_long = col("payment_type").cast("long")
clean_green = clean_green.withColumn("payment_type", payment_type_as_long)
clean_green.write.parquet(output_filename, mode="overwrite")


In [0]:
# Reload the transformed Green Taxi data from parquet and display the row count.
green_transformed_path = "/CombinedCleanedData/cleaned_green_transformed.parquet"
clean_green = spark.read.parquet(green_transformed_path)
clean_green.count()

Out[2]: 62758794

## Combining both transformed DataFrames

In [0]:
# Reload both transformed datasets from their intermediate parquet locations.
green_transformed_path = "/CombinedCleanedData/cleaned_green_transformed.parquet"
yellow_transformed_path = "/CombinedCleanedData/cleaned_yellow_transformed.parquet"

clean_green = spark.read.parquet(green_transformed_path)
clean_yellow = spark.read.parquet(yellow_transformed_path)


A few columns were added during the data cleaning phase to filter out a few erroneous entries. Those columns are removed here to make the merge process smoother. 

In [0]:
# List of columns to remove
columns_to_remove = ["filename", "trip_speed", "trip_duration", "trip_type"]

# Select only the columns you want to keep
clean_yellow = clean_yellow.select([col for col in clean_yellow.columns if col not in columns_to_remove])
output_folder = "/CombinedCleanedData/"

output_filename = output_folder + "cleaned_yellow_newcols.parquet"

clean_yellow.write.parquet(output_filename, mode="overwrite")

Unfortunately, it took too long to apply both transformations (removal of columns) and the merge at the same time. The transformed Green and Yellow datasets are therefore saved as intermediary parquet files to facilitate the merge. 

In [0]:
# Reload the transformed Green Taxi data from its intermediate parquet location.
clean_green = spark.read.parquet("/CombinedCleanedData/cleaned_green_transformed.parquet")

In [0]:
# List of columns to remove
columns_to_remove = ["filename", "trip_speed", "trip_duration", "trip_type"]

# Select only the columns you want to keep
clean_green = clean_green.select([col for col in clean_green.columns if col not in columns_to_remove])
output_folder = "/CombinedCleanedData/"

output_filename = output_folder + "cleaned_green_newcols.parquet"

clean_green.write.parquet(output_filename, mode="overwrite")

In [0]:

# Report whether the two DataFrames now share an identical schema.
schemas_match = clean_yellow.schema == clean_green.schema
print("The schemas are equal." if schemas_match else "The schemas are not equal.")


The schemas are equal.


In [0]:
# Reload both trimmed datasets from their intermediate parquet locations.
green_newcols_path = "/CombinedCleanedData/cleaned_green_newcols.parquet"
yellow_newcols_path = "/CombinedCleanedData/cleaned_yellow_newcols.parquet"

clean_green = spark.read.parquet(green_newcols_path)
clean_yellow = spark.read.parquet(yellow_newcols_path)


In [0]:
# Mark both DataFrames for caching ahead of the union below.
# The value of the last call (clean_yellow.cache()) is what the cell displays.
clean_green.cache()
clean_yellow.cache()


Out[5]: DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double, filename: string, trip_speed: double, trip_duration: bigint, color: string]

In [0]:
# Stack the Yellow and Green rows into one DataFrame.
# unionByName matches columns by name rather than by position, so a difference in
# column order between the two inputs cannot silently put values in the wrong column.
combined_df = clean_yellow.unionByName(clean_green)
# Display the combined row count.
combined_df.count()


Out[6]: 709262227

In [0]:
# Overwrite rather than append: appending would add a second copy of every row each
# time this cell is re-run. This also matches the other writes in this notebook.
combined_df.write.mode('overwrite').parquet('/CombinedCleanedData/cleaned_combined.parquet')

Following the above steps, we were able to successfully merge both the Green and the Yellow taxi datasets. 