In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [0]:
# Load AWS credentials for the S3A filesystem.
# The keys are taken from the standard AWS environment variables so that
# secrets are never typed into (or committed with) the notebook. When a
# variable is unset the empty string is used, which matches the previous
# hard-coded value.
import os

spark.conf.set("fs.s3a.access.key", os.environ.get("AWS_ACCESS_KEY_ID", ""))
spark.conf.set("fs.s3a.secret.key", os.environ.get("AWS_SECRET_ACCESS_KEY", ""))

In [0]:
# Read the joined taxi DataFrame from the conformed zone.
# - The s3a:// scheme matches the fs.s3a.* credentials configured for this
#   session and the scheme used for the output path.
# - recursiveFileLookup makes Spark pick up Parquet files in nested folders
#   (per the Spark docs this also disables partition-column inference).
# - multiLine / encoding were removed: they are JSON/CSV reader options and
#   have no effect on a Parquet read.
taxi_df = (
    spark.read
    .option("recursiveFileLookup", "true")
    .parquet("s3a://capstone-techcatalyst-conformed/group4/taxi_data/*")
)

In [0]:
# Per-column null tally: each isNull() flag is cast to an integer and summed,
# producing a single row with one count per column.
# NOTE: `sum` here is pyspark.sql.functions.sum (star import), not the builtin.
null_counts = [
    sum(col(name).isNull().cast("integer")).alias(name)
    for name in taxi_df.columns
]
display(taxi_df.select(null_counts))

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,ehail_fee,trip_type,Taxi_Type,date,year,month,day_of_month,day_of_week,is_weekend,trip_duration
0,0,0,0,0,2204270,2204270,0,0,31234,0,0,0,0,0,0,0,0,544905,30423026,29909403,0,0,0,0,0,0,0,0


In [0]:
# Replace nulls in Airport_fee with 0; every other column is left untouched.
airport_fee_default = {'Airport_fee': 0}
taxi_df = taxi_df.na.fill(airport_fee_default)

In [0]:
# Remove the ehail_fee and trip_type columns from the DataFrame.
columns_to_drop = ('ehail_fee', 'trip_type')
taxi_df = taxi_df.drop(*columns_to_drop)

In [0]:
# Replace nulls in store_and_fwd_flag with 'N'.
store_and_fwd_default = {'store_and_fwd_flag': 'N'}
taxi_df = taxi_df.na.fill(store_and_fwd_default)

In [0]:
# Re-run the per-column null count to confirm the fills and drops took effect.
# NOTE: `sum` here is pyspark.sql.functions.sum (star import), not the builtin.
remaining_nulls = taxi_df.select(
    *(
        sum(col(field).isNull().cast("integer")).alias(field)
        for field in taxi_df.columns
    )
)
display(remaining_nulls)

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,Taxi_Type,date,year,month,day_of_month,day_of_week,is_weekend,trip_duration
0,0,0,0,0,2204270,0,0,0,31234,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
from pyspark.sql.window import Window

In [0]:
# For every trip, compute its share (in percent) of the totals for its pickup
# day: trip_distance_percentage is the trip's distance over the day's summed
# distance, trip_duration_percentage the trip's duration over the day's summed
# duration. The "day" is the calendar date of pickup_datetime.
#
# The column is referenced with its exact schema casing (pickup_datetime) so
# the lookup does not rely on spark.sql.caseSensitive being false, and the
# window spec is built once because both columns use the same partitioning.
# NOTE: `sum` here is pyspark.sql.functions.sum (star import), not the builtin.
daily_window = Window.partitionBy(to_date("pickup_datetime"))

taxi_df = taxi_df.withColumn(
    'trip_distance_percentage',
    col('trip_distance') / sum('trip_distance').over(daily_window) * 100,
)
taxi_df = taxi_df.withColumn(
    'trip_duration_percentage',
    col('trip_duration') / sum('trip_duration').over(daily_window) * 100,
)

In [0]:
# Persist the taxi DataFrame as Parquet, partitioned by year, month and taxi type.
# The partition keys are trailing-underscore copies of year / month / Taxi_Type;
# presumably this keeps the original columns inside the data files, since Spark
# moves partitionBy columns into the directory names.
taxi_parquet_path = "s3a://capstone-techcatalyst-transformed/group4/taxi_data/"

partition_sources = (
    ("year_", "year"),
    ("month_", "month"),
    ("taxi_type_", "Taxi_Type"),
)
for partition_name, source_name in partition_sources:
    taxi_df = taxi_df.withColumn(partition_name, col(source_name))

taxi_df.write.partitionBy("year_", "month_", "taxi_type_").parquet(taxi_parquet_path)