In [0]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [0]:
# Load AWS credentials from the environment rather than hardcoding them in the
# notebook, so secrets never end up in a saved or committed copy.
# Expected variables: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Only configure S3A when both values are present. Otherwise leave the settings
# alone so S3 access relies on whatever credentials the cluster already
# provides (for example an instance profile) -- NOTE(review): confirm for this cluster.
if aws_access_key and aws_secret_key:
    spark.conf.set("fs.s3a.access.key", aws_access_key)
    spark.conf.set("fs.s3a.secret.key", aws_secret_key)

In [0]:
# Read every HVFHV parquet file under the raw bucket (recursing into nested
# folders) and drop rows that are exact duplicates across all columns.
# The path uses the s3a:// scheme so it matches the fs.s3a.* credential
# settings and the s3a:// output path used elsewhere in this notebook.
# No multiLine/encoding options are passed: those belong to the JSON/CSV
# readers and have no meaning for parquet.
HVFHV_raw_path = "s3a://capstone-techcatalyst-raw/hvfhv/*"

HVFHV_df = (
    spark.read
    .option("recursiveFileLookup", "true")
    .parquet(HVFHV_raw_path)
    .drop_duplicates()
)

In [0]:
# Row count of the de-duplicated DataFrame. count() is a Spark action, so this
# cell actually executes the read and the drop_duplicates defined above.
HVFHV_df.count()

Out[5]: 100741442

In [0]:
# Add a "date" column holding pickup_datetime cast to a calendar date
HVFHV_df = HVFHV_df.withColumn(
    "date",
    col("pickup_datetime").cast("date"),
)

In [0]:
# Derive calendar features from "date" and the trip length in minutes, then
# keep only rows whose date falls in January 2024 - May 2024 (bounds inclusive).
HVFHV_df = (
    HVFHV_df
    # date_format() yields strings: 4-digit year and zero-padded month
    .withColumn("year", date_format("date", "yyyy"))
    .withColumn("month", date_format("date", "MM"))
    .withColumn("day_of_month", dayofmonth("date"))
    # "EEEE" renders the full day name
    .withColumn("day_of_week", date_format("date", "EEEE"))
    # dayofweek() numbers Sunday as 1 and Saturday as 7
    .withColumn("is_weekend", dayofweek("date").isin([1, 7]).cast("boolean"))
    # unix_timestamp() is in seconds, so divide the difference by 60 for minutes
    .withColumn(
        "trip_duration",
        (unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")) / 60,
    )
    .filter(col("date").between("2024-01-01", "2024-05-31"))
)

In [0]:
# Null count per column: turn each column into a 1/0 "is null" indicator and
# total it. `sum` here is pyspark.sql.functions.sum (brought in by the star
# import), not the Python builtin.
null_counts = [
    sum(col(name).isNull().cast("integer")).alias(name)
    for name in HVFHV_df.columns
]
display(HVFHV_df.select(null_counts))

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag,date,year,month,day_of_month,day_of_week,is_weekend,trip_duration
0,0,25957560,0,25957172,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Write the HVFHV DataFrame to parquet files partitioned by year, month, and day.
# partitionBy() moves its columns out of the data files and into the directory
# names, so the partition keys are separate "_"-suffixed copies -- presumably so
# that year, month and day_of_month also remain as regular columns in the files.
HVFHV_df = (
    HVFHV_df
    .withColumn("year_", col("year"))
    .withColumn("month_", col("month"))
    .withColumn("day_", col("day_of_month"))
)

HVFHV_parquet_path = "s3a://capstone-techcatalyst-conformed/group4/hvfhv_data/"

# Use overwrite mode so re-running the notebook replaces the previous output.
# The default mode raises an error as soon as the target path already exists.
(
    HVFHV_df.write
    .mode("overwrite")
    .partitionBy("year_", "month_", "day_")
    .parquet(HVFHV_parquet_path)
)