## Preprocessing the test data 

In [2]:
# Importing the libraries

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import date_format



In [3]:
# Create a spark session (which will run spark jobs).
# NOTE: the session time zone key must be spelled `spark.sql.session.timeZone`;
# a misspelled key is just stored as an unknown setting, leaving timestamp
# formatting (e.g. date_format on pickup times) in the JVM's local time zone.
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/21 18:02:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/21 18:02:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/21 18:02:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/21 18:02:40 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


### Yellow taxi dataset

In [None]:
# Define the test directory and the months/year of the raw yellow-taxi files
main_dir = '../data/raw/raw_test/yellow/'
mth = range(1,5)
yr = '2022'

# Seed an empty dataframe with the schema of the first monthly file.
# The path is built from main_dir/yr/mth so it cannot drift from the files
# read in the loop below.
emptyRDD = spark.sparkContext.emptyRDD()
first_month = str(mth[0]).zfill(2)
sch = spark.read.parquet(f'{main_dir}{yr}-{first_month}.parquet')
sdf_yellow_test = spark.createDataFrame(emptyRDD, sch.schema)

# Merge the monthly test files into one single dataframe
for month in mth:

    month = str(month).zfill(2)
    sdf = spark.read.parquet(f'{main_dir}{yr}-{month}.parquet')

    # The airport_fee column has different data types in different files,
    # so it is cast to a common type (DOUBLE) before the union.
    sdf_updated = sdf.withColumn(
        'airport_fee',
        F.col('airport_fee').cast('DOUBLE')
    )

    # unionByName matches columns by name rather than by position
    sdf_yellow_test = sdf_yellow_test.unionByName(sdf_updated)

In [None]:
# Count missing entries (NULL or NaN) per column.
# The first three columns are left out of the check
# (presumably the vendor id and the two timestamps -- TODO confirm).
yellow_missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in sdf_yellow_test.columns[3:]
]

# Bare expression so the notebook renders the resulting one-row dataframe
sdf_yellow_test.select(yellow_missing_counts)

In [None]:
# Missing airport fees / congestion surcharges are replaced by 0,
# after which any row that still contains a NULL is removed.
zero_fill_columns = ['airport_fee', 'congestion_surcharge']
sdf_yellow_test = (
    sdf_yellow_test
    .fillna(0.0, subset=zero_fill_columns)
    .dropna()
)

# Re-run the missing-value tally (same columns as before: everything after
# the first three) to confirm the clean-up.
remaining_missing = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in sdf_yellow_test.columns[3:]
]
sdf_yellow_test.select(remaining_missing).show()

In [None]:
# Cast both the pickup (PU) and dropoff (DO) location ids to integers
for location_column in ('PULocationID', 'DOLocationID'):
    as_integer = F.col(location_column).cast('INT')
    sdf_yellow_test = sdf_yellow_test.withColumn(location_column, as_integer)

In [None]:
# Flag the valid trips (the actual filtering on this flag happens later).
# A trip counts as valid when every one of the checks below holds.
has_positive_total = F.col('total_amount') > 0
pickup_zone_ok = F.col('PULocationID').between(1, 263)
dropoff_zone_ok = F.col('DOLocationID').between(1, 263)
passenger_count_ok = F.col('passenger_count').between(1, 4)

# Trip duration in seconds: dropoff minus pickup, both as epoch seconds
trip_seconds = (
    F.col('tpep_dropoff_datetime').cast("long")
    - F.col('tpep_pickup_datetime').cast("long")
)
has_positive_duration = trip_seconds > 0

yellow_trip_is_valid = (
    has_positive_total
    & pickup_zone_ok
    & dropoff_zone_ok
    & passenger_count_ok
    & has_positive_duration
)

# when/otherwise turns an undetermined (NULL) check into False
sdf_yellow_test = sdf_yellow_test.withColumn(
    'is_valid_record',
    F.when(yellow_trip_is_valid, True).otherwise(False)
)

In [None]:
# Keep only valid trips that were paid by credit card (Payment_type = 1)

# The temp view is still registered so it stays queryable via spark.sql
sdf_yellow_test.createOrReplaceTempView('yellow_test')

# Same predicate as a SQL WHERE clause, applied through the DataFrame API
sdf_yellow_test = sdf_yellow_test.filter(
    "Payment_type = 1 AND is_valid_record IS TRUE"
)

In [None]:
# Translate the numeric RateCodeID into a readable rate_code label
rate_code_labels = [
    (1, 'Standard'),
    (2, 'JFK'),
    (3, 'Newark'),
    (4, 'Nasau or Westchester'),
    (5, 'Negotiated fare'),
    (6, 'Shared ride'),
]

# Build the WHEN ... WHEN ... chain one id at a time
(first_id, first_label), *other_labels = rate_code_labels
rate_code_expr = F.when(F.col('RateCodeID') == first_id, first_label)
for code_id, label in other_labels:
    rate_code_expr = rate_code_expr.when(F.col('RateCodeID') == code_id, label)

# NOTE(review): any id outside 1-6 (and NULL) falls back to 'Standard' --
# confirm that unknown rate codes are really meant to count as standard rides.
sdf_yellow_test = sdf_yellow_test.withColumn(
    'rate_code',
    rate_code_expr.otherwise('Standard')
)

In [None]:
# Tag every row with the taxi type

# The temp view is still registered so it stays queryable via spark.sql
sdf_yellow_test.createOrReplaceTempView('temp_yellow')

# All existing columns, followed by a constant vehicle_type column
sdf_yellow_test = sdf_yellow_test.select(
    '*',
    F.lit('Yellow taxi').alias('vehicle_type')
)

In [None]:
# Combine vehicle type and rate code into a single "Yellow-<rate code>" label.
# Rows matching none of the listed rate codes get NULL (there is no fallback).
yellow_rate_codes = [
    'Standard',
    'Shared ride',
    'JFK',
    'Negotiated fare',
    'Newark',
    'Nasau or Westchester',
]

is_yellow_taxi = F.col('vehicle_type') == 'Yellow taxi'

yellow_ride_type_expr = None
for rate in yellow_rate_codes:
    matches_rate = is_yellow_taxi & (F.col('rate_code') == rate)
    ride_label = f'Yellow-{rate}'
    if yellow_ride_type_expr is None:
        yellow_ride_type_expr = F.when(matches_rate, ride_label)
    else:
        yellow_ride_type_expr = yellow_ride_type_expr.when(
            matches_rate, ride_label
        )

sdf_yellow_test = sdf_yellow_test.withColumn(
    'vehicle_and_ride_type',
    yellow_ride_type_expr
)

### FHVHV Dataset

In [None]:
# Define the FHVHV test data directory and the months/year of the raw files
main_dir = '../data/raw/raw_test/FHVHV/'
mth = range(1,5)
yr = '2022'

# Seed an empty dataframe with the schema of the first monthly file.
# The path is built from main_dir/yr/mth so it cannot drift from the files
# read in the loop below.
emptyRDD = spark.sparkContext.emptyRDD()
first_month = str(mth[0]).zfill(2)
sch = spark.read.parquet(f'{main_dir}{yr}-{first_month}.parquet')
sdf_FHVHV_test = spark.createDataFrame(emptyRDD, sch.schema)

# Make a single dataframe for the FHVHV testing data
for month in mth:

    month = str(month).zfill(2)
    sdf = spark.read.parquet(f'{main_dir}{yr}-{month}.parquet')

    # The airport_fee column has different data types in different files,
    # so it is cast to a common type (DOUBLE) before the union.
    sdf_updated_FHVHV = sdf.withColumn(
        'airport_fee',
        F.col('airport_fee').cast('DOUBLE')
    )

    # unionByName matches columns by name rather than by position
    sdf_FHVHV_test = sdf_FHVHV_test.unionByName(sdf_updated_FHVHV)


In [16]:
# Filling the NULLs in the monetary columns with 0.
# (Each column is listed once; the list matches the columns that are later
# cast to DOUBLE.)
sdf_FHVHV_test= sdf_FHVHV_test.fillna(value = 0.0, 
subset=['base_passenger_fare', 'tolls', 'bcf', 
'sales_tax', 'congestion_surcharge', 'airport_fee', 'tips', 'driver_pay'])

In [None]:
# Keep only the Uber rides (licence number HV0003)

# The temp view is still registered so it stays queryable via spark.sql
sdf_FHVHV_test.createOrReplaceTempView('FHVHV_view')

# Same predicate as a SQL WHERE clause, applied through the DataFrame API
sdf_FHVHV_test = sdf_FHVHV_test.filter("hvfhs_license_num = 'HV0003'")

In [None]:
# Count missing entries (NULL or NaN) in the licence and location id columns
cols = ['hvfhs_license_num', 'PULocationID', 'DOLocationID']

fhvhv_missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in cols
]
sdf_FHVHV_test.select(fhvhv_missing_counts).show()


In [None]:
# Cast both the pickup (PU) and dropoff (DO) location ids to integers
for location_column in ('PULocationID', 'DOLocationID'):
    as_integer = F.col(location_column).cast('INT')
    sdf_FHVHV_test = sdf_FHVHV_test.withColumn(location_column, as_integer)

In [21]:
# Make sure every monetary column is stored as DOUBLE
cols = [
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips',
    'driver_pay',
]

for money_column in cols:
    as_double = F.col(money_column).cast('DOUBLE')
    sdf_FHVHV_test = sdf_FHVHV_test.withColumn(money_column, as_double)

In [None]:
# Total fare = sum of the fare components below (driver_pay is not part of
# the sum), rounded to 2 decimal places.
fare_components = [
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips',
]

# Accumulate left to right so the additions happen in the listed order
fare_total = F.col(fare_components[0])
for component in fare_components[1:]:
    fare_total = fare_total + F.col(component)

sdf_FHVHV_test = sdf_FHVHV_test.withColumn(
    'total_amount',
    F.round(fare_total, 2)
)

In [None]:
# Discarding the invalid records.
# A record is valid when the total amount is positive, both location ids lie
# in 1-263 and the dropoff happens strictly after the pickup.
sdf_FHVHV_test = sdf_FHVHV_test.withColumn(
    'is_valid_record',
   
    F.when(
        ((F.col('total_amount') > 0) & (F.col('PULocationID').between(1,263)) 
        & (F.col('DOLocationID').between(1,263)) & 
        (((F.col('dropoff_datetime').cast("long")) - 
        (F.col('pickup_datetime').cast("long"))) > 0)),
        True
    ).otherwise(False)
)

# Actually drop the invalid rows: no later cell filters the FHVHV data on this
# flag, so without this step invalid trips would reach the merged output
# (the yellow-taxi pipeline filters on its flag in its payment-type cell).
sdf_FHVHV_test = sdf_FHVHV_test.filter(F.col('is_valid_record'))

In [None]:
# Add the vehicle type: licence HV0003 is labelled 'Uber',
# any other licence number gets NULL (there is no fallback branch).
is_uber_licence = F.col('hvfhs_license_num') == 'HV0003'

sdf_FHVHV_test = sdf_FHVHV_test.withColumn(
    'vehicle_type',
    F.when(is_uber_licence, 'Uber')
)

In [None]:
# Label the type of ride:
#   * no airport fee + shared ride requested      -> 'Shared ride'
#   * no airport fee + shared ride not requested  -> 'Standard'
#   * airport fee charged                         -> 'LaGuardia/Newark/JFK'
# In the first two cases the match flag must be either 'Y' or 'N'; rows that
# fit none of the branches are left as NULL.
no_airport_fee = F.col('airport_fee') == 0.0
shared_requested = F.col('shared_request_flag') == 'Y'
shared_not_requested = F.col('shared_request_flag') == 'N'
match_flag_known = F.col('shared_match_flag').isin('Y', 'N')

uber_rate_code_expr = (
    F.when(no_airport_fee & (shared_requested & match_flag_known),
           'Shared ride')
    .when(no_airport_fee & (shared_not_requested & match_flag_known),
          'Standard')
    .when(F.col('airport_fee') > 0.0, 'LaGuardia/Newark/JFK')
)

sdf_FHVHV_test = sdf_FHVHV_test.withColumn('rate_code', uber_rate_code_expr)

In [None]:
# Combine vehicle type and rate code into a single "Uber-<rate code>" label.
# Rows matching none of the listed rate codes get NULL (there is no fallback).
uber_rate_codes = ['Standard', 'Shared ride', 'LaGuardia/Newark/JFK']

is_uber = F.col('vehicle_type') == 'Uber'

uber_ride_type_expr = None
for rate in uber_rate_codes:
    matches_rate = is_uber & (F.col('rate_code') == rate)
    ride_label = f'Uber-{rate}'
    if uber_ride_type_expr is None:
        uber_ride_type_expr = when(matches_rate, ride_label)
    else:
        uber_ride_type_expr = uber_ride_type_expr.when(matches_rate, ride_label)

sdf_FHVHV_test = sdf_FHVHV_test.withColumn(
    'vehicle_and_ride_type',
    uber_ride_type_expr
)


In [None]:
# Remove the columns that are no longer needed
fhvhv_unwanted_columns = [
    'Hvfhs_license_num',
    'Dispatching_base_num',
    'originating_base_num',
    'request_datetime',
    'on_scene_datetime',
    'shared_request_flag',
    'shared_match_flag',
    'access_a_ride_flag',
    'wav_request_flag',
    'wav_match_flag',
    'driver_pay',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
]

sdf_FHVHV_test = sdf_FHVHV_test.drop(*fhvhv_unwanted_columns)

### Merging the datasets

In [None]:
# Project the yellow-taxi data onto the common column layout shared with the
# FHVHV data (same names, same order).

# The temp view is still registered so it stays queryable via spark.sql
sdf_yellow_test.createOrReplaceTempView("final_yellow")

yellow_projection = [
    "tpep_pickup_datetime AS pickup_time",
    "tpep_dropoff_datetime AS dropoff_time",
    "Trip_distance AS trip_distance",
    "PULocationID",
    "DOLocationID",
    "Fare_amount AS base_fare",
    "Tip_amount AS tips",
    "Total_amount AS total_amount",
    "vehicle_type",
    "rate_code",
    "vehicle_and_ride_type",
]

final_sdf_yellow_test = sdf_yellow_test.selectExpr(*yellow_projection)

In [None]:
# Project the FHVHV data onto the common column layout shared with the
# yellow-taxi data (same names, same order).

# The temp view is still registered so it stays queryable via spark.sql
sdf_FHVHV_test.createOrReplaceTempView("final_FHVHV")

fhvhv_projection = [
    "Pickup_datetime AS pickup_time",
    "DropOff_datetime AS dropoff_time",
    "trip_miles AS trip_distance",
    "PULocationID",
    "DOLocationID",
    "base_passenger_fare AS base_fare",
    "tips",
    "total_amount",
    "vehicle_type",
    "rate_code",
    "vehicle_and_ride_type",
]

final_sdf_FHVHV_test = sdf_FHVHV_test.selectExpr(*fhvhv_projection)

In [None]:
# Merge the two datasets.
# unionByName (as used when stacking the monthly files) matches columns by
# name, so the merge cannot silently mix up columns if the order of either
# projection ever changes; a plain union() matches by position only.
merged_data = final_sdf_yellow_test.unionByName(final_sdf_FHVHV_test)

In [None]:
# Derive calendar features from the pickup timestamp.
# Each entry is (new column name, date_format pattern); the columns are added
# in the listed order.
pickup_time_parts = [
    ("Year", 'yyyy'),
    ("Month", 'MMMM'),
    ("Date", 'dd'),
    ("Day", 'EEEE'),
    ("pickup_hour", 'HH'),
]

for part_name, part_pattern in pickup_time_parts:
    merged_data = merged_data.withColumn(
        part_name,
        date_format('pickup_time', part_pattern)
    )

# The raw timestamps are not needed once the parts are extracted
merged_data = merged_data.drop('pickup_time', 'dropoff_time')

In [None]:
# Put the columns of the merged dataset into their final order:
# calendar features first, then the trip attributes.

# The temp view is still registered so it stays queryable via spark.sql
merged_data.createOrReplaceTempView("temp")

merged_column_order = [
    'Year', 'Month', 'Date', 'Day', 'pickup_hour',
    'trip_distance', 'PULocationID', 'DOLocationID',
    'base_fare', 'tips', 'total_amount',
    'vehicle_type', 'rate_code', 'vehicle_and_ride_type',
]

merged_data = merged_data.select(*merged_column_order)

In [37]:
# Keep only the standard rides of either vehicle type
standard_ride_types = ['Yellow-Standard', 'Uber-Standard']

final_merged = merged_data.where(
    F.col('vehicle_and_ride_type').isin(standard_ride_types)
)

In [None]:
# vehicle_type and rate_code are no longer needed as separate columns
columns_to_remove = ['vehicle_type', 'rate_code']
final_merged = final_merged.drop(*columns_to_remove)

In [None]:
# Remove the outlier data: only trips whose pickup month is one of the first
# four months should remain (the downloaded data covers January-April only).

# The temp view is still registered so it stays queryable via spark.sql
final_merged.createOrReplaceTempView('outlier')

# Same predicate as a SQL WHERE clause, applied through the DataFrame API
final_merged = final_merged.filter(
    "Month IN ('January', 'February', 'March', 'April')"
)

In [43]:
# Save the merged data.
# mode("overwrite") lets the notebook be re-run from a fresh kernel: without
# an explicit mode the write fails as soon as the output path already exists.
# NOTE(review): the output name is spelled ".paraquet"; it is kept unchanged
# because other notebooks may read this exact path -- rename both sides together.
final_merged.write.mode("overwrite").parquet("../data/curated/merged_testing.paraquet")

                                                                                