Run this Script To get All Raw Data processed into preprocessed Data



- IQR to plot outliers
- Use Median to avoid outliers

In [1]:
## Library

# Spark:
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings("ignore")
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col, datediff, round, lit, date_format, when, to_date
import pyspark.sql.functions as F
from os import path

from pyspark.sql.types import *
from pyspark.sql.functions import col


from pyspark import SparkContext
# create a spark session (which will run spark jobs)
spark = SparkSession.builder.getOrCreate()

sc = SparkContext.getOrCreate(conf=swan_spark_conf) #Start the spark context
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)


In [2]:
# Test Monitor
from pyspark import SparkContext
sc = SparkContext.getOrCreate(conf=swan_spark_conf) #Start the spark context
rdd = sc.parallelize([1, 2, 4, 8])
rdd.count()

4

In [3]:
holidays = [
    "2019-01-01",
    "2019-01-21",
    "2019-02-12",
    "2019-02-18",
    "2019-05-12",
    "2019-05-27",
]

In [4]:
### Type Converter

ints = ('VendorID', 'passenger_count', 'RateCodeID', 'RatecodeID','payment_type','PULocationID', 'DOLocationID')
doubles = ('trip_distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
           'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount','congestion_surcharge')
strings = ('hvfhs_license_num','dispatching_base_num','SR_Flag','store_and_fwd_flag', 'CarSpeed', 'EarnPerTime')
dtimes = ('pickup_datetime', 'dropoff_datetime', )

dtypes = {column: IntegerType() for column in ints}
dtypes.update({column: DoubleType() for column in doubles})
dtypes.update({column: StringType() for column in strings})
dtypes.update({column: TimestampType() for column in dtimes})

In [5]:

def removeOutlier_IQR(spark_df, columns):
    '''Use IQR to remove Outliers for each column in Spark Dataframe'''
    for column in columns:
        quantiles = spark_df.approxQuantile(column, [0.25,0.75], 0.01)
        Q1 = quantiles[0]
        Q3 = quantiles[1]
        IQR = Q3 - Q1
        lowerRange = Q1 - 1.5*IQR
        upperRange = Q3 + 1.5*IQR
        spark_df = spark_df.filter(col(column) > lowerRange).filter(col(column) < upperRange)
    return spark_df

# FHV Preprocessing

In [6]:
## HVF Data

spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

sdf_fhv = spark.read.csv('../raw_data/fhv_*', header=True)
schema = StructType()
for column in sdf_fhv.columns:
    schema.add(column, # column name
               dtypes[column], # data type
               True # is nullable?
              )

# Load Data with predefined Schema
sdf_fhv = spark.read.csv('../raw_data/fhv_*', header=True, schema=schema)


# Drop Null
sdf_fhv = sdf_fhv.filter(sdf_fhv.pickup_datetime>=lit('2019-01-01'))
sdf_fhv = sdf_fhv.filter(sdf_fhv.pickup_datetime<=lit('2019-05-31'))
sdf_fhv = sdf_fhv.filter(sdf_fhv.DOLocationID. isNotNull())
sdf_fhv = sdf_fhv.filter(sdf_fhv.PULocationID. isNotNull())
sdf_fhv = sdf_fhv.filter(sdf_fhv.dropoff_datetime. isNotNull())
sdf_fhv = sdf_fhv.filter(sdf_fhv.pickup_datetime. isNotNull())

sdf_fhv = sdf_fhv.withColumn("Shared", (sdf_fhv["SR_Flag"] != "null").cast("boolean"))
sdf_fhv = sdf_fhv.drop("SR_Flag")


## Add Duration Feature
sdf_fhv = sdf_fhv.withColumn('Duration',col("dropoff_datetime").cast("long") - col('pickup_datetime').cast("long"))
sdf_fhv = sdf_fhv.withColumn('Duration',round(col('Duration')/60,3).cast("int"))

## Filtering
sdf_fhv = sdf_fhv.filter(sdf_fhv.PULocationID > 0)\
        .filter(sdf_fhv.DOLocationID > 0)\
        .filter(sdf_fhv.Duration > 1)\
        .filter(sdf_fhv.Duration < 300)

sdf_fhv.printSchema()
sdf_fhv.dropna()
sdf_fhv.limit(5)

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- Shared: boolean (nullable = true)
 |-- Duration: integer (nullable = true)



dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,Shared,Duration
B00254,2019-01-01 00:33:03,2019-01-01 01:37:24,140,52,,64
B00254,2019-01-01 00:03:00,2019-01-01 00:34:25,141,237,,31
B00254,2019-01-01 00:45:48,2019-01-01 01:26:01,237,236,,40
B00254,2019-01-01 00:37:39,2019-01-01 01:44:59,162,85,,67
B00254,2019-01-01 00:35:06,2019-01-01 01:30:21,237,246,,55


### FHV Daily Data Generation

In [7]:
## Get Overall Data Frame For Each DAY

sdf_fhv_new = sdf_fhv.withColumn("trips_count", lit(1).cast("int"))
sdf_fhv_new = sdf_fhv_new.withColumn("date",
                      to_date(col("pickup_datetime"),"yyyy-MM-dd"))

sdf_fhv_data = sdf_fhv_new.groupby(sdf_fhv_new["date"]).sum()
sdf_fhv_data = sdf_fhv_data.withColumnRenamed("sum(trips_count)", "NumofPickUps")
sdf_fhv_data = sdf_fhv_data.withColumn("avg_duration", round(sdf_fhv_data["sum(Duration)"]/sdf_fhv_data["NumofPickUps"],2))
sdf_fhv_data = sdf_fhv_data.orderBy("date")

sdf_fhv_data = sdf_fhv_data.withColumn("isWeekend", 
                                   when(((date_format(sdf_fhv_data["date"], "E")) == "Sat") | 
                                    ((date_format(sdf_fhv_data["date"], "E")) == "Sun")|
                                        (date_format(sdf_fhv_data.date, 'yyyy-MM-dd').isin(holidays)), True).otherwise(False))
sdf_fhv_data = sdf_fhv_data.withColumn("DayOfWeek",date_format(sdf_fhv_data["date"], "E"))

sdf_fhv_data = sdf_fhv_data["date", "NumofPickUps", "avg_duration", "isWeekend", "DayOfWeek"]
sdf_fhv_data.limit(5)

date,NumofPickUps,avg_duration,isWeekend,DayOfWeek
2019-01-01,676778,16.77,True,Tue
2019-01-02,484909,18.14,False,Wed
2019-01-03,504065,18.23,False,Thu
2019-01-04,581338,17.92,False,Fri
2019-01-05,696771,16.81,True,Sat


## High Volumn FHV PreProcessing

In [8]:
## High Volumn FHV

sdf_fhvhv = spark.read.csv('../raw_data/fhvhv*', header=True)
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

schema = StructType()
for column in sdf_fhvhv.columns:
    schema.add(column, # column name
               dtypes[column], # data type
               True # is nullable?
              )

# Load Data with predefined Schema
sdf_fhvhv = spark.read.csv('../raw_data/fhvhv*', header=True, schema=schema)
sdf_fhvhv
sdf_fhvhv = sdf_fhvhv.withColumn("Shared", (sdf_fhvhv["SR_Flag"] == "null").cast("boolean"))
sdf_fhvhv = sdf_fhvhv.drop("SR_Flag")

# Drop Null
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.pickup_datetime>=lit('2019-02-01'))
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.pickup_datetime<=lit('2019-05-31'))
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.DOLocationID.isNotNull())
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.PULocationID.isNotNull())
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.dropoff_datetime.isNotNull())
sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.pickup_datetime.isNotNull())


## Add Duration Feature
sdf_fhvhv = sdf_fhvhv.withColumn('Duration',col("dropoff_datetime").cast("long") - col('pickup_datetime').cast("long"))
sdf_fhvhv = sdf_fhvhv.withColumn('Duration',round(col('Duration')/60,3).cast("int"))

sdf_fhvhv.limit(2)


hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,Shared,Duration
HV0003,B02867,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,,9
HV0003,B02879,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,,8


In [9]:
## Filtering

sdf_fhvhv = sdf_fhvhv.filter(sdf_fhvhv.PULocationID > 0)\
        .filter(sdf_fhvhv.DOLocationID > 0)\
        .filter(sdf_fhvhv.Duration > 1)\
        .filter(sdf_fhvhv.Duration < 300)


#print(f"sdf_fhvhv Rows: {sdf_fhvhv.count()}")
sdf_fhvhv.limit(2)

hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,Shared,Duration
HV0003,B02867,2019-02-01 00:05:18,2019-02-01 00:14:57,245,251,,9
HV0003,B02879,2019-02-01 00:41:29,2019-02-01 00:49:39,216,197,,8


## HV FHV Daily Data Generate

In [10]:
## Get Overall Data Frame For Each DAY

sdf_fhvhv_new = sdf_fhvhv.withColumn("trips_count", lit(1).cast("int"))
sdf_fhvhv_new = sdf_fhvhv_new.withColumn("date",
                      to_date(col("pickup_datetime"),"yyyy-MM-dd"))

sdf_fhvhv_data = sdf_fhvhv_new.groupby(sdf_fhvhv_new["date"]).sum()
sdf_fhvhv_data = sdf_fhvhv_data.withColumnRenamed("sum(trips_count)", "NumofPickUps")
sdf_fhvhv_data = sdf_fhvhv_data.withColumn("avg_duration", round(sdf_fhvhv_data["sum(Duration)"]/sdf_fhvhv_data["NumofPickUps"],2))
sdf_fhvhv_data = sdf_fhvhv_data.orderBy("date")

sdf_fhvhv_data = sdf_fhvhv_data.withColumn("isWeekend", 
                                   when(((date_format(sdf_fhvhv_data["date"], "E")) == "Sat") | 
                                    ((date_format(sdf_fhvhv_data["date"], "E")) == "Sun")|
                                        (date_format(sdf_fhvhv_data.date, 'yyyy-MM-dd').isin(holidays)), True).otherwise(False))
sdf_fhvhv_data = sdf_fhvhv_data.withColumn("DayOfWeek",date_format(sdf_fhvhv_data["date"], "E"))

sdf_fhvhv_data = sdf_fhvhv_data["date", "NumofPickUps", "avg_duration", "isWeekend", "DayOfWeek"]
sdf_fhvhv_data.limit(5)

date,NumofPickUps,avg_duration,isWeekend,DayOfWeek
2019-02-01,858400,18.68,False,Fri
2019-02-02,864783,17.74,True,Sat
2019-02-03,755193,17.05,True,Sun
2019-02-04,591207,18.95,False,Mon
2019-02-05,575586,18.57,False,Tue


## Yellow Taxi PreProcessing

### Schema 

In [11]:
## Yellow Taxi

#sdf_yellow.describe("congestion_surcharge").show()

sdf_yellow = spark.read.csv('../raw_data/yellow*', header=True)\
                .withColumnRenamed("tpep_pickup_datetime","pickup_datetime")\
                .withColumnRenamed("tpep_dropoff_datetime","dropoff_datetime") # rename the wrong column
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

schema = StructType()
for column in sdf_yellow.columns:
    schema.add(column, # column name
               dtypes[column], # data type
               True # is nullable?
              )
    
sdf_yellow = spark.read.csv('../raw_data/yellow*', header=True, schema=schema)

sdf_yellow = sdf_yellow.withColumn("store_and_fwd", (sdf_yellow["store_and_fwd_flag"] == "Y").cast("boolean"))
sdf_yellow = sdf_yellow.drop("store_and_fwd_flag")
#sdf_yellow.printSchema()
#print(f"row Count before Preprocessing: {sdf_yellow.count()}")
sdf_yellow.limit(2)

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,store_and_fwd
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,False
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,False


### Feature Engineering

- Add Duration in minutes

In [12]:
## Add Duration Label in rounded minutes
sdf_yellow = sdf_yellow.withColumn('Duration',col("dropoff_datetime").cast("long") - col('pickup_datetime').cast("long"))

# Car Speed is Miles/Hour
sdf_yellow = sdf_yellow.withColumn('Duration',round(col('Duration')/60,3).cast("int"))
sdf_yellow = sdf_yellow.withColumn('CarSpeed',round(col('trip_distance')/col('Duration')*60,3).cast("double"))
sdf_yellow = sdf_yellow.withColumn('EarnPerTime',round(col('total_amount')/col('Duration'),3).cast("double"))
sdf_yellow = sdf_yellow.withColumn("isWeekend", 
                                   when(((date_format(sdf_yellow["pickup_datetime"], "E")) == "Sat") | 
                                    ((date_format(sdf_yellow["pickup_datetime"], "E")) == "Sun") |
                                        (date_format(sdf_yellow["pickup_datetime"], 'yyyy-MM-dd').isin(holidays)), True).otherwise(False))
sdf_yellow.limit(2)


VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,store_and_fwd,Duration,CarSpeed,EarnPerTime,isWeekend
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,False,6,15.0,1.658,True
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,False,19,8.211,0.858,True


### Data Cleanse
Use Basic Knowledge to do data Scleansing

Some Data is Faulty:

- fare = 0
- pick up dropoff time reversed, distance too large




In [13]:
# Only keep 2019 data
# Remove Trip distance < 0.5 Miles
# Remove Trip distance > 150 Miles
# Remove Passenger count = 0
# Remove fare amount = 0
# Remove Duration < 1 mins
# Remove Duration > 10 hours
# Remove Where Extra charge < 0 
# Remove Total Amount < $1
# Filter Null Value for pick up location
# Filter Null Value for Dropoff location
sdf_yellow = sdf_yellow\
    .filter(sdf_yellow.pickup_datetime >= lit("2019-01-01"))\
    .filter(sdf_yellow.pickup_datetime <= lit("2019-05-31"))\
    .filter(sdf_yellow.trip_distance > 0.5)\
    .filter(sdf_yellow.passenger_count > 0)\
    .filter(sdf_yellow.fare_amount > 0)\
    .filter(sdf_yellow.Duration > 1)\
    .filter(sdf_yellow.extra >= 0 )\
    .filter(sdf_yellow.PULocationID.isNotNull())\
    .filter(sdf_yellow.DOLocationID.isNotNull())\
    .filter((sdf_yellow.payment_type == 1) | (sdf_yellow.payment_type == 2))

# filter(sdf_yellow.trip_distance < 150)
# filter(sdf_yellow.Duration < 600)

#print(f"Rows before Preprocessing: {sdf_yellow.count()}")

# Use IQR for fare_amount, DurationEarn/Miles and Car Speed As well
sdf_yellow = removeOutlier_IQR(sdf_yellow, ["trip_distance"])
sdf_yellow = removeOutlier_IQR(sdf_yellow, ["total_amount", "Duration"])
sdf_yellow.limit(5)

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,store_and_fwd,Duration,CarSpeed,EarnPerTime,isWeekend
1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,,False,6,15.0,1.658,True
1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,,False,19,8.211,0.858,True
1,2019-01-01 00:21:28,2019-01-01 00:28:37,1,1.3,1,163,229,1,6.5,0.5,0.5,1.25,0.0,0.3,9.05,,False,7,11.143,1.293,True
1,2019-01-01 00:32:01,2019-01-01 00:45:39,1,3.7,1,229,7,1,13.5,0.5,0.5,3.7,0.0,0.3,18.5,,False,13,17.077,1.423,True
1,2019-01-01 00:57:32,2019-01-01 01:09:32,2,2.1,1,141,234,1,10.0,0.5,0.5,1.7,0.0,0.3,13.0,,False,12,10.5,1.083,True


### Engineer For Daily Data

In [14]:
## Get Overall Data Frame For Each DAY

sdf_yellow_new = sdf_yellow.withColumn("trips_count", lit(1).cast("int"))
sdf_yellow_new = sdf_yellow_new.withColumn("date",
                      to_date(col("pickup_datetime"),"yyyy-MM-dd"))

sum_yellow_data = sdf_yellow_new.groupby(sdf_yellow_new["date"]).sum()
sum_yellow_data = sum_yellow_data.withColumnRenamed("sum(trips_count)", "NumofPickUps")
sum_yellow_data = sum_yellow_data.withColumn("avg_speed", round(sum_yellow_data["sum(CarSpeed)"]/sum_yellow_data["NumofPickUps"],5))
sum_yellow_data = sum_yellow_data.withColumn("avg_earn", round(sum_yellow_data["sum(total_amount)"]/sum_yellow_data["NumofPickUps"],3))
sum_yellow_data = sum_yellow_data.withColumn("avg_duration", round(sum_yellow_data["sum(Duration)"]/sum_yellow_data["NumofPickUps"],3))
sum_yellow_data = sum_yellow_data.withColumn("avg_fare_amount", round(sum_yellow_data["sum(fare_amount)"]/sum_yellow_data["NumofPickUps"],3))
sum_yellow_data = sum_yellow_data.withColumn("avg_trip_distance", round(sum_yellow_data["sum(trip_distance)"]/sum_yellow_data["NumofPickUps"],5))
sum_yellow_data = sum_yellow_data.orderBy("date")

sum_yellow_data = sum_yellow_data.withColumn("isWeekend", 
                                   when(((date_format(sum_yellow_data["date"], "E")) == "Sat") | 
                                    ((date_format(sum_yellow_data["date"], "E")) == "Sun") | 
                                        (date_format(sum_yellow_data["date"], 'yyyy-MM-dd').isin(holidays)), True).otherwise(False))

sum_yellow_data = sum_yellow_data.withColumn("DayOfWeek",date_format(sum_yellow_data["date"], "E"))

sum_yellow_data = sum_yellow_data["date", "NumofPickUps", "avg_duration", "avg_trip_distance", "avg_earn", "avg_speed", "avg_fare_amount", "isWeekend", "DayOfWeek"]
#sum_yellow_data.limit(5)

## Green Taxi PreProcessing

- Drop column ehail_fee as all the values are null
- "trip_type" Drop this column as this attribute not appeared in green taxi

In [15]:
## Green Taxi

#sdf_green.describe("congestion_surcharge").show()

sdf_green = spark.read.csv('../raw_data/green*', header=True)\
                .withColumnRenamed("lpep_pickup_datetime","pickup_datetime")\
                .withColumnRenamed("lpep_dropoff_datetime","dropoff_datetime") # rename the wrong column
sdf_green = sdf_green.drop("ehail_fee")  # Drop this column as all the values are null
sdf_green = sdf_green.drop("trip_type")  # Drop this column as this attribute not appeared in green taxi
sdf_green = sdf_green.drop("improvement_surcharge")
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

schema = StructType()
for column in sdf_green.columns:
    schema.add(column, # column name
               dtypes[column], # data type
               True # is nullable?
              )
    
sdf_green = spark.read.csv('../raw_data/green*', header=True, schema=schema)

sdf_green = sdf_green.withColumn("store_and_fwd", (sdf_green["store_and_fwd_flag"] == "Y").cast("boolean"))
sdf_green = sdf_green.drop("store_and_fwd_flag")
#sdf_green.printSchema()
print(f"Rows before Preprocessing: {sdf_green.count()}")
#sdf_green.limit(5)

Rows before Preprocessing: 2196066


## Feature Engineering add duration

In [16]:
## Add Duration Label in rounded minutes
sdf_green = sdf_green.withColumn('Duration',col("dropoff_datetime").cast("long") - col('pickup_datetime').cast("long"))

sdf_green = sdf_green.withColumn('Duration',round(col('Duration')/60,3).cast("int"))
sdf_green = sdf_green.withColumn('CarSpeed',round(col('trip_distance')/col('Duration')*60,3).cast("double"))
sdf_green = sdf_green.withColumn('EarnPerTime',round(col('total_amount')/col('Duration'),3).cast("double"))
sdf_green.limit(2)


VendorID,pickup_datetime,dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,payment_type,congestion_surcharge,store_and_fwd,Duration,CarSpeed,EarnPerTime
2,2019-02-01 00:10:19,2019-02-01 00:21:43,1,92,135,1,2.79,11.0,0.5,0.5,3.08,0.0,,,15.38,False,11,15.218,
2,2019-02-01 00:02:16,2019-02-01 00:24:37,1,66,36,1,4.46,17.5,0.5,0.5,3.76,0.0,,,22.56,False,22,12.164,


## Cleanse

In [17]:
# .filter(sdf_green.pickup_datetime >= lit("2019-01-01"))\
# Remove Trip distance < 0.1 Miles
# Remove Trip distance > 150 Miles
# Remove Passenger count = 0
# Remove fare amount = 0
# Remove Duration < 1 mins
# Remove Duration > 10 hours
# Remove Where Extra charge < 0 
# Remove Total Amount < $1
# Filter Null Value for pick up location
# Filter Null Value for Dropoff location
sdf_green = sdf_green\
    .filter(sdf_green.pickup_datetime >= lit("2019-01-01"))\
    .filter(sdf_green.pickup_datetime <= lit("2019-05-31"))\
    .filter(sdf_green.trip_distance > 0.5)\
    .filter(sdf_green.passenger_count > 0)\
    .filter(sdf_green.fare_amount > 0)\
    .filter(sdf_green.Duration > 1)\
    .filter(sdf_green.Duration < 600)\
    .filter(sdf_green.extra >= 0 )\
    .filter(sdf_green.PULocationID.isNotNull() )\
    .filter(sdf_green.DOLocationID.isNotNull())

sdf_green = removeOutlier_IQR(sdf_green, ["trip_distance"])

#print(f"Rows after Preprocessing: {sdf_green.count()}")

## Green Daily Data

In [18]:
## Get Overall Data Frame For Each DAY

sdf_green_new = sdf_green.withColumn("trips_count", lit(1).cast("int"))
sdf_green_new = sdf_green_new.withColumn("date",
                      to_date(col("pickup_datetime"),"yyyy-MM-dd"))

sum_green_data = sdf_green_new.groupby(sdf_green_new["date"]).sum()
sum_green_data = sum_green_data.withColumnRenamed("sum(trips_count)", "NumofPickUps")
sum_green_data = sum_green_data.withColumn("avg_speed", round(sum_green_data["sum(CarSpeed)"]/sum_green_data["NumofPickUps"],5))
sum_green_data = sum_green_data.withColumn("avg_earn", round(sum_green_data["sum(total_amount)"]/sum_green_data["NumofPickUps"],3))
sum_green_data = sum_green_data.withColumn("avg_duration", round(sum_green_data["sum(Duration)"]/sum_green_data["NumofPickUps"],3))
sum_green_data = sum_green_data.withColumn("avg_fare_amount", round(sum_green_data["sum(fare_amount)"]/sum_green_data["NumofPickUps"],3))
sum_green_data = sum_green_data.withColumn("avg_trip_distance", round(sum_green_data["sum(trip_distance)"]/sum_green_data["NumofPickUps"],5))
sum_green_data = sum_green_data.orderBy("date")

sum_green_data = sum_green_data.withColumn("isWeekend", 
                                   when(((date_format(sum_green_data["date"], "E")) == "Sat") | 
                                    ((date_format(sum_green_data["date"], "E")) == "Sun") | 
                                        (date_format(sum_green_data["date"], 'yyyy-MM-dd').isin(holidays)), True).otherwise(False))

sum_green_data = sum_green_data.withColumn("DayOfWeek",date_format(sum_green_data["date"], "E"))

sum_green_data = sum_green_data["date", "NumofPickUps", "avg_duration", "avg_trip_distance", "avg_earn", "avg_speed", "avg_fare_amount", "isWeekend", "DayOfWeek"]
#sum_green_data.limit(5)

## Save Processed Data To Local

In [19]:
# Check to see if the fpath already exists. If so, remove it.
from shutil import rmtree
from os import path

fpath = '../preprocessed_data/yellow.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_yellow.write.format('parquet').save(fpath)

fpath = '../preprocessed_data/yellow_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sum_yellow_data.write.format('parquet').save(fpath)
    
    
fpath = '../preprocessed_data/green.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_green.write.format('parquet').save(fpath)

fpath = '../preprocessed_data/green_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sum_green_data.write.format('parquet').save(fpath)
    
fpath = '../preprocessed_data/fhv.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_fhv.write.format('parquet').save(fpath)

fpath = '../preprocessed_data/fhv_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_fhv_data.write.format('parquet').save(fpath)

fpath = '../preprocessed_data/hvfhv.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_fhvhv.write.format('parquet').save(fpath)

fpath = '../preprocessed_data/hvfhv_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sdf_fhvhv_data.write.format('parquet').save(fpath)

In [20]:
fpath = '../preprocessed_data/green_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sum_green_data.write.format('parquet').save(fpath)
fpath = '../preprocessed_data/yellow_daily.parquet/'
if path.exists(fpath):
    rmtree(fpath)
sum_yellow_data.write.format('parquet').save(fpath)