In [1]:
# Import libraries

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import isnan, when, count, col
import pandas as pd
import geopandas as gpd
import folium
from pyspark.sql.functions import unix_timestamp, from_unixtime
from pyspark.sql.functions import date_format



In [2]:
# Create the Spark session that runs every Spark job in this notebook.
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # The key must be spelled exactly "spark.sql.session.timeZone": Spark
    # accepts unknown keys silently, so the previous misspelling
    # ("timeZon") left the session on the machine's local time zone.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

22/08/21 23:10:02 WARN Utils: Your hostname, MacBook-Air-3.local resolves to a loopback address: 127.0.0.1; using 192.168.0.66 instead (on interface en0)
22/08/21 23:10:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/21 23:10:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/21 23:10:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/21 23:10:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/21 23:10:03 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [None]:
# Read the merged training set produced earlier in the pipeline.
# NOTE(review): the directory name is spelled "paraquet"; presumably this
# matches what the upstream step wrote to disk - confirm before renaming.
merged_train_path = '../data/curated/merged_training.paraquet/'
merged_train = spark.read.parquet(merged_train_path)

In [5]:
# Read the taxi-zone shapefile (geopandas) and the zone lookup table (Spark).
taxi_zone_shapefile = "../data/raw/taxi_zones.shp"
zone_lookup_csv = "../data/raw/nyc.csv"

sf = gpd.read_file(taxi_zone_shapefile)
# The first row of the CSV holds the column names.
zones = spark.read.csv(zone_lookup_csv, header=True)


In [None]:
# Attach the zone lookup to each trip via its pick-up location, then drop
# the lookup columns that are not needed afterwards.
pickup_matches_zone = merged_train.PULocationID == zones.LocationID

merged_train = (
    merged_train
    .join(zones, on=pickup_matches_zone, how="inner")
    .drop('LocationID', 'Zone', "service_zone")
)

### Cleaning the external Covid-19 dataset

In [3]:
# Read the county-level Covid-19 CSV; its first row holds the column names.
covid_county_csv = "../data/raw/covid_county.csv"
covid_data_county = spark.read.csv(covid_county_csv, header=True)

In [5]:
# Translate county names into the borough labels used by the taxi data.
# Counties that are not listed get a null Borough (there is no otherwise()).
# NOTE(review): 'Essex' is mapped to 'EWR'; confirm that the Essex rows in
# this file really describe the county containing Newark airport.
county_borough_pairs = [
    ('Bronx', 'Bronx'),
    ('Kings', 'Brooklyn'),
    ('New York', 'Manhattan'),
    ('Queens', 'Queens'),
    ('Richmond', 'Staten Island'),
    ('Essex', 'EWR'),
]

borough_expr = None
for county_name, borough_name in county_borough_pairs:
    is_county = F.col('County') == county_name
    if borough_expr is None:
        # First branch starts the CASE WHEN expression.
        borough_expr = when(is_county, borough_name)
    else:
        borough_expr = borough_expr.when(is_county, borough_name)

covid_data_county = covid_data_county.withColumn('Borough', borough_expr)

In [None]:
# Count null / NaN entries in the columns that are used later on.
cols = ['Test Date', 'New Positives', 'Test % Positive']

missing_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in cols
]

# Last expression of the cell, so the one-row summary is displayed.
covid_data_county.select(missing_counts)

In [None]:
# Rename the raw columns to the short snake_case names used downstream.
column_renames = {
    'New Positives': "covid_cases",
    'Total Number of Tests Performed': "total_tests",
    'Test % Positive': "positivity_rate",
}
for raw_name, clean_name in column_renames.items():
    covid_data_county = covid_data_county.withColumnRenamed(raw_name, clean_name)

# Keep only the columns that are needed and parse 'Test Date' (month/day/year
# text) into a timestamp string stored in a new 'Date' column.
# The year pattern is the canonical four-letter 'yyyy'; the earlier
# three-letter 'yyy' was a typo.
covid_data_county = covid_data_county.select(
    "Test Date", "covid_cases", "Borough", "total_tests", "positivity_rate",
    from_unixtime(unix_timestamp('Test Date', 'MM/dd/yyyy')).alias('Date')
)


In [None]:
# Turn the timestamp string into a real TIMESTAMP value.
# NOTE(review): the target name "date" differs from the existing "Date"
# column only in case; presumably it replaces that column under Spark's
# default case-insensitive name resolution - confirm.
date_as_timestamp = F.col("Date").cast('TIMESTAMP')
covid_data_county = covid_data_county.withColumn("date", date_as_timestamp)

In [None]:
# Derive Year, Month (full month name) and Day (two digits) as text columns
# from the timestamp, in that order.
date_part_patterns = {
    "Year": 'yyyy',
    "Month": 'MMMM',
    "Day": 'dd',
}
for part_name, pattern in date_part_patterns.items():
    covid_data_county = covid_data_county.withColumn(
        part_name, date_format('date', pattern)
    )

# The timestamp itself is no longer needed once its parts are extracted.
covid_data_county = covid_data_county.drop('date')

In [17]:
# Sort the covid and merged training datasets by month, then by day.
# In the covid data, Month holds the month name as text, so this ordering
# is alphabetical rather than chronological.
covid_sort_keys = [col("Month").asc(), col("Day").asc()]
covid_data_county = covid_data_county.orderBy(*covid_sort_keys)

train_sort_keys = [col("Month").asc(), col("Date").asc()]
merged_train = merged_train.orderBy(*train_sort_keys)

In [None]:
# Build a shared join key, "day_and_date", by concatenating month, day of
# month and borough in both datasets.
# NOTE(review): the covid 'Day' values are zero-padded two-digit strings;
# the keys only match if merged_train's 'Date' uses the same format - confirm.
train_key_parts = [F.col(name) for name in ("Month", "Date", "Borough")]
merged_train = merged_train.withColumn(
    "day_and_date", F.concat(*train_key_parts)
)

covid_key_parts = [F.col(name) for name in ("Month", "Day", "Borough")]
covid_data_county = covid_data_county.withColumn(
    "day_and_date", F.concat(*covid_key_parts)
)

In [None]:
# Count null / NaN entries in every column of the covid dataset.
null_or_nan_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in covid_data_county.columns
]

# Last expression of the cell, so the one-row summary is displayed.
covid_data_county.select(null_or_nan_counts)

In [21]:
# Extract the numeric part of the positivity rate (e.g. "12.34%" -> "12.34").
# The pattern accepts any number of integer digits and an optional fraction;
# the earlier pattern allowed only one digit before the decimal point, so
# rates of 10% or more were truncated ("12.34%" became "2.34").
covid_data_county = covid_data_county.withColumn(
    "positivity_rate",
    # idx=0 returns the whole match; a value with no digits yields ''.
    F.regexp_extract(F.col("positivity_rate"), r'\d+(?:\.\d+)?', 0)
)

In [None]:
# Converting the positivity rate column to a double

covid_data_county = covid_data_county.withColumn(
    "positivity_rate",

    F.col( "positivity_rate").cast('DOUBLE')
)

# Converting the other numeric columns to integer

for field in ('covid_cases', 'total_tests'):

    covid_data_county = covid_data_county.withColumn(
        field,

        F.col(field).cast('INT')
    )

In [23]:
# Suffix the covid columns whose names also exist in the training data with
# "_dup" so they stay distinguishable after the join.
for shared_name in ("Borough", "Year", "Month", "Day"):
    covid_data_county = covid_data_county.withColumnRenamed(
        shared_name, f"{shared_name}_dup"
    )


In [24]:
# Split the covid dataset by year: 2021 rows feed the training set and 2022
# rows feed the test set. Rows without a borough are discarded in both.
covid_data_county.createOrReplaceTempView("covid_temp")

# One query template shared by both splits; only the year differs.
year_split_query = """

SELECT 
    *
FROM 
    covid_temp
WHERE 
    Year_dup = {year} AND Borough_dup IS NOT NULL

"""

covid_data_county_train = spark.sql(year_split_query.format(year=2021))
covid_data_county_test = spark.sql(year_split_query.format(year=2022))

### Merging the Covid-19 and training dataset

In [None]:
# Join the training trips with the 2021 covid rows on the shared
# month/day/borough key, then drop the key and the duplicated covid columns.
same_day_and_borough = (
    merged_train.day_and_date == covid_data_county_train.day_and_date
)

final_train_data = (
    merged_train
    .join(covid_data_county_train, on=same_day_and_borough, how="inner")
    .drop("day_and_date", "Borough_dup", "Year_dup", "Month_dup", "Day_dup")
)

In [26]:
# Store the pick-up and drop-off location ids as strings.
for prefix in ('PU', 'DO'):
    location_column = f'{prefix}LocationID'
    id_as_string = F.col(location_column).cast('STRING')
    final_train_data = final_train_data.withColumn(location_column, id_as_string)

In [27]:
# Save the final training dataset.
# Overwrite any previous output: with no save mode the write raises when the
# target already exists, so the notebook could not be re-run end to end.
final_train_data.write.mode("overwrite").parquet(
    "../data/curated/final_train_dataset.parquet"
)

                                                                                

In [28]:
# Save the curated covid datasets for train and test.
# Overwrite any previous output: with no save mode the write raises when the
# target already exists, so the notebook could not be re-run end to end.
covid_data_county_train.write.mode("overwrite").parquet(
    "../data/curated/covid_curated_train.parquet"
)
covid_data_county_test.write.mode("overwrite").parquet(
    "../data/curated/covid_curated_test.parquet"
)