### Joining the merged testing dataset with COVID-19 data

In [1]:
# Import the libraries 

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import isnan, when, count, col
import pandas as pd
import geopandas as gpd
import folium



In [2]:
# Create (or reuse) the Spark session that runs every Spark job in this notebook.
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # The key must be spelled exactly "spark.sql.session.timeZone": Spark stores
    # unknown keys without complaint, so the previous misspelling ("timeZon")
    # silently left the session on the machine's local time zone.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/21 18:08:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/21 18:08:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/21 18:08:38 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/21 18:08:38 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/08/21 18:08:38 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/08/21 18:08:38 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


In [None]:
# Read the curated, merged testing dataset from parquet.
# NOTE(review): the directory name is spelled ".paraquet" -- confirm this
# matches what the upstream notebook actually wrote before renaming anything.
merged_test = spark.read.format("parquet").load(
    "../data/curated/merged_testing.paraquet/"
)

In [7]:
# Read the zone lookup CSV, treating its first row as the column header.
zones = spark.read.csv("../data/raw/nyc.csv", header=True)


In [None]:
# Match every trip's pickup location id against the zone lookup, then discard
# the lookup columns that are not needed afterwards. Any other column coming
# from `zones` is kept (presumably Borough, which a later cell reads).
merged_test = (
    merged_test
    .join(
        zones,
        on=merged_test["PULocationID"] == zones["LocationID"],
        how="inner",
    )
    .drop("LocationID", "Zone", "service_zone")
)

In [9]:
# Read the curated COVID-19 dataset that accompanies the test data.
covid_data = spark.read.format("parquet").load(
    "../data/curated/covid_curated_test.parquet/"
)

In [10]:
# Order the testing data by Month first and by Date within each month
# (both ascending).
merged_test = merged_test.sort(
    F.col("Month").asc(),
    F.col("Date").asc(),
)

In [None]:
# Build the join key shared with the COVID-19 data: Month, Date and Borough
# concatenated, in that order, into a single "day_and_date" column.
# NOTE(review): the parts are joined without a separator, so if Month/Date are
# not zero-padded, different dates can yield the same key (month 1 / day 11 vs
# month 11 / day 1). The format must match covid_data.day_and_date, which is
# built elsewhere -- confirm there before changing it.
merged_test = merged_test.withColumn(
    "day_and_date",
    F.concat(*[F.col(part) for part in ("Month", "Date", "Borough")]),
)


In [None]:
# Assemble the final test dataset: keep only rows whose key appears in both
# frames, then remove the join key and the listed "*_dup" columns.
final_test_data = (
    merged_test
    .join(
        covid_data,
        on=merged_test["day_and_date"] == covid_data["day_and_date"],
        how="inner",
    )
    .drop("day_and_date", "Borough_dup", "Year_dup", "Month_dup", "Day_dup")
)

In [14]:
# Save the final test data as parquet.
# The writer's default mode ("errorifexists") makes this cell fail on every
# run after the first, which breaks Restart & Run All. The output is fully
# derived from the inputs above, so replacing it on each run is safe.
final_test_data.write.mode("overwrite").parquet(
    "../data/curated/final_test_dataset.parquet"
)

                                                                                