#### Install pyspark library under this venv using command
<code>python -m pip install pyspark</code>

#### Install jupyter notebook under this venv
<code>python -m pip install notebook</code>

#### Import SparkSession Class from pyspark.sql

In [8]:
from pyspark.sql import SparkSession

#### Create a SparkSession Object to start working on Dataframes using local mode

In [10]:
# Build (or reuse) a SparkSession with master "local", registered under the
# application name "COVID-19-Tracker".
spark = (
    SparkSession.builder
    .master("local")
    .appName("COVID-19-Tracker")
    .getOrCreate()
)

# Sanity check: print the type of the session object.
print(type(spark))

<class 'pyspark.sql.session.SparkSession'>


#### Import Public Health Infobase data into phi_df Dataframe

In [18]:
# Load the Public Health Infobase CSV. The first line is treated as the header
# and Spark infers each column's type from the data.
phi_df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("CA__covid19__latest.csv")
)

# Peek at the first five rows.
phi_df.take(5)

[Row(pruid=35, prname='Ontario', prnameFR='Ontario', date='31-01-2020', update=None, numconf=3, numprob=0, numdeaths=0, numtotal=3, numtested=None, numtests=None, numrecover=None, percentrecover=None, ratetested=None, ratetests=None, numtoday=3, percentoday=300.0, ratetotal=0.02, ratedeaths=0.0, numdeathstoday=0.0, percentdeath=0.0, numtestedtoday=None, numteststoday=None, numrecoveredtoday=None, percentactive=100.0, numactive=3, rateactive=0.02, numtotal_last14=None, ratetotal_last14=None, numdeaths_last14=None, ratedeaths_last14=None, numtotal_last7=None, ratetotal_last7=None, numdeaths_last7=None, ratedeaths_last7=None, avgtotal_last7=None, avgincidence_last7=None, avgdeaths_last7=None, avgratedeaths_last7=None),
 Row(pruid=59, prname='British Columbia', prnameFR='Colombie-Britannique', date='31-01-2020', update=None, numconf=1, numprob=0, numdeaths=0, numtotal=1, numtested=None, numtests=None, numrecover=None, percentrecover=None, ratetested=None, ratetests=None, numtoday=1, percen

#### Select the useful attributes from our PHI dataframe and sort the result

In [61]:
# Keep only the date, province, confirmed-case and death columns, renaming each
# with a phi_ prefix, then sort by province, confirmed count and date.
# NOTE(review): phi_Date is still a 'dd-MM-yyyy' string at this point, so the
# date tie-break sorts lexically rather than chronologically - confirm that
# this is acceptable.
phi_df1 = (
    phi_df
    .select(
        phi_df["date"].alias("phi_Date"),
        phi_df["prname"].alias("phi_Province"),
        phi_df["numconf"].alias("phi_Confirmed"),
        phi_df["numdeaths"].alias("phi_Deaths"),
    )
    .orderBy("phi_Province", "phi_Confirmed", "phi_Date")
)

# Peek at the first five rows.
phi_df1.take(5)

[Row(phi_Date='08-03-2020', phi_Province='Alberta', phi_Confirmed=1, phi_Deaths=0),
 Row(phi_Date='09-03-2020', phi_Province='Alberta', phi_Confirmed=7, phi_Deaths=0),
 Row(phi_Date='11-03-2020', phi_Province='Alberta', phi_Confirmed=14, phi_Deaths=0),
 Row(phi_Date='12-03-2020', phi_Province='Alberta', phi_Confirmed=19, phi_Deaths=0),
 Row(phi_Date='13-03-2020', phi_Province='Alberta', phi_Confirmed=23, phi_Deaths=0)]

#### Import date_format and to_date functions

In [50]:
from pyspark.sql.functions import date_format, to_date

#### Update date attribute in our PHI dataframe to yyyy-MM-dd format

In [60]:
# Parse the dd-MM-yyyy strings into dates, then render them back as
# yyyy-MM-dd strings, overwriting the phi_Date column.
phi_df2 = phi_df1.withColumn(
    "phi_Date",
    date_format(
        to_date(phi_df1["phi_Date"], "dd-MM-yyyy"),
        "yyyy-MM-dd",
    ),
)

# Peek at the first five rows.
phi_df2.take(5)

[Row(phi_Date='2020-03-08', phi_Province='Alberta', phi_Confirmed=1, phi_Deaths=0),
 Row(phi_Date='2020-03-09', phi_Province='Alberta', phi_Confirmed=7, phi_Deaths=0),
 Row(phi_Date='2020-03-11', phi_Province='Alberta', phi_Confirmed=14, phi_Deaths=0),
 Row(phi_Date='2020-03-12', phi_Province='Alberta', phi_Confirmed=19, phi_Deaths=0),
 Row(phi_Date='2020-03-13', phi_Province='Alberta', phi_Confirmed=23, phi_Deaths=0)]

#### Filter the PHI dataframe to exclude rows whose province is Repatriated travellers or Canada

In [64]:
# Drop the rows whose province is 'Repatriated travellers' or 'Canada'.
phi_df3 = phi_df2.filter(
    "phi_Province NOT IN ('Repatriated travellers', 'Canada')"
)

# List the distinct provinces that remain.
phi_df3.select("phi_Province").distinct().show()

+--------------------+
|        phi_Province|
+--------------------+
|            Manitoba|
|               Yukon|
|         Nova Scotia|
|Northwest Territo...|
|Newfoundland and ...|
|             Alberta|
|             Nunavut|
|       New Brunswick|
|        Saskatchewan|
|Prince Edward Island|
|             Ontario|
|    British Columbia|
|              Quebec|
+--------------------+



#### Import Johns Hopkins data into jh_df Dataframe

In [21]:
# Load the Johns Hopkins time-series CSV with an explicit schema. The header
# line is skipped, and columns are mapped to the schema by position.
# NOTE(review): the sample rows show jh_Lat/jh_Long as 0.0 or None and
# jh_Recovered/jh_Deaths as None on every row, which suggests the file may have
# fewer columns than this schema (e.g. no Lat/Long), so values may be landing
# in the wrong fields - verify against the CSV header.
jh_df = (
    spark.read
    .format("csv")
    .schema("jh_Date date, jh_Country string, jh_Province string, jh_Lat double, jh_Long double, jh_Confirmed            integer, jh_Recovered integer, jh_Deaths integer")
    .option("header", "true")
    .load("time-series-19-covid-combined.csv")
)

# Peek at the first five rows.
jh_df.take(5)

[Row(jh_Date=datetime.date(2020, 1, 22), jh_Country='Afghanistan', jh_Province=None, jh_Lat=0.0, jh_Long=0.0, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 23), jh_Country='Afghanistan', jh_Province=None, jh_Lat=0.0, jh_Long=0.0, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 24), jh_Country='Afghanistan', jh_Province=None, jh_Lat=0.0, jh_Long=0.0, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 25), jh_Country='Afghanistan', jh_Province=None, jh_Lat=0.0, jh_Long=0.0, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 26), jh_Country='Afghanistan', jh_Province=None, jh_Lat=0.0, jh_Long=0.0, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None)]

#### Filter Johns Hopkins dataframe for Canada records only

In [59]:
# Keep only the rows whose country is Canada.
jh_df1 = jh_df.where(jh_df["jh_Country"] == "Canada")

# Peek at the first five rows.
jh_df1.take(5)

[Row(jh_Date=datetime.date(2020, 1, 22), jh_Country='Canada', jh_Province='Alberta', jh_Lat=0.0, jh_Long=None, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 23), jh_Country='Canada', jh_Province='Alberta', jh_Lat=0.0, jh_Long=None, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 24), jh_Country='Canada', jh_Province='Alberta', jh_Lat=0.0, jh_Long=None, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 25), jh_Country='Canada', jh_Province='Alberta', jh_Lat=0.0, jh_Long=None, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None),
 Row(jh_Date=datetime.date(2020, 1, 26), jh_Country='Canada', jh_Province='Alberta', jh_Lat=0.0, jh_Long=None, jh_Confirmed=0, jh_Recovered=None, jh_Deaths=None)]

#### Filter the Johns Hopkins dataframe to exclude rows whose province is Repatriated Travellers, Grand Princess or Diamond Princess

In [66]:
# Drop the non-provincial entries (repatriated travellers and the two cruise
# ships). Because of SQL NULL semantics, NOT IN also drops any row whose
# jh_Province is NULL.
jh_df2 = jh_df1.filter(
    "jh_Province NOT IN ('Repatriated Travellers', 'Grand Princess', 'Diamond Princess')"
)

# List the distinct provinces that remain.
jh_df2.select("jh_Province").distinct().show()

+--------------------+
|         jh_Province|
+--------------------+
|            Manitoba|
|               Yukon|
|         Nova Scotia|
|Northwest Territo...|
|Newfoundland and ...|
|             Alberta|
|             Nunavut|
|       New Brunswick|
|        Saskatchewan|
|Prince Edward Island|
|             Ontario|
|    British Columbia|
|              Quebec|
+--------------------+



#### Left-join the PHI dataframe to the Johns Hopkins dataframe on matching date and province

In [68]:
# Left join: keep every PHI row and attach the Johns Hopkins row with the same
# date and province, if there is one.
# NOTE(review): phi_Date is a yyyy-MM-dd string while jh_Date is a date column,
# so this equality relies on Spark's implicit type coercion - confirm.
dfLeftOuterJoin = phi_df3.join(
    jh_df2,
    on=(
        (phi_df3["phi_Date"] == jh_df2["jh_Date"])
        & (phi_df3["phi_Province"] == jh_df2["jh_Province"])
    ),
    how="left",
)

#### Our final dataframe contains records for the number of confirmed cases, number of recovered cases and the number of deaths from COVID-19 virus in Canada to date

In [76]:
# Project the joined data onto the final column set and sort it by province,
# date and confirmed count. Date, Province and Confirmed come from the PHI
# side; Country, Latitude, Longitude, Recovered and Deaths come from the Johns
# Hopkins side and are therefore NULL for PHI rows without a match.
# NOTE(review): Deaths is taken from jh_Deaths although phi_Deaths is also
# available - confirm that this is the intended source.
df_Final = (
    dfLeftOuterJoin
    .select(
        dfLeftOuterJoin["phi_Date"].alias("Date"),
        dfLeftOuterJoin["jh_Country"].alias("Country"),
        dfLeftOuterJoin["phi_Province"].alias("Province"),
        dfLeftOuterJoin["jh_Lat"].alias("Latitude"),
        dfLeftOuterJoin["jh_Long"].alias("Longitude"),
        dfLeftOuterJoin["phi_Confirmed"].alias("Confirmed"),
        dfLeftOuterJoin["jh_Recovered"].alias("Recovered"),
        dfLeftOuterJoin["jh_Deaths"].alias("Deaths"),
    )
    .orderBy("Province", "Date", "Confirmed")
)

#### Write our dataframe in csv format in a new directory

In [77]:
# Write the final dataframe as CSV to the "output_path" directory, replacing
# any existing output there. No header option is set on the writer.
csv_writer = df_Final.write.format("csv").mode("overwrite")
csv_writer.save("output_path")