In [13]:
# Dependencies
import io
import time
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import requests
# Remove the column display limit so wide DataFrames are shown with every
# column instead of being truncated in the middle.
pd.set_option('display.max_columns', None)

In [8]:
def titleCase(words):
    """Normalize the casing of a name.

    Strings of three characters or fewer are returned fully upper-cased.
    Longer strings are lower-cased and split on single spaces; every piece is
    capitalized except the word "and", which stays lower-case.  The pieces
    are joined back with single spaces, and empty pieces at the start
    (produced by leading spaces) are dropped.
    """
    if len(words) <= 3:
        return words.upper()

    pieces = [
        piece if piece == "and" else piece.capitalize()
        for piece in words.lower().split(" ")
    ]
    # Pieces never contain a space themselves, so stripping leading spaces
    # from the joined text removes exactly the separators that would come
    # before the first non-empty piece.
    return " ".join(pieces).lstrip(" ")

def us_date(x):
    """Convert a date string beginning with ``YYYY?MM?DD`` to ``MM/DD/YYYY``.

    The year, month and day are read from fixed character positions
    (0-3, 5-6 and 8-9), so any single-character separator works and whatever
    follows the day (time, UTC offset, ...) is ignored.  Spaces inside the
    extracted fields are removed from the result.

    Parameters
    ----------
    x : str
        Date or timestamp text such as ``"2020/01/25 12:00:00+00"``.

    Returns
    -------
    str
        The date formatted as ``MM/DD/YYYY``.
    """
    year = x[0:4]
    month = x[5:7]
    # The day is exactly two characters wide.  Taking a third would pull in
    # whatever follows it (for example the "T" of an ISO timestamp) and
    # corrupt the output.
    day = x[8:10]
    return "/".join((month, day, year)).replace(" ", "")

def fixProvince(value):
    """Return the full province name for ``value``.

    The input is first normalized with ``titleCase``.  If the normalized text
    is one of the known abbreviations or aliases it is expanded to the full
    name; otherwise the normalized text itself is returned.
    """
    full_names = {
        'BC': 'British Columbia',
        'NL': 'Newfoundland and Labrador',
        'NWT': 'Northwest Territories',
        'PEI': 'Prince Edward Island',
        'Repatriated': 'Repatriated Canada',
        'Repatriated Cdn': 'Repatriated Canada',
    }
    normalized = titleCase(value)
    return full_names.get(normalized, normalized)

In [9]:
canada_source_csv = "https://opendata.arcgis.com/datasets/3afa9ce11b8842cb889714611e6f3076_0.csv"

# Download the CSV export.  The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() stops here on an HTTP error so
# an error page is never parsed as if it were data.
canada_source_response = requests.get(canada_source_csv, timeout=60)
canada_source_response.raise_for_status()
canada_source_request = canada_source_response.content
canada_df = pd.read_csv(io.StringIO(canada_source_request.decode('utf-8')))

# Timestamp of this download; stored on every row and reused for the export
# file name.
currentTime = datetime.now()

# Rename the source columns to "<Metric> Total" / "<Metric> Daily" names and
# drop the OBJECTID column.
canada_df = canada_df.rename(columns = {
    'SummaryDate':'Source Date',
    'TotalCases' : 'Positive Total','DailyTotals': 'Positive Daily',
    'TotalRecovered' : 'Recovered Total','DailyRecovered' : 'Recovered Daily',
    'TotalDeaths' : 'Deaths Total','DailyDeaths' : 'Deaths Daily',
    'TotalTested' : 'Tests Total','DailyTested' : 'Tests Daily',
    'TotalActive' : 'Active Total','DailyActive' : 'Active Daily',
    'TotalHospitalized' : 'Hospitalized Total','DailyHospitalized' : 'Hospitalized Daily',
    'TotalICU' : 'ICU Total', 'DailyICU' : 'ICU Daily'
})
canada_df = canada_df.drop(columns=["OBJECTID"])

# Constant columns plus the normalized province name and the US-style date
# derived from the source timestamp text.
canada_df["Downloaded"] = currentTime
canada_df["Country"] = "Canada"
canada_df["Region"] = "North America"
canada_df["Province"] = canada_df["Province"].apply(fixProvince)
canada_df["Date"] = canada_df["Source Date"].apply(us_date)


string_columns = ["Province","Abbreviation","Country","Region"]

# Force the text columns to str and print their distinct values as a quick
# sanity check of the cleaning above.
for col in string_columns:
    canada_df[col] = canada_df[col].astype(str)
    print(canada_df[col].sort_values(ascending = True).unique())

print(canada_df.columns)
print(canada_df.dtypes)
canada_df.head()

['Alberta' 'British Columbia' 'Canada' 'Manitoba' 'New Brunswick'
 'Newfoundland and Labrador' 'Northwest Territories' 'Nova Scotia'
 'Nunavut' 'Ontario' 'Prince Edward Island' 'Quebec' 'Repatriated Canada'
 'Saskatchewan' 'Yukon']
['AB' 'BC' 'CA' 'MB' 'NB' 'NL' 'NS' 'NT' 'NU' 'ON' 'PE' 'QC' 'RC' 'SK'
 'YT']
['Canada']
['North America']
Index(['Province', 'Abbreviation', 'Positive Daily', 'Source Date',
       'Positive Total', 'Recovered Total', 'Recovered Daily', 'Deaths Total',
       'Deaths Daily', 'Tests Total', 'Tests Daily', 'Active Total',
       'Active Daily', 'Hospitalized Total', 'Hospitalized Daily', 'ICU Total',
       'ICU Daily', 'Downloaded', 'Country', 'Region', 'Date'],
      dtype='object')
Province                      object
Abbreviation                  object
Positive Daily                 int64
Source Date                   object
Positive Total                 int64
Recovered Total                int64
Recovered Daily                int64
Deaths Total        

Unnamed: 0,Province,Abbreviation,Positive Daily,Source Date,Positive Total,Recovered Total,Recovered Daily,Deaths Total,Deaths Daily,Tests Total,Tests Daily,Active Total,Active Daily,Hospitalized Total,Hospitalized Daily,ICU Total,ICU Daily,Downloaded,Country,Region,Date
0,Alberta,AB,0,2020/01/25 12:00:00+00,0,0,0,0,0,0,0,0,0.0,,,,,2020-10-21 08:35:35.045368,Canada,North America,01/25/2020
1,Northwest Territories,NT,0,2020/01/25 12:00:00+00,0,0,0,0,0,0,0,0,0.0,,,,,2020-10-21 08:35:35.045368,Canada,North America,01/25/2020
2,Yukon,YT,0,2020/01/25 12:00:00+00,0,0,0,0,0,0,0,0,0.0,,,,,2020-10-21 08:35:35.045368,Canada,North America,01/25/2020
3,Saskatchewan,SK,0,2020/01/25 12:00:00+00,0,0,0,0,0,0,0,0,0.0,,,,,2020-10-21 08:35:35.045368,Canada,North America,01/25/2020
4,Prince Edward Island,PE,0,2020/01/25 12:00:00+00,0,0,0,0,0,0,0,0,0.0,,,,,2020-10-21 08:35:35.045368,Canada,North America,01/25/2020


In [11]:
# Columns kept in r_canada_df, in the order they appear in the exported sheet.
r_column_order = [
    "Date", "Country", "Province",
    "Positive Total",
    "Deaths Total", "Deaths Daily",
    "Positive Daily",
    "Tests Daily", "Tests Total",
]

# Order the rows by province, then by source timestamp, before taking the
# column subset.
canada_df = canada_df.sort_values(by=["Province", "Source Date"])
r_canada_df = canada_df.loc[:, r_column_order]
r_canada_df.head()

Unnamed: 0,Date,Country,Province,Positive Total,Deaths Total,Deaths Daily,Positive Daily,Tests Daily,Tests Total
0,01/25/2020,Canada,Alberta,0,0,0,0,0,0
24,01/26/2020,Canada,Alberta,0,0,0,0,0,0
35,01/27/2020,Canada,Alberta,0,0,0,0,0,0
53,01/28/2020,Canada,Alberta,0,0,0,0,0,0
68,01/29/2020,Canada,Alberta,0,0,0,0,0,0


In [17]:
# Digits-only stamp (YYYYMMDDHHMMSS followed by six microsecond digits) built
# from the download time; strftime keeps it the same width on every run.
filestamp = currentTime.strftime("%Y%m%d%H%M%S%f")

# Write into the current user's Downloads folder instead of an absolute path
# that only exists on one machine; create the folder if it is missing.
output_dir = Path.home() / "Downloads"
output_dir.mkdir(parents=True, exist_ok=True)
r_canada_df.to_excel(str(output_dir / ("canada_" + filestamp + ".xlsx")), index = False)