In [3]:
from datetime import datetime, timezone
from dateutil import parser

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import requests

%matplotlib inline
plt.rcParams['figure.figsize'] = (16,10)

## Data Source

We will be using data from Isha Berry's [Covid19Canada repository](https://github.com/ishaberry/Covid19Canada). Most of what we want is packed into CSV files and we'll rely on the master branch to get up-to-date information. Before digging in to the data with pandas, take a quick look a the [codebook](https://github.com/ishaberry/Covid19Canada/blob/master/codebook.csv) and the [raw data](https://github.com/ishaberry/Covid19Canada/blob/master/cases.csv). The codebook helpfully lists the last midification time in the first line as part of the CSV. I'm not sure this has been standardized, but we can try to grab it flag how old our data is. At the moment it looks like 
```
'Last update: 02 April 2020, 22:00 EST',,
```

In [102]:
branchDir = 'https://raw.githubusercontent.com/ishaberry/Covid19Canada/master/'

codebookCSV  = branchDir + 'codebook.csv'

dataCSV = {
    'cases'       : branchDir + 'cases.csv',
    'mortality'   : branchDir + 'mortality.csv',
    'recovered_c' : branchDir + 'recovered_cumulative.csv',
    'testing_c'   : branchDir + 'testing_cumulative.csv'
}


In [98]:
# Grab the first row, read the date and massage it into datetime
dateDF = pd.read_csv(codebookCSV, names=['date', 'other1', 'other2'], nrows=1)
date = parser.parse(dateDF.iloc[0]['date'].replace('Last update: ', ''), tzinfos={'EST':'UTC-5'})
date

datetime.datetime(2020, 4, 2, 22, 0, tzinfo=tzstr('UTC-5'))

In [99]:
print(f"Data as of {date:%Y-%m-%d %H:%M} ({date - datetime.now().astimezone()} ago)")

Data as of 2020-04-02 22:00 (-1 day, 4:29:04.208528 ago)


### Codebook

The codebook gives descriptions of each of the columns found in the datasets, coding used in columns etc.

In [100]:
# N.B. We're grabbing the master branch here so be aware that the data will change as time passes
codebookDF = pd.read_csv(
    codebookCSV,
    skiprows=2,
)
codebookDF

Unnamed: 0,Variable,Description,Label
0,case_id,National Case ID Number,
1,provincial_case_id,Provincial Case ID Number,
2,age,"Age, if specific age not given then range prov...",
3,sex,Sex,
4,health_region,"Health region, if reported",
5,province,Province,
6,country,Country,
7,date_report,Reported date (i.e. public announcement date) ...,
8,report_week,Week of Report (Sundays are 1st day of week),
9,travel_yn,Travel history (yes/no),"0=no, 1=yes, Not Reported"


## Data

### Cases

In [104]:
casesDF = pd.read_csv(
    dataCSV['cases'],
    parse_dates = ['date_report', 'report_week'],
    dtype = {
        'travel_yn': pd.Int64Dtype(),
        'method_note': pd.Int64Dtype()
    },
    na_values = {
        'age': ['Not Reported'],
        'sex': ['Not Reported'],
        'travel_yn': ['Not Reported'],
        'health_region': ['Not Reported'],
        'province': ['Not Reported', 'Repatriated'],
        'country': ['Not Reported'],
        'travel_history_country': ['Not Reported'],
        'method_note': ['nan']
    }
)
casesDF.head()

# In some rows the case of `locally_acquired` is different
casesDF['locally_acquired'] = casesDF['locally_acquired'].str.title()

casesDF.head()

Unnamed: 0,case_id,provincial_case_id,age,sex,health_region,province,country,date_report,report_week,travel_yn,travel_history_country,locally_acquired,case_source,additional_info,additional_source,method_note
0,1,1,50-59,Male,Toronto,Ontario,Canada,2020-01-25,2020-01-19,1,China,,(1) https://news.ontario.ca/mohltc/en/2020/01/...,,,0.0
1,2,2,50-59,Female,Toronto,Ontario,Canada,2020-01-27,2020-01-26,1,China,,(1) https://news.ontario.ca/mohltc/en/2020/01/...,Travel and Close Contact,,0.0
2,3,1,40-49,Male,Vancouver Coastal,BC,Canada,2020-01-28,2020-01-26,1,China,,https://news.gov.bc.ca/releases/2020HLTH0015-0...,,,
3,4,3,20-29,Female,Middlesex-London,Ontario,Canada,2020-01-31,2020-01-26,1,China,,(1) https://news.ontario.ca/mohltc/en/2020/01/...,,,0.0
4,5,2,50-59,Female,Vancouver Coastal,BC,Canada,2020-04-02,2020-02-02,0,,Close Contact,https://news.gov.bc.ca/releases/2020HLTH0023-0...,The individual had close contact with family v...,,


### Mortality


In [105]:
mortalityDF = pd.read_csv(
    dataCSV['mortality'],
    parse_dates=['date_death_report'],
    dtype = {
        'travel_yn': pd.Int64Dtype(),
        'method_note': pd.Int64Dtype()
    },
    na_values = {
        'age': ['Not Reported'],
        'sex': ['Not Reported'],
        'travel_yn': ['Not Reported'],
        'health_region': ['Not Reported'],
        'province': ['Not Reported', 'Repatriated'],
        'country': ['Not Reported'],
        'travel_history_country': ['Not Reported'],
        'method_note': ['nan']
    }
)
mortalityDF.head()

Unnamed: 0,death_id,province_death_id,case_id,age,sex,health_region,province,country,date_death_report,death_source,additional_info,additional_source
0,1,1,60.0,80-89,Male,Vancouver Coastal,BC,Canada,2020-08-03,https://news.gov.bc.ca/releases/2020HLTH0068-0...,Lynn Valley Resident,
1,2,1,477.0,70-79,Male,Simcoe Muskoka,Ontario,Canada,2020-11-03,https://www.nationalobserver.com/2020/03/17/ne...,Was being treated at Royal Victoria Regional H...,
2,3,2,,,,Vancouver Coastal,BC,Canada,2020-03-16,https://news.gov.bc.ca/releases/2020HLTH0086-0...,Lynn Valley Resident,
3,4,3,,,,Vancouver Coastal,BC,Canada,2020-03-16,https://news.gov.bc.ca/releases/2020HLTH0086-0...,Lynn Valley Resident,
4,5,4,,,,Vancouver Coastal,BC,Canada,2020-03-16,https://news.gov.bc.ca/releases/2020HLTH0086-0...,Lynn Valley Resident,


### Recovered Cumulative


In [111]:
recoveredDF = pd.read_csv(
    dataCSV['recovered_c'],
    parse_dates=['date_recovered'],
    dtype = {
        'cumulative_recovered': pd.Int64Dtype(),
    }
)
recoveredDF.head()

Unnamed: 0,date_recovered,province,cumulative_recovered
0,2020-02-04,Alberta,174
1,2020-01-04,Alberta,142
2,2020-03-31,Alberta,120
3,2020-03-30,Alberta,94
4,2020-03-29,Alberta,73


### Testing Cumulative

In [132]:
testingDF = pd.read_csv(
    dataCSV['testing_c'],
    parse_dates = ['date_testing'],
)
testingDF.head()

Unnamed: 0,date_testing,province,cumulative_testing
0,2020-02-04,Alberta,57096
1,2020-01-04,Alberta,53141
2,2020-03-31,Alberta,48692
3,2020-03-30,Alberta,46057
4,2020-03-29,Alberta,44999
