In [9]:
from datetime import datetime, timezone
from dateutil import parser

import pandas as pd

## Data Source - Covid19Canada

We will be using data from Isha Berry's [Covid19Canada repository](https://github.com/ishaberry/Covid19Canada). Most of what we want is packed into CSV files and we'll rely on the master branch to get up-to-date information. Before digging in to the data with pandas, take a quick look a the [codebook](https://github.com/ishaberry/Covid19Canada/blob/master/codebook.csv) and the [raw data](https://github.com/ishaberry/Covid19Canada/blob/master/cases.csv). The codebook helpfully lists the last modification time in the first line of the CodeBook CSV. I'm not sure this has been standardized, but we can try to grab it flag how old our data is. At the moment it looks like 
```
'Last update: 02 April 2020, 22:00 EST',,
```

In [2]:
repoURL = 'https://raw.githubusercontent.com/ishaberry/Covid19Canada/master/'

codebookCSV  = repoURL + 'codebook.csv'


# Grab the first row, read the date and massage it into datetime
codebookDF = pd.read_csv(
    repoURL + 'codebook.csv',
    names = [
        'date',
        'other1',
        'other2'],
    nrows = 1
)

date = parser.parse(
    codebookDF.iloc[0]['date'].replace('Last update: ', ''), tzinfos={'EST':'UTC-5'})

print(f"Data as of {date:%Y-%m-%d %H:%M} ({date - datetime.now().astimezone()} ago)")


Data as of 2020-04-02 22:00 (-4 days, 4:27:29.319678 ago)


In [3]:
codebookDF = pd.read_csv(
    repoURL + 'codebook.csv',
    skiprows=2,
)
codebookDF

Unnamed: 0,Variable,Description,Label
0,case_id,National Case ID Number,
1,provincial_case_id,Provincial Case ID Number,
2,age,"Age, if specific age not given then range prov...",
3,sex,Sex,
4,health_region,"Health region, if reported",
5,province,Province,
6,country,Country,
7,date_report,Reported date (i.e. public announcement date) ...,
8,report_week,Week of Report (Sundays are 1st day of week),
9,travel_yn,Travel history (yes/no),"0=no, 1=yes, Not Reported"


## Data

The data is stored in CSVs, the dictionary below grabs each CSV in turn with `pd.read_csv`. Keyword arguments to `read_csv` can be added under the `csv` key. For convenience we write out the resulting dataframes as parquet files, but for reliability it is best to grab the original data and apply the transformations to ensure you stay up to date.

In [40]:
can_data = {
    # cases.csv
    'cases'       :  {
        'url' : repoURL + 'cases.csv',
        'read_csv' : {
            # keys should be valid kwargs to read_csv
            'parse_dates' : ['date_report', 'report_week'],
            'dtype'       : {
                'travel_yn': pd.Int64Dtype(),
                'method_note': pd.Int64Dtype()
            },
            'na_values' : {
                'age': ['Not Reported'],
                'sex': ['Not Reported'],
                'travel_yn': ['Not Reported'],
                'health_region': ['Not Reported'],
                'province': ['Not Reported', 'Repatriated'],
                'country': ['Not Reported'],
                'travel_history_country': ['Not Reported'],
                'method_note': ['nan']
            },
            'converters' : {
                'locally_acquired': lambda x: x.title() 
            }
        }
    },
    

    # mortality.csv
    'mortality'       :  {
        'url' : repoURL + 'mortality.csv',
        'read_csv' : {
            # keys should be valid kwargs to read_csv
            'parse_dates' : ['date_death_report'],
            'dtype' : {
                'travel_yn': pd.Int64Dtype(),
                'method_note': pd.Int64Dtype()
            },
            'na_values' : {
                'age': ['Not Reported'],
                'sex': ['Not Reported'],
                'travel_yn': ['Not Reported'],
                'health_region': ['Not Reported'],
                'province': ['Not Reported', 'Repatriated'],
                'country': ['Not Reported'],
                'travel_history_country': ['Not Reported'],
                'method_note': ['nan']
            }
        }
    },
    
    
    # recovered_cumulative.csv
    'recovered' : {
        'url' : repoURL + 'recovered_cumulative.csv',
        'read_csv' : {
            # keys should be valid kwargs to read_csv
            'parse_dates' : ['date_recovered'],
            'dtype'       : {
                'cumulative_recovered': pd.Int64Dtype(),
            }
        }
    },
    
    
    # testing_cumulative.csv
    'testing' : {
        'url'         : repoURL + 'testing_cumulative.csv',
        'read_csv' : {
            # keys should be valid kwargs to read_csv
            'parse_dates' : ['date_testing'],
            'converters' : {
                'cumulative_testing': lambda x: pd.to_numeric(str(x).replace('*', ''), errors='coerce')
            },
            'na_values' : ['NA'],
        }
    }
}

Loop over the sources and build the dataframes

In [42]:
for source in can_data.keys():
    df = pd.read_csv(can_data[source]['url'], **can_data[source]['read_csv'])
    can_data[source]['df'] = df

## Data-Source ECDC

The data from ecdc.europa.eu is published on this page

https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide

Again, we want the latest data so we can use beautifulsoup to find the link we need


In [43]:
ecdc_data = {
    'cases' : {
        'url' : 'https://opendata.ecdc.europa.eu/covid19/casedistribution/csv',
        'read_csv' : {
            'parse_dates' : ['dateRep'],
            'dtype' : {
                'popData2018': pd.Int64Dtype()
            }
        }
    }   
}

In [44]:
for source in ecdc_data.keys():
    df = pd.read_csv(ecdc_data[source]['url'], **ecdc_data[source]['read_csv'])
    ecdc_data[source]['df'] = df

# Data Source - Johns Hopkins

Most of this comes from [github.com/CSSEGISandData/COVID-19](https://github.com/CSSEGISandData/COVID-19)


In [45]:
JHSRepoURL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'

jhs_data = {
    'cases' : {
        'url' : JHSRepoURL + 'time_series_covid19_confirmed_global.csv',
        'read_csv' : {}
    },
    'death' : {
        'url' : JHSRepoURL + 'time_series_covid19_deaths_global.csv',
        'read_csv' : {}
    },
    'recovered' : {
        'url' : JHSRepoURL + 'time_series_covid19_recovered_global.csv',
        'read_csv' : {}
    }
}

In [46]:
for source in jhs_data.keys():
    df = pd.read_csv(jhs_data[source]['url'], **jhs_data[source]['read_csv'])
    jhs_data[source]['df'] = df