# Collecting and Filtering Canadian Covid-19 Metrics
* Covid-19 data sourced from [the Government of Canada](https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html)
* Daily numbers for British Columbia are filtered out and exported from here. This entire notebook will be reduced to a function.

In [1]:
import os
import sys

# sys.path entries are used verbatim by the import system: a leading "~" is
# NOT expanded, so the home-relative path has to be expanded explicitly or the
# scripts directory is never actually searched.
sys.path.insert(0, os.path.expanduser('~/data_bootcamp/data-science-final-project/scripts/'))

# NOTE(review): later cells use `pd` and `get_covid_data` without importing
# them, so they presumably arrive through this star import - confirm before
# narrowing it to explicit names.
from functions import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lclark/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [25]:
# Re-run this cell whenever bc_covid_data.sav should be refreshed.
# (Possible future work: automate the refresh and push the file to GitHub.)

df_bc_covid = get_covid_data()
df_bc_covid.to_pickle(path='~/data_bootcamp/data-science-final-project/data/bc_covid_data.sav')

In [11]:
# Manual version of the step above, kept for exploring the raw download.

df = pd.read_csv(filepath_or_buffer='https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv')

In [12]:
# Inspect the column names available in the downloaded frame.
df.columns

Index(['pruid', 'prname', 'prnameFR', 'date', 'numconf', 'numprob',
       'numdeaths', 'numtotal', 'numtested', 'numrecover', 'percentrecover',
       'ratetested', 'numtoday', 'percentoday', 'ratetotal', 'ratedeaths',
       'numdeathstoday', 'percentdeath', 'numtestedtoday', 'numrecoveredtoday',
       'percentactive', 'numactive', 'rateactive', 'numtotal_last14',
       'ratetotal_last14', 'numdeaths_last14', 'ratedeaths_last14',
       'numtotal_last7', 'ratetotal_last7', 'numdeaths_last7',
       'ratedeaths_last7', 'avgtotal_last7', 'avgincidence_last7',
       'avgdeaths_last7', 'avgratedeaths_last7'],
      dtype='object')

In [13]:
# Keep only the rows reported for British Columbia.
df_bc = df.loc[df['prname'] == 'British Columbia']

In [14]:
# Rank the BC rows from the highest to the lowest `numtoday` value.
df_bc.sort_values('numtoday', ascending=False)

Unnamed: 0,pruid,prname,prnameFR,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,...,numdeaths_last14,ratedeaths_last14,numtotal_last7,ratetotal_last7,numdeaths_last7,ratedeaths_last7,avgtotal_last7,avgincidence_last7,avgdeaths_last7,avgratedeaths_last7
3927,59,British Columbia,Colombie-Britannique,2020-11-24,28348,0,358.0,28348,754571.0,19605.0,...,74.0,1.46,4687.0,92.42,48.0,0.95,670.0,13.20,7.0,0.14
3492,59,British Columbia,Colombie-Britannique,2020-10-26,13371,0,259.0,13371,581804.0,10734.0,...,14.0,0.28,1684.0,33.21,6.0,0.12,241.0,4.74,1.0,0.02
3837,59,British Columbia,Colombie-Britannique,2020-11-18,24422,0,320.0,24422,714714.0,16914.0,...,47.0,0.93,4648.0,91.65,36.0,0.71,664.0,13.09,5.0,0.10
3822,59,British Columbia,Colombie-Britannique,2020-11-17,23661,0,310.0,23661,709169.0,16469.0,...,38.0,0.75,4422.0,87.20,26.0,0.51,632.0,12.46,4.0,0.07
3882,59,British Columbia,Colombie-Britannique,2020-11-21,26187,0,331.0,26187,735429.0,17477.0,...,55.0,1.08,4548.0,89.68,41.0,0.81,650.0,12.81,6.0,0.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,59,British Columbia,Colombie-Britannique,2020-05-03,2171,0,114.0,2171,81061.0,1376.0,...,36.0,0.71,223.0,4.40,14.0,0.28,32.0,0.63,2.0,0.04
3357,59,British Columbia,Colombie-Britannique,2020-10-17,11189,0,251.0,11189,543791.0,9387.0,...,13.0,0.26,1004.0,19.80,6.0,0.12,143.0,2.83,1.0,0.02
3372,59,British Columbia,Colombie-Britannique,2020-10-18,11189,0,251.0,11189,543791.0,9387.0,...,13.0,0.26,1004.0,19.80,6.0,0.12,143.0,2.83,1.0,0.02
1587,59,British Columbia,Colombie-Britannique,2020-06-21,2790,0,168.0,2790,152568.0,2444.0,...,1.0,0.02,81.0,1.60,0.0,0.00,12.0,0.23,0.0,0.00


In [15]:
# Both summary lines read from the same final row of the BC frame.
# NOTE(review): this relies on the download being in chronological order, as
# the original `.tail(1)` did - confirm if the source ordering ever changes.
latest_row = df_bc.tail(1)
print('Most recent date recorded:', latest_row['date'].values)
# Previously this printed only the label; include the actual count.
print('Number of cases announced today:', latest_row['numtoday'].values)
# Days with 0 reported cases - Would include beginning of pandemic when testing limited, as well as weekends and holidays when reports were not published.
df_bc[df_bc.numtoday == 0]

Most recent date recorded: ['2020-11-24']
Number of cases announced today 


Unnamed: 0,pruid,prname,prnameFR,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,...,numdeaths_last14,ratedeaths_last14,numtotal_last7,ratetotal_last7,numdeaths_last7,ratedeaths_last7,avgtotal_last7,avgincidence_last7,avgdeaths_last7,avgratedeaths_last7
13,59,British Columbia,Colombie-Britannique,2020-02-24,6,0,0.0,6,,,...,,,,,,,,,,
19,59,British Columbia,Colombie-Britannique,2020-02-26,7,0,0.0,7,,,...,,,,,,,,,,
22,59,British Columbia,Colombie-Britannique,2020-02-27,7,0,0.0,7,,,...,,,,,,,,,,
25,59,British Columbia,Colombie-Britannique,2020-02-29,7,0,0.0,7,,,...,,,,,,,,,,
40,59,British Columbia,Colombie-Britannique,2020-03-06,21,0,0.0,21,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3282,59,British Columbia,Colombie-Britannique,2020-10-12,10185,0,245.0,10185,501496.0,8502.0,...,12.0,0.24,446.0,8.79,3.0,0.06,64.0,1.26,0.0,0.01
3357,59,British Columbia,Colombie-Britannique,2020-10-17,11189,0,251.0,11189,543791.0,9387.0,...,13.0,0.26,1004.0,19.80,6.0,0.12,143.0,2.83,1.0,0.02
3372,59,British Columbia,Colombie-Britannique,2020-10-18,11189,0,251.0,11189,543791.0,9387.0,...,13.0,0.26,1004.0,19.80,6.0,0.12,143.0,2.83,1.0,0.02
3462,59,British Columbia,Colombie-Britannique,2020-10-24,12554,0,256.0,12554,581804.0,10247.0,...,11.0,0.22,1365.0,26.92,5.0,0.10,195.0,3.85,1.0,0.01


In [16]:
# Columns kept for the BC analysis; 'date' becomes the index below.
bc_columns = [
    'prname', 'date', 'numconf', 'numdeaths', 'numtotal',
    'numtoday', 'percentoday', 'ratetotal', 'ratedeaths',
    'numdeathstoday', 'numactive', 'numtotal_last14',
    'ratetotal_last14', 'numdeaths_last14', 'ratedeaths_last14',
    'numtotal_last7', 'ratetotal_last7', 'numdeaths_last7',
    'ratedeaths_last7', 'avgtotal_last7', 'avgincidence_last7',
    'avgdeaths_last7',
]

# Index by date, newest first. set_index() already replaces the previous row
# index, so the reset_index()/drop('index') round-trip and the extra ascending
# sort that preceded the descending one are unnecessary.
bc_df = df_bc[bc_columns].set_index('date').sort_index(ascending=False)


In [24]:
# This function lives in scripts/functions and has been imported in first cell

def get_covid_data(df_name='df_bc_covid', globe=True):
    """
    Download the Covid-19 CSV from `bc_cov19_url` and keep only British Columbia.

    Parameters
    ----------
    df_name : str, default 'df_bc_covid'
        Name of the module-level variable that receives the result when
        `globe` is true.
    globe : bool, default True
        When true, the result is also published as a global named `df_name`.
        When false, no global is created or modified.

    Returns
    -------
    pandas.DataFrame or None
        British Columbia rows indexed by `date`, with missing values replaced
        by 0 and the `pruid`, `prname` and `prnameFR` columns removed.
        None if the CSV could not be read (a message is printed instead of
        raising).
    """
    try:
        raw = pd.read_csv(bc_cov19_url)
    except Exception as exc:
        # Best-effort download: report the cause and signal failure with None.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print('Reading CSV from URL failed:', exc)
        return None

    bc = (
        raw[raw.prname == 'British Columbia']
        .set_index('date')
        .fillna(0)
        .drop(['pruid', 'prname', 'prnameFR'], axis=1)
    )

    # A `global` statement applies to the whole function even when written
    # inside an `if`, so it cannot be made conditional. Publish the result
    # explicitly instead, which also lets `df_name` choose the variable name.
    if globe:
        globals()[df_name] = bc
    return bc