## Canada COVID data processing

In [1]:
import pandas as pd
import os

# Raise the pandas display limits to 500 rows / 500 columns so the frames
# shown below are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
# Remote CSV served from health-infobase.canada.ca (live COVID-19 feed).
url = 'https://health-infobase.canada.ca/src/data/covidLive/covid19.csv'

# Local destination for the cleaned table: <current working dir>/data/covid19canada.csv
path_canada = os.path.join(os.getcwd(), 'data', 'covid19canada.csv')

# Download and parse the feed into a DataFrame.
response = pd.read_csv(url)

In [3]:
# Preview the first rows of the downloaded table.
response.head()

Unnamed: 0,pruid,prname,prnameFR,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,percentrecover,ratetested,numtoday,percentoday,ratetotal,ratedeaths,deathstoday,percentdeath,testedtoday,recoveredtoday,percentactive
0,35,Ontario,Ontario,31-01-2020,3,0,0.0,3,,,,,3,300.0,0.02,0.0,0.0,0.0,,,100.0
1,59,British Columbia,Colombie-Britannique,31-01-2020,1,0,0.0,1,,,,,1,100.0,0.02,0.0,0.0,0.0,,,100.0
2,1,Canada,Canada,31-01-2020,4,0,0.0,4,,,,,4,400.0,0.01,0.0,0.0,0.0,,,100.0
3,35,Ontario,Ontario,08-02-2020,3,0,0.0,3,,,,,0,0.0,0.02,0.0,0.0,0.0,,,100.0
4,59,British Columbia,Colombie-Britannique,08-02-2020,4,0,0.0,4,,,,,3,300.0,0.08,0.0,0.0,0.0,,,100.0


In [4]:
# List the column labels of the raw feed.
response.columns

Index(['pruid', 'prname', 'prnameFR', 'date', 'numconf', 'numprob',
       'numdeaths', 'numtotal', 'numtested', 'numrecover', 'percentrecover',
       'ratetested', 'numtoday', 'percentoday', 'ratetotal', 'ratedeaths',
       'deathstoday', 'percentdeath', 'testedtoday', 'recoveredtoday',
       'percentactive'],
      dtype='object')

Columns in **bold** are kept for the analysis; the others are dropped in the next cell.

- `pruid`: province id
- **`prname`**: (English) province name
- `prnameFR`: (French) province name
- **`date`**: date reported
- **`numconf`**: number of confirmed cases
- **`numprob`**: number of probable cases
- **`numdeaths`**: number of deaths
- **`numtotal`**: total number of confirmed and probable cases
- **`numtested`**: number of people tested
- **`numrecover`**: number of people recovered
- **`percentrecover`**: numrecover / numtotal
- `ratetested`:
- **`numtoday`**: number of new cases relative to yesterday
- `percentoday`: percent change of new cases relative to yesterday
- `ratetotal`:
- `ratedeaths`:
- **`deathstoday`**: number of deaths reported today
- `percentdeath`:
- **`testedtoday`**: number of people tested today
- `recoveredtoday`: number of people who have recovered today
- `percentactive`:

In [5]:
# Identifier, French-name, rate and percentage columns that the rest of the
# notebook does not use.
unused_columns = [
    'pruid', 'prnameFR', 'percentoday',
    'ratetested', 'ratetotal', 'ratedeaths',
    'percentdeath', 'percentactive',
    'recoveredtoday',
]
response = response.drop(unused_columns, axis='columns')

In [6]:
# The feed writes dates as DD-MM-YYYY (e.g. 31-01-2020). Parse with an explicit
# format rather than `dayfirst=True`: `dayfirst` is only a hint, so a value
# that cannot be day-first would be silently read month-first, whereas an
# explicit format raises if a value does not match.
response['date'] = pd.to_datetime(response['date'], format='%d-%m-%Y')

In [7]:
# Check the column dtypes after the date conversion.
response.dtypes

prname                    object
date              datetime64[ns]
numconf                    int64
numprob                    int64
numdeaths                float64
numtotal                   int64
numtested                float64
numrecover               float64
percentrecover           float64
numtoday                   int64
deathstoday              float64
testedtoday              float64
dtype: object

In [8]:
# Count missing values per column before imputation.
response.isnull().sum()

prname              0
date                0
numconf             0
numprob             0
numdeaths         102
numtotal            0
numtested          57
numrecover        524
percentrecover    622
numtoday            0
deathstoday       102
testedtoday        57
dtype: int64

In [9]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
response.describe(include='all')

Unnamed: 0,prname,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,percentrecover,numtoday,deathstoday,testedtoday
count,2067,2067,2067.0,2067.0,1965.0,2067.0,2010.0,1543.0,1445.0,2067.0,1965.0,2010.0
unique,15,150,,,,,,,,,,
top,Canada,2020-06-14 00:00:00,,,,,,,,,,
freq,150,15,,,,,,,,,,
first,,2020-01-31 00:00:00,,,,,,,,,,
last,,2020-07-22 00:00:00,,,,,,,,,,
mean,,,8351.334301,2.887276,658.64631,8354.22061,191934.3,6321.551523,77.27746,108.59313,9.027481,3641.570149
std,,,21140.999317,28.853258,1783.99863,21142.17343,496813.5,14703.710296,26.942562,286.015784,27.746805,8465.01153
min,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-2.0,-213.0
25%,,,11.0,0.0,0.0,13.0,1342.25,13.0,63.75,0.0,0.0,11.0


In [10]:
# Order rows by province, then chronologically within each province.
# The forward fill in the imputation step relies on this ordering.
response = response.sort_values(['prname', 'date'])

In [11]:
# Impute missing values per province: carry the last reported value forward,
# then set anything still missing (no earlier report for that province) to 0.
# Rows must already be ordered by date within each province for the forward
# fill to be meaningful.
# NOTE(review): 'deathstoday' and 'testedtoday' look like daily counts;
# forward-filling repeats the previous day's figure on days with no report.
# Confirm this is intended rather than filling those two columns with 0.
impute_cols = ['numdeaths', 'numtested', 'deathstoday',
               'testedtoday', 'numrecover', 'percentrecover']

# One grouped forward fill replaces a Python loop over every
# (province, column) pair; the result is aligned back on the row index.
response[impute_cols] = (
    response.groupby('prname')[impute_cols]
    .ffill()
    .fillna(0)
)

In [12]:
# Re-count missing values per column to confirm the imputation left none.
response.isnull().sum()

prname            0
date              0
numconf           0
numprob           0
numdeaths         0
numtotal          0
numtested         0
numrecover        0
percentrecover    0
numtoday          0
deathstoday       0
testedtoday       0
dtype: int64

In [13]:
# Write the cleaned table to disk without the row index.
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(path_canada), exist_ok=True)
response.to_csv(path_canada, index=False)