## Canada COVID data processing

In [69]:
import pandas as pd
import os

# Raise pandas' display limits so long/wide frames are shown without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [70]:
# Destination of the cleaned Canadian table: <current working dir>/data/covid19canada.csv
data_dir = os.path.join(os.getcwd(), 'data')
path_canada = os.path.join(data_dir, 'covid19canada.csv')

# Read the live CSV directly from the health-infobase.canada.ca feed.
url = 'https://health-infobase.canada.ca/src/data/covidLive/covid19.csv'
response = pd.read_csv(filepath_or_buffer=url)

In [71]:
# Peek at the first five rows of the raw feed.
response.head(n=5)

Unnamed: 0,pruid,prname,prnameFR,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,percentrecover,ratetested,numtoday,percentoday,ratetotal,ratedeaths,deathstoday,percentdeath,testedtoday,recoveredtoday,percentactive,numactive,rateactive,numtotal_last14,ratetotal_last14,numdeaths_last14,ratedeaths_last14
0,35,Ontario,Ontario,31-01-2020,3,0,0.0,3,,,,,3,300.0,0.02,0.0,0.0,0.0,,,100.0,3.0,0.02,,,,
1,59,British Columbia,Colombie-Britannique,31-01-2020,1,0,0.0,1,,,,,1,100.0,0.02,0.0,0.0,0.0,,,100.0,1.0,0.02,,,,
2,1,Canada,Canada,31-01-2020,4,0,0.0,4,,,,,4,400.0,0.01,0.0,0.0,0.0,,,100.0,4.0,0.01,,,,
3,35,Ontario,Ontario,08-02-2020,3,0,0.0,3,,,,,0,0.0,0.02,0.0,0.0,0.0,,,100.0,3.0,0.02,,,,
4,59,British Columbia,Colombie-Britannique,08-02-2020,4,0,0.0,4,,,,,3,300.0,0.08,0.0,0.0,0.0,,,100.0,4.0,0.08,,,,


In [72]:
# Show the column labels of the downloaded table.
response.columns

Index(['pruid', 'prname', 'prnameFR', 'date', 'numconf', 'numprob',
       'numdeaths', 'numtotal', 'numtested', 'numrecover', 'percentrecover',
       'ratetested', 'numtoday', 'percentoday', 'ratetotal', 'ratedeaths',
       'deathstoday', 'percentdeath', 'testedtoday', 'recoveredtoday',
       'percentactive', 'numactive', 'rateactive', 'numtotal_last14',
       'ratetotal_last14', 'numdeaths_last14', 'ratedeaths_last14'],
      dtype='object')

- `pruid`: province id
- **`prname`: (English) province name
- `prnameFR`: (French) province name
- **`date`: date reported
- **`numconf`: number of confirmed cases
- **`numprob`: number of probable cases
- **`numdeaths`: number of deaths
- **`numtotal`: total # of confirmed and probable cases
- **`numtested`: number of people tested
- **`numrecover`: number of people recovered
- **`percentrecover`: numrecover / numtotal
- `ratetested`: 
- **`numtoday`: number of new cases relative to yesterday
- `percentoday`: percent change of new cases relative to yesterday
- `ratetotal`:
- `ratedeaths`:
- **`deathstoday`: number of deaths reported today
- `percentdeath`: 
- **`testedtoday`: number of people tested today
- `recoveredtoday`: number of people who have recovered today
- `percentactive`:

In [73]:
# Columns removed from the Canadian table: the province id, the French name,
# and the rate / percentage / 14-day-window fields.
unused_columns = [
    'pruid', 'prnameFR',
    'percentoday', 'ratetested', 'ratetotal', 'ratedeaths',
    'percentdeath', 'percentactive',
    'numtotal_last14', 'ratetotal_last14',
    'numdeaths_last14', 'ratedeaths_last14',
]
response = response.drop(unused_columns, axis='columns')

In [74]:
# The feed writes dates as DD-MM-YYYY (e.g. 31-01-2020). An explicit format makes
# parsing strict: `dayfirst=True` is only a preference, so an unexpected layout
# would be silently parsed with a different field order instead of raising.
response['date'] = pd.to_datetime(response['date'], format='%d-%m-%Y')

In [75]:
# Check the dtype of each remaining column (in particular that `date` is now a datetime).
response.dtypes

prname                    object
date              datetime64[ns]
numconf                    int64
numprob                    int64
numdeaths                float64
numtotal                   int64
numtested                float64
numrecover               float64
percentrecover           float64
numtoday                   int64
deathstoday              float64
testedtoday              float64
recoveredtoday           float64
numactive                float64
rateactive               float64
dtype: object

In [76]:
# Count the missing values in each column before imputation.
response.isna().sum()

prname              0
date                0
numconf             0
numprob             0
numdeaths         109
numtotal            0
numtested          57
numrecover        524
percentrecover    629
numtoday            0
deathstoday       109
testedtoday        57
recoveredtoday    524
numactive         288
rateactive        141
dtype: int64

In [77]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
response.describe(include='all')

Unnamed: 0,prname,date,numconf,numprob,numdeaths,numtotal,numtested,numrecover,percentrecover,numtoday,deathstoday,testedtoday,recoveredtoday,numactive,rateactive
count,2172,2172,2172.0,2172.0,2063.0,2172.0,2115.0,1648.0,1543.0,2172.0,2063.0,2115.0,1648.0,1884.0,2031.0
unique,15,157,,,,,,,,,,,,,
top,British Columbia,2020-06-14 00:00:00,,,,,,,,,,,,,
freq,157,15,,,,,,,,,,,,,
first,,2020-01-31 00:00:00,,,,,,,,,,,,,
last,,2020-07-29 00:00:00,,,,,,,,,,,,,
mean,,,8682.70442,2.833333,687.716917,8685.536832,207642.8,6763.832524,78.061393,106.31768,8.644207,3731.976359,127.869539,3343.380042,25.070931
std,,,21783.227624,28.165429,1837.205525,21784.461145,539136.0,15889.190015,26.318383,280.548132,27.13771,8696.081964,903.948171,8361.023347,63.179197
min,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-2.0,-20682.0,-5.0,0.0,0.0
25%,,,13.0,0.0,0.0,13.0,1397.0,13.0,66.135,0.0,0.0,11.0,0.0,3.0,0.1


In [78]:
# Order rows by province, then chronologically within each province, so that
# the per-province forward fill in the next cell walks forward in time.
response = response.sort_values(['prname', 'date'], ascending=True)

In [79]:
# Impute missing values
#
# Two kinds of columns need different treatment:
#   * running totals / ratios: carry the last reported value forward within each
#     province; anything before a province's first report becomes 0.
#   * per-day counts: a missing day is recorded as 0. Forward-filling these would
#     repeat the previous day's increment while the forward-filled running total
#     stays flat, so the two columns would contradict each other.
# The forward fill relies on rows being ordered by date within each province
# (done by the preceding sort cell).

provinces = response['prname'].value_counts().index  # distinct province labels
impute_cols = ['numdeaths', 'numtested', 'deathstoday',
               'testedtoday', 'recoveredtoday', 'numrecover', 'percentrecover']

cumulative_cols = ['numdeaths', 'numtested', 'numrecover', 'percentrecover']
daily_cols = ['deathstoday', 'testedtoday', 'recoveredtoday']

# One grouped forward fill replaces the province-by-column Python loop; the
# result keeps the original row index, so the assignment aligns row for row.
response[cumulative_cols] = (
    response.groupby('prname')[cumulative_cols].ffill().fillna(0)
)
response[daily_cols] = response[daily_cols].fillna(0)

In [80]:
# Re-count missing values to confirm the imputed columns are now complete.
response.isna().sum()

prname              0
date                0
numconf             0
numprob             0
numdeaths           0
numtotal            0
numtested           0
numrecover          0
percentrecover      0
numtoday            0
deathstoday         0
testedtoday         0
recoveredtoday      0
numactive         288
rateactive        141
dtype: int64

In [81]:
# Create the output folder if needed: to_csv does not create missing parent
# directories and would fail on a fresh checkout without a `data/` folder.
os.makedirs(os.path.dirname(path_canada), exist_ok=True)
response.to_csv(path_canada, index=False)

## World COVID data processing

In [82]:
# Destination of the cleaned world table: <current working dir>/data/covid19world.csv
data_dir = os.path.join(os.getcwd(), 'data')
path_world = os.path.join(data_dir, 'covid19world.csv')

# Read the CSV directly from the covid.ourworldindata.org feed and preview it.
url_world = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
response_world = pd.read_csv(filepath_or_buffer=url_world)
response_world.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,tests_units,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy
0,AFG,Asia,Afghanistan,2019-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
1,AFG,Asia,Afghanistan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
2,AFG,Asia,Afghanistan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
3,AFG,Asia,Afghanistan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83
4,AFG,Asia,Afghanistan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,0.0,38928341.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83


In [83]:
# Columns removed from the world table: smoothed test series, test units,
# stringency index, and the static demographic / health indicator fields.
unused_world_columns = [
    'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'tests_units',
    'stringency_index',
    'population', 'population_density', 'median_age',
    'aged_65_older', 'aged_70_older',
    'gdp_per_capita', 'extreme_poverty',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'handwashing_facilities', 'life_expectancy',
]
response_world = response_world.drop(unused_world_columns, axis='columns')

In [84]:
# Convert `date` to datetime, then order rows by location and by date within
# each location.
response_world = (
    response_world
    .assign(date=pd.to_datetime(response_world['date'], yearfirst=True))
    .sort_values(['location', 'date'])
)

In [85]:
# Check the dtype of each remaining column (in particular that `date` is now a datetime).
response_world.dtypes

iso_code                              object
continent                             object
location                              object
date                          datetime64[ns]
total_cases                          float64
new_cases                            float64
total_deaths                         float64
new_deaths                           float64
total_cases_per_million              float64
new_cases_per_million                float64
total_deaths_per_million             float64
new_deaths_per_million               float64
new_tests                            float64
total_tests                          float64
total_tests_per_thousand             float64
new_tests_per_thousand               float64
hospital_beds_per_thousand           float64
dtype: object

In [86]:
# Count the missing values in each column of the world table.
response_world.isna().sum()

iso_code                         64
continent                       277
location                          0
date                              0
total_cases                     355
new_cases                       355
total_deaths                    355
new_deaths                      355
total_cases_per_million         419
new_cases_per_million           419
total_deaths_per_million        419
new_deaths_per_million          419
new_tests                     22951
total_tests                   22723
total_tests_per_thousand      22723
new_tests_per_thousand        22951
hospital_beds_per_thousand     6109
dtype: int64

In [87]:
# Create the output folder if needed: to_csv does not create missing parent
# directories and would fail on a fresh checkout without a `data/` folder.
os.makedirs(os.path.dirname(path_world), exist_ok=True)
response_world.to_csv(path_world, index=False)