# Data Wrangling COVID-19 Mini-Challenge

## Imports/Setup

In [1]:
# third-party
import pandas as pd

from bokeh.palettes import turbo
from bokeh.plotting import figure, output_notebook, show

# project-local helper that returns the location of a JHU daily report for a date
# NOTE(review): exact return type (path vs. file object) is not visible here — confirm in load_data
from load_data import get_jhu_cached

# configuration: render bokeh figures inline in the notebook
# (kept after all imports so every dependency is declared before any side effect runs)
output_notebook()

## Worldwide COVID-19 Data

In [3]:
# Dates to load: 2020-05-31 through yesterday.
# The range deliberately starts one day BEFORE the reporting window (June 1st 2020)
# so that a day-over-day difference can be computed for June 1st itself.
# Today is excluded; the original code dropped it as well.
yesterday = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)
all_dates = pd.date_range(start='2020-05-31', end=yesterday)

# load one daily report per date and tag its rows with the report date
all_data = [
    pd.read_csv(get_jhu_cached(date)).assign(Date=date)
    for date in all_dates
]

# combine data from all days
raw_data = pd.concat(all_data)

# one row per (country, date): sum the regional rows of each country.
# groupby sorts by its keys, so rows end up ordered by country, then by date.
updates_per_country = raw_data.groupby(['Country_Region', 'Date']).agg(
    Confirmed=('Confirmed', 'sum'),
    Deaths=('Deaths', 'sum'),
).reset_index()

# get all countries (sorted, used for plotting order and palette size)
all_countries = updates_per_country['Country_Region'].unique()
all_countries.sort()

# Day-over-day differences per country, vectorised instead of a per-row Python loop.
# The first row of each country has no predecessor, so its New_Cases / Total_Cases /
# New_Deaths are NaN -- exactly what the row-by-row version left unset.
country_key = updates_per_country['Country_Region']
updates_per_country['New_Cases'] = updates_per_country['Confirmed'].groupby(country_key).diff()
# running sum of the daily new cases, i.e. cases accumulated since the baseline day
updates_per_country['Total_Cases'] = updates_per_country['New_Cases'].groupby(country_key).cumsum()
updates_per_country['New_Deaths'] = updates_per_country['Deaths'].groupby(country_key).diff()

# drop the 2020-05-31 baseline rows; the reporting window starts on June 1st
updates_per_country = updates_per_country[updates_per_country['Date'] >= '2020-06-01']

# final, presentation-ready frame
worldwide_pretty = updates_per_country.loc[:, ['Date', 'Country_Region', 'New_Cases', 'Total_Cases', 'New_Deaths']]

In [4]:
# plot the daily new cases, one line per country
new_cases_graph = figure(
    title="New COVID-19 cases per country",
    y_axis_label='new cases',
    x_axis_type='datetime',
    sizing_mode='stretch_width',
)

# one palette colour per country, paired up in the (sorted) country order
palette = turbo(all_countries.size)
for line_color, country in zip(palette, all_countries):
    is_country = updates_per_country['Country_Region'] == country
    country_rows = updates_per_country.loc[is_country]
    new_cases_graph.line(
        country_rows['Date'],
        country_rows['New_Cases'],
        line_color=line_color,
        legend_label=country,
        line_width=2,
    )

show(new_cases_graph)

In [5]:
# plot the cases accumulated since June 1st, one line per country
total_cases_graph = figure(
    title="Total COVID-19 cases per country since 1st of june",
    y_axis_label='total cases',
    x_axis_type='datetime',
    sizing_mode='stretch_width',
)

# one palette colour per country, paired up in the (sorted) country order
palette = turbo(all_countries.size)
for line_color, country in zip(palette, all_countries):
    is_country = updates_per_country['Country_Region'] == country
    country_rows = updates_per_country.loc[is_country]
    total_cases_graph.line(
        country_rows['Date'],
        country_rows['Total_Cases'],
        line_color=line_color,
        legend_label=country,
        line_width=2,
    )

show(total_cases_graph)

## Swiss COVID-19 Data

In [6]:
# load the cumulative per-canton numbers published by openZH
raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')

# convert to date
raw_data['date'] = pd.to_datetime(raw_data['date'])

# remove FL
swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']

# Keep 2020-05-31 onwards: the reporting window starts on June 1st, and the day
# before is kept as the baseline for the first day-over-day difference.
swiss = swiss[swiss['date'] >= '2020-05-31']

# only keep rows that actually report a cumulative confirmed count
# (.notna() instead of unary minus on a boolean mask, which is not the documented way to negate it)
conf_cases = swiss.loc[swiss['ncumul_conf'].notna()].copy()

# get all cantons (sorted, used for plotting order and palette size)
all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
all_cantons.sort()

# Day-over-day differences per canton, vectorised instead of a per-row Python loop.
# The differences are computed on a chronologically sorted view so they do not depend
# on the row order of the CSV; results are assigned back by index label, so the row
# order of conf_cases itself is unchanged.
by_date = conf_cases.sort_values('date', kind='stable')
canton_key = by_date['abbreviation_canton_and_fl']
daily_new = by_date['ncumul_conf'].groupby(canton_key).diff()

# new_cases: the first kept row of each canton has no predecessor and is reported as 0
conf_cases['new_cases'] = daily_new.fillna(0).astype('int64')
# total_cases / new_deaths: NaN on the first kept row of each canton (no predecessor);
# new_deaths is also NaN wherever ncumul_deceased is missing on either side of the difference
conf_cases['total_cases'] = daily_new.groupby(canton_key).cumsum()
conf_cases['new_deaths'] = by_date['ncumul_deceased'].groupby(canton_key).diff()

# drop the 2020-05-31 baseline rows; the reporting window starts on June 1st
conf_cases = conf_cases[conf_cases['date'] >= '2020-06-01']

# final, presentation-ready frame
swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths']]

In [7]:
# plot the daily new cases, one line per canton
new_cases_graph = figure(
    title="New COVID-19 cases per canton",
    y_axis_label='new cases',
    x_axis_type='datetime',
    sizing_mode='stretch_width',
)

# one palette colour per canton, paired up in the (sorted) canton order
palette = turbo(all_cantons.size)
for line_color, canton in zip(palette, all_cantons):
    is_canton = conf_cases['abbreviation_canton_and_fl'] == canton
    canton_rows = conf_cases.loc[is_canton]
    new_cases_graph.line(
        canton_rows['date'],
        canton_rows['new_cases'],
        line_color=line_color,
        legend_label=canton,
        line_width=2,
    )

show(new_cases_graph)

In [8]:
# plot the cases accumulated since June 1st, one line per canton
total_cases_graph = figure(
    title="Total COVID-19 cases per canton since 1st of june",
    y_axis_label='total cases',
    x_axis_type='datetime',
    sizing_mode='stretch_width',
)

# one palette colour per canton, paired up in the (sorted) canton order
palette = turbo(all_cantons.size)
for line_color, canton in zip(palette, all_cantons):
    is_canton = conf_cases['abbreviation_canton_and_fl'] == canton
    canton_rows = conf_cases.loc[is_canton]
    total_cases_graph.line(
        canton_rows['date'],
        canton_rows['total_cases'],
        line_color=line_color,
        legend_label=canton,
        line_width=2,
    )

show(total_cases_graph)

## Final Data Frames

In [10]:
# final worldwide frame: Date, Country_Region, New_Cases, Total_Cases, New_Deaths
worldwide_pretty

Unnamed: 0,Date,Country_Region,New_Cases,Total_Cases,New_Deaths
1,2020-06-01,Afghanistan,545.0,545.0,8.0
2,2020-06-02,Afghanistan,759.0,1304.0,8.0
3,2020-06-03,Afghanistan,758.0,2062.0,24.0
4,2020-06-04,Afghanistan,787.0,2849.0,6.0
5,2020-06-05,Afghanistan,915.0,3764.0,9.0
...,...,...,...,...,...
52113,2021-02-25,Zimbabwe,34.0,35816.0,2.0
52114,2021-02-26,Zimbabwe,50.0,35866.0,5.0
52115,2021-02-27,Zimbabwe,14.0,35880.0,0.0
52116,2021-02-28,Zimbabwe,31.0,35911.0,0.0


In [11]:
# final Swiss frame: date, abbreviation_canton_and_fl, new_cases, total_cases, new_deaths
swiss_pretty

Unnamed: 0,date,abbreviation_canton_and_fl,new_cases,total_cases,new_deaths
2346,2020-06-01,BL,1,1.0,0.0
2348,2020-06-01,FR,0,0.0,0.0
2349,2020-06-01,GE,1,1.0,0.0
2350,2020-06-01,GR,0,0.0,0.0
2351,2020-06-01,JU,0,0.0,0.0
...,...,...,...,...,...
9175,2021-03-01,SH,14,3552.0,0.0
9176,2021-03-01,TG,0,14667.0,0.0
9177,2021-03-01,BS,20,8861.0,0.0
9178,2021-03-01,AI,3,849.0,0.0
