# Data Wrangling COVID-19 Mini-Challenge

In [1]:
import pandas as pd

# Fetch the CSV from the openZH covid_19 GitHub repository and load it into a DataFrame.
source_url = 'https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv'
raw_data = pd.read_csv(source_url)

# Parse the 'date' column into real datetimes so that "latest" is decided by
# date comparison rather than by string comparison.
raw_data['date'] = pd.to_datetime(raw_data['date'])

# Drop the rows labelled 'FL' (presumably Liechtenstein, going by the column
# name — TODO confirm); everything that remains is treated as a Swiss canton.
swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']

# Keep only the rows that actually report a cumulative confirmed count.
# `.notna()` states the intent directly; the `.copy()` gives an independent
# frame that later cells can add columns to without touching `raw_data`.
conf_cases = swiss.loc[swiss['ncumul_conf'].notna()].copy()

# For every canton, look up the index label of its most recent report and
# select just those rows (one row per canton).
latest_report_index = conf_cases.groupby(by='abbreviation_canton_and_fl')['date'].idxmax()
cases_per_canton = conf_cases.loc[
    latest_report_index,
    ['date', 'abbreviation_canton_and_fl', 'ncumul_conf'],
]
print(cases_per_canton)

# All canton abbreviations (one entry per canton, taken from the
# latest-report table); reused by the plotting cell.
all_cantons = cases_per_canton['abbreviation_canton_and_fl'].copy()

# New cases = difference between a report's cumulative count and the same
# canton's previous report.
# Sorting by date first makes the result independent of the row order in the
# CSV; the stable sort keeps file order for reports that share a date.
chronological = conf_cases.sort_values('date', kind='stable')
cumulative_delta = chronological.groupby('abbreviation_canton_and_fl')['ncumul_conf'].diff()

# The first report of each canton has no predecessor, so its delta is NaN and
# is recorded as 0. Assignment aligns on the index labels, so the sorted
# order above does not reorder `conf_cases` itself.
# The int64 cast assumes counts are whole numbers, as in the printed sample.
conf_cases['new_cases'] = cumulative_delta.fillna(0).astype('int64')

new_cases = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases']]
print(new_cases)

           date abbreviation_canton_and_fl  ncumul_conf
9082 2021-02-25                         AG      36514.0
9112 2021-02-26                         AI        871.0
8232 2021-01-22                         AR       2867.0
9119 2021-02-27                         BE      53429.0
9114 2021-02-27                         BL      13568.0
9122 2021-02-27                         BS       9840.0
9094 2021-02-26                         FR      27147.0
9095 2021-02-26                         GE      47307.0
9096 2021-02-26                         GL       2199.0
9116 2021-02-27                         GR      10679.0
9071 2021-02-25                         JU       5599.0
9072 2021-02-25                         LU      21706.0
9073 2021-02-25                         NE      14029.0
9117 2021-02-27                         NW       1988.0
9103 2021-02-26                         OW       1878.0
9075 2021-02-25                         SG      34058.0
9109 2021-02-26                         SH      

In [2]:
from bokeh.palettes import turbo
from bokeh.plotting import figure, output_notebook, show

# Render bokeh output inline in the notebook.
output_notebook()

# One figure with a datetime x-axis; each canton is drawn as its own line.
p = figure(
    title="New COVID-19 cases per canton",
    y_axis_label='new cases',
    x_axis_type='datetime',
    sizing_mode='stretch_both',
)

# One palette colour per canton, paired with the cantons in order.
palette = turbo(all_cantons.size)
for canton, colour in zip(all_cantons, palette):
    canton_rows = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
    p.line(
        canton_rows['date'],
        canton_rows['new_cases'],
        line_color=colour,
        legend_label=canton,
        line_width=2,
    )

show(p)