# Data Wrangling COVID-19 Mini-Challenge

## Imports/Setup

In [1]:
import pandas as pd
import wikidata_plain_sparql as wikidata

from bokeh.palettes import turbo
from bokeh.plotting import figure, output_notebook, show, gridplot

# set bokeh output mode to notebook
output_notebook()

from helper import get_jhu_cached, create_grid

## Worldwide COVID-19 Data

In [2]:
# every day from the earliest available report up to and including yesterday
first_report_day = pd.Timestamp('2020-01-22')
yesterday = pd.Timestamp('today').normalize() - pd.Timedelta(days=1)
all_dates = pd.date_range(start=first_report_day, end=yesterday)

# standardize column names for all entries
def rename_columns(column):
    """Return the unified name for one column header of a daily report.

    Headers with a known alternative spelling are mapped to a single
    canonical name; in every other header the characters '/', '-' and ' '
    are replaced by '_'.
    """
    aliases = {
        'Lat': 'Latitude',
        'Long_': 'Longitude',
        'Incidence_Rate': 'Incident_Rate'
    }
    canonical = aliases.get(column)
    if canonical is not None:
        return canonical
    return column.translate(str.maketrans('/- ', '___'))

# read the report of every day, unify its column names and tag each row
# with the day the report belongs to
all_data = [
    pd.read_csv(get_jhu_cached(day))
    .rename(columns=rename_columns)
    .assign(Date=day)
    for day in all_dates
]

# stack the daily reports into one frame (the per-file row index is kept as is)
raw_data = pd.concat(all_data)

In [3]:
# remove cruise ships from countries
exclude_countries = [
    'MS Zaandam',
    'Diamond Princess',
    'Cruise Ship'
]

# .copy() makes the filtered frame independent, so the column assignment
# further down does not write into a slice of the unfiltered data
raw_data = raw_data[~raw_data['Country_Region'].isin(exclude_countries)].copy()

# standardize country names for all entries
# Keys are regular expressions that have to match the WHOLE country name
# (they are anchored below), values are the unified names.
# NOTE(review): '.*Congo.*' folds every name containing "Congo" into a single
# entry - confirm that merging them is intended.
# 'Dominica' is deliberately not mapped: it is a different country than the
# Dominican Republic, and as an unanchored pattern the former entry also
# rewrote 'Dominican Republic' into 'Dominican Republicn Republic'.
country_mapping = {
    r'Hong Kong.+': 'Hong Kong',
    r'Iran.+': 'Iran',
    r'.*Congo.*': 'Congo',
    r'Mainland China': 'China',
    r'.*Bahamas.*': 'The Bahamas',
    r'.*Gambia.*': 'The Gambia',
    r'Viet Nam': 'Vietnam',
    r'Taiwan\*': 'Taiwan',
    r"Cote d'Ivoire": 'Ivory Coast',
    r'Cabo Verde': 'Cape Verde',
    r'Russian Federation': 'Russia',
    r' Azerbaijan': 'Azerbaijan',
    r'Holy See': 'Vatican City',
    r'Republic of Ireland': 'Ireland',
    r'Republic of Moldova': 'Moldova',
    r'Czechia': 'Czech Republic',
    r'Republic of Korea|Korea, South': 'South Korea',
    r'Timor-Leste': 'East Timor',
    r'Macao SAR|Macau': 'Macao',
    r'UK': 'United Kingdom',
    r'Jersey|Guernsey': 'Channel Islands'
}

# anchor every pattern so it can only replace a complete name and never a
# fragment inside a longer one; the group keeps '|' alternatives together
anchored_patterns = [f'^(?:{pattern})$' for pattern in country_mapping]

raw_data['Country_Region'] = raw_data['Country_Region'].replace(
    to_replace=anchored_patterns,
    value=list(country_mapping.values()),
    regex=True
)

In [4]:
# group data by country and day: all rows of a country on one day are summed up
updates_per_country = raw_data.groupby(['Country_Region', 'Date']).agg(
    Confirmed=('Confirmed', 'sum'),
    Deaths=('Deaths', 'sum'),
).reset_index()

# get all countries
all_countries = updates_per_country['Country_Region'].unique()
all_countries.sort()

# calculate difference between consecutive reports of the same country
# (rows are ordered by country and date after the groupby above, so diff()
# compares each day with the previous available day of that country;
# the first row of every country has no predecessor and stays NaN)
country_key = updates_per_country['Country_Region']
updates_per_country['New_Cases'] = updates_per_country.groupby(country_key)['Confirmed'].diff()

# running sum of the daily differences, i.e. the cases accumulated since the
# first report of the country - the count of that first report is not included
# NOTE(review): confirm this is the intended meaning of "total" for the
# worldwide data; it differs from the cumulative 'Confirmed' column.
updates_per_country['Total_Cases'] = updates_per_country.groupby(country_key)['New_Cases'].cumsum()
updates_per_country['New_Deaths'] = updates_per_country.groupby(country_key)['Deaths'].diff()

# drop the very first day of the data set (2020-01-22)
updates_per_country = updates_per_country[updates_per_country['Date'] >= '2020-01-23']

worldwide_pretty = updates_per_country.loc[:, ['Date', 'Country_Region', 'New_Cases', 'Total_Cases', 'New_Deaths']]

In [5]:
# plot the new cases per day, one colored line per country
new_cases_graph = figure(title="New COVID-19 cases per country", y_axis_label='new cases', x_axis_type='datetime', sizing_mode='stretch_width')
palette = turbo(all_countries.size)
for country, color in zip(all_countries, palette):
    country_rows = updates_per_country[updates_per_country['Country_Region'] == country]
    new_cases_graph.line(country_rows['Date'], country_rows['New_Cases'], line_color=color, legend_label=country, line_width=2)
show(new_cases_graph)

In [6]:
# plot the accumulated cases over time, one colored line per country
total_cases_graph = figure(title="Total COVID-19 cases per country", y_axis_label='total cases', x_axis_type='datetime', sizing_mode='stretch_width')
palette = turbo(all_countries.size)
for position, country in enumerate(all_countries):
    is_country = updates_per_country['Country_Region'] == country
    country_rows = updates_per_country[is_country]
    total_cases_graph.line(
        country_rows['Date'],
        country_rows['Total_Cases'],
        line_color=palette[position],
        legend_label=country,
        line_width=2,
    )
show(total_cases_graph)

## Swiss COVID-19 Data

In [7]:
# SPARQL query for WikiData: short code and population of every canton.
# The FILTER NOT EXISTS clause discards a population statement whenever the
# same canton has another one with a later date.
canton_population_query = '''
SELECT ?shortCode ?population ?canton WHERE {
  ?canton wdt:P31 wd:Q23058.
  ?canton wdt:P300 ?shortCode.
  OPTIONAL {
    ?canton p:P1082 ?population_stmt. 
    ?population_stmt ps:P1082 ?population.
    ?population_stmt pq:P585 ?population_date.
  }
  FILTER NOT EXISTS {
    ?canton p:P1082/pq:P585 ?population_date_.
    FILTER (?population_date_ > ?population_date)
  }
}
ORDER BY ?shortCode
'''

# run the query and index the result by short code for direct lookups
canton_data = wikidata.query(canton_population_query).set_index('shortCode')

In [8]:
# NOTE(review): this rebinds `raw_data`, which held the worldwide data up to
# this point; the name is kept for compatibility, but re-running the worldwide
# cells afterwards requires loading their data again.
raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')

# convert to date
raw_data['date'] = pd.to_datetime(raw_data['date'])

# remove FL
swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']

# keep 2020-05-31 as well: it serves as the baseline for the first difference,
# the rows before 1st of June are dropped again once the differences exist
swiss = swiss[swiss['date'] >= '2020-05-31']

# only keep entries that report a cumulative number of confirmed cases
# (notna() instead of unary minus, which is not supported on boolean data)
conf_cases = swiss.loc[swiss['ncumul_conf'].notna()].copy()

# get all cantons
all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
all_cantons.sort()

# population of every canton, looked up once per canton;
# canton_data is indexed by 'CH-' followed by the canton abbreviation
population_by_canton = {
    canton: int(canton_data.at['CH-' + canton, 'population'])
    for canton in all_cantons
}
canton_key = conf_cases['abbreviation_canton_and_fl']
population = canton_key.map(population_by_canton)

# difference to the previous entry of the same canton
# (NaN for the first entry of a canton, which has no predecessor)
case_diff = conf_cases.groupby(canton_key)['ncumul_conf'].diff()

# calculate new cases; as before, the first entry of a canton counts as 0
# here, while the other derived columns stay empty (NaN) for it
conf_cases['new_cases'] = case_diff.fillna(0).astype('int64')
conf_cases['new_cases_relative'] = case_diff / population * 100000

# running sum of the differences = cases accumulated since the baseline entry
conf_cases['total_cases'] = case_diff.groupby(canton_key).cumsum()
conf_cases['total_cases_relative'] = conf_cases['total_cases'] / population * 100000
conf_cases['new_deaths'] = conf_cases.groupby(canton_key)['ncumul_deceased'].diff()

# drop the baseline day, the reported range starts on 1st of June
conf_cases = conf_cases[conf_cases['date'] >= '2020-06-01']

swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths']]

### New COVID-19 cases per 100'000 residents

In [9]:
# upper bound shared by all charts so the cantons can be compared visually
max_new_cases = conf_cases['new_cases_relative'].max()


def _new_cases_chart(canton, y_max):
    """Build the chart of relative new cases for one canton.

    Draws the raw values and, in red, their mean over a rolling window of
    7 entries (entries, not necessarily calendar days).
    """
    canton_rows = conf_cases[conf_cases['abbreviation_canton_and_fl'] == canton]
    relative = canton_rows['new_cases_relative']
    chart = figure(title=canton, y_axis_label='new cases', y_range=[0, y_max], x_axis_type='datetime')
    chart.line(canton_rows['date'], relative, line_width=1)
    chart.line(canton_rows['date'], relative.rolling(window=7).mean(), line_color='red', line_width=1)
    return chart


# one chart per canton, arranged in a grid
graphs = [_new_cases_chart(canton, max_new_cases) for canton in all_cantons]

show(create_grid(graphs, sizing_mode='scale_width'))

### Total COVID-19 cases since 1st of June per 100'000 residents

In [10]:
# upper bound shared by all charts so the cantons can be compared visually
max_total_cases = conf_cases['total_cases_relative'].max()
canton_column = conf_cases['abbreviation_canton_and_fl']

# one chart per canton showing the accumulated relative cases
graphs = []
for canton in all_cantons:
    canton_rows = conf_cases[canton_column == canton]
    chart = figure(
        title=canton,
        y_axis_label='total cases',
        y_range=[0, max_total_cases],
        x_axis_type='datetime',
    )
    chart.line(canton_rows['date'], canton_rows['total_cases_relative'], line_width=1)
    graphs.append(chart)

show(create_grid(graphs, sizing_mode='scale_width'))

## Final Data Frames

In [11]:
# final worldwide frame: one row per country and day with the new cases,
# the running total of new cases and the new deaths
worldwide_pretty

Unnamed: 0,Date,Country_Region,New_Cases,Total_Cases,New_Deaths
0,2020-02-24,Afghanistan,,,
1,2020-02-25,Afghanistan,0.0,0.0,0.0
2,2020-02-26,Afghanistan,0.0,0.0,0.0
3,2020-02-27,Afghanistan,0.0,0.0,0.0
4,2020-02-28,Afghanistan,0.0,0.0,0.0
...,...,...,...,...,...
77326,2020-03-12,occupied Palestinian territory,0.0,-25.0,0.0
77327,2020-03-14,occupied Palestinian territory,0.0,-25.0,0.0
77328,2020-03-15,occupied Palestinian territory,0.0,-25.0,0.0
77329,2020-03-16,occupied Palestinian territory,0.0,-25.0,0.0


In [12]:
# final Swiss frame: one row per canton and reported day (from 2020-06-01 on)
# with the new cases, the running total of new cases and the new deaths
swiss_pretty

Unnamed: 0,date,abbreviation_canton_and_fl,new_cases,total_cases,new_deaths
2346,2020-06-01,BL,1,1.0,0.0
2347,2020-06-01,FR,0,0.0,0.0
2348,2020-06-01,GE,1,1.0,0.0
2349,2020-06-01,GR,0,0.0,0.0
2350,2020-06-01,JU,0,0.0,0.0
...,...,...,...,...,...
10504,2021-04-23,SZ,61,10557.0,0.0
10505,2021-04-23,TG,0,16911.0,0.0
10506,2021-04-23,BS,59,10659.0,0.0
10507,2021-04-23,AI,5,942.0,0.0
