In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
str_c = 'cases'
str_d = 'deaths'
str_r = 'recovered'
str_dstrct = 'district'
str_date = 'date_reported'

# ETL pipeline

Get covid cases data in Germany. 
Data is taken from here: https://www.arcgis.com/home/item.html?id=f10774f1c63e40168479a1feb6c7ca74

In [3]:
# pull Covid-19 data from Robert-Koch-Institute webpage
df_rki = pd.read_csv('https://www.arcgis.com/sharing/rest/content/items/f10774f1c63e40168479a1feb6c7ca74/data')

In [4]:
# rename relevant columns to English
df_rki.rename(columns={'AnzahlFall': str_c,\
                            'AnzahlTodesfall':str_d,\
                            'AnzahlGenesen':str_r,\
                            'Landkreis':str_dstrct,\
                            'Meldedatum':str_date},\
                            inplace = True)

In [5]:
# convert date to datetime
df_rki[str_date] = pd.to_datetime(df_rki[str_date], format='%Y/%m/%d')

In [6]:
df_rki.head()

Unnamed: 0,FID,IdBundesland,Bundesland,district,Altersgruppe,Geschlecht,cases,deaths,date_reported,IdLandkreis,Datenstand,NeuerFall,NeuerTodesfall,Refdatum,NeuGenesen,recovered,IstErkrankungsbeginn,Altersgruppe2
0,12093090,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,0,2020-03-14,1001,"14.06.2020, 00:00 Uhr",0,-9,2020/03/16 00:00:00,0,1,1,Nicht übermittelt
1,12093091,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,0,2020-03-19,1001,"14.06.2020, 00:00 Uhr",0,-9,2020/03/13 00:00:00,0,1,1,Nicht übermittelt
2,12093092,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,0,2020-03-19,1001,"14.06.2020, 00:00 Uhr",0,-9,2020/03/16 00:00:00,0,1,1,Nicht übermittelt
3,12093093,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,0,2020-03-21,1001,"14.06.2020, 00:00 Uhr",0,-9,2020/03/13 00:00:00,0,1,1,Nicht übermittelt
4,12093094,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,0,2020-03-27,1001,"14.06.2020, 00:00 Uhr",0,-9,2020/03/22 00:00:00,0,1,1,Nicht übermittelt


## Stats overview

Compute basic stats for the first table based on the reporting scheme provided with the data set, as given at https://www.arcgis.com/home/item.html?id=f10774f1c63e40168479a1feb6c7ca74 (in German).

In [7]:
n_cases = df_rki.loc[df_rki['NeuerFall'].isin([0,1])][str_c].sum()
n_cases_new = \
        df_rki.loc[df_rki['NeuerFall'].isin([-1,1])][str_c].sum()
n_deaths = \
     df_rki.loc[df_rki['NeuerTodesfall'].isin([0,1])][str_d].sum()
n_deaths_new = \
     df_rki.loc[df_rki['NeuerTodesfall'].isin([-1,1])][str_d].sum()
n_recovered = \
        df_rki.loc[df_rki['NeuGenesen'].isin([0,1])][str_r].sum()
n_recovered_new = \
        df_rki.loc[df_rki['NeuGenesen'].isin([-1,1])][str_r].sum()
n_active = n_cases - n_deaths - n_recovered
n_active_new = n_cases_new - n_deaths_new - n_recovered_new
n_cases_new

247

In [8]:
# create data frame with total and new cases/deaths/recovered/active
df_stats = pd.DataFrame({' ':['Total','Today'],\
                        'Cases':[n_cases, n_cases_new],\
                        'Recovered':[n_recovered, n_recovered_new],\
                        'Deaths':[n_deaths, n_deaths_new],\
                        'Unresolved':[n_active, n_active_new]})
df_stats.set_index(' ', inplace=True)
display(df_stats)

Unnamed: 0,Cases,Recovered,Deaths,Unresolved
,,,,
Total,186269.0,172208.0,8787.0,5274.0
Today,247.0,276.0,6.0,-35.0


# Split data frames
For better data handling, we create new data frames for cases, deaths and recovered cases.

In [9]:
df_cases = df_rki.loc[df_rki['NeuerFall'].isin([0,1])].copy()
df_deaths = df_rki.loc[df_rki['NeuerTodesfall'].isin([0,1])].copy()
df_recovered = df_rki.loc[df_rki['NeuGenesen'].isin([0,1])].copy()

We aggregate the deaths by age group to inspect the distribution.

In [10]:
df_deaths_stats = pd.get_dummies(df_deaths['Altersgruppe']).sum().reset_index()
df_deaths_stats.rename(columns={'index':'Age',0:'Count'}, inplace=True)
df_deaths_stats

Unnamed: 0,Age,Count
0,A00-A04,1
1,A15-A34,18
2,A35-A59,390
3,A60-A79,2730
4,A80+,5017


In [11]:
# drop unnecessary columns
df_cases.drop(columns=\
        [str_d,'NeuerTodesfall',str_r,\
         'NeuGenesen','Altersgruppe2'],\
          inplace=True)
df_deaths.drop(columns=\
        [str_c,'NeuerFall','NeuGenesen',\
         str_r,'Altersgruppe2'], \
         inplace=True)
df_recovered.drop(columns=\
        [str_c,'NeuerFall',str_d,\
         'NeuerTodesfall','Altersgruppe2'], \
         inplace=True)

## Data cleaning:
- We'll later add population data to all districts. The data we pull is for all districts expect of the quaters of Berlin. We thus merge the Berlin data to view it as a single city.
- Converting the dates to proper datetime data types
- drop column `Altersgruppe2` since this is not published any more.
- drop column `Datenstand` as this is always the current date

In [12]:
# merge Berlin cases since we currently don't have population data for the
# individual districts
df_cases.loc[df_cases['IdLandkreis'].\
            isin(np.arange(11000,11013,1)),'IdLandkreis'] = 11000
df_cases.loc[df_cases['IdLandkreis'].\
            isin(np.arange(11000,11013,1)),str_dstrct] = 'SK Berlin'
df_cases.drop(columns = ['Datenstand'], inplace=True)
df_cases.head()

Unnamed: 0,FID,IdBundesland,Bundesland,district,Altersgruppe,Geschlecht,cases,date_reported,IdLandkreis,NeuerFall,Refdatum,IstErkrankungsbeginn
0,12093090,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-14,1001,0,2020/03/16 00:00:00,1
1,12093091,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-19,1001,0,2020/03/13 00:00:00,1
2,12093092,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-19,1001,0,2020/03/16 00:00:00,1
3,12093093,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-21,1001,0,2020/03/13 00:00:00,1
4,12093094,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-27,1001,0,2020/03/22 00:00:00,1


# Population Data
Next, we pull the population data from https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html.

In [13]:
# needs packae xlrd
df_population = pd.read_excel('./data/04-kreise.xlsx', sheet_name='Kreisfreie Städte u. Landkreise',skiprows=6, skipfooter=16)
df_population.columns = ['IdLandkreis', 'Bezeichnung','Name','NUTS3','area','pop_tot','pop_male','pop_female','pop_per_sqkm2']
df_population.dropna(axis=0, how='any', inplace=True)
# set integer IdLandkreis
df_population['IdLandkreis'] = df_population['IdLandkreis'].astype(int)
df_population.head(10)

Unnamed: 0,IdLandkreis,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2
1,1001,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
2,1002,Kreisfreie Stadt,"Kiel, Landeshauptstadt",DEF02,118.65,247548.0,120566.0,126982.0,2086.0
3,1003,Kreisfreie Stadt,"Lübeck, Hansestadt",DEF03,214.19,217198.0,104371.0,112827.0,1014.0
4,1004,Kreisfreie Stadt,"Neumünster, Stadt",DEF04,71.66,79487.0,39241.0,40246.0,1109.0
5,1051,Kreis,Dithmarschen,DEF05,1428.17,133210.0,65720.0,67490.0,93.0
6,1053,Kreis,Herzogtum Lauenburg,DEF06,1263.07,197264.0,96881.0,100383.0,156.0
7,1054,Kreis,Nordfriesland,DEF07,2083.55,165507.0,81099.0,84408.0,79.0
8,1055,Kreis,Ostholstein,DEF08,1393.02,200581.0,96765.0,103816.0,144.0
9,1056,Kreis,Pinneberg,DEF09,664.25,314391.0,154211.0,160180.0,473.0
10,1057,Kreis,Plön,DEF0A,1083.56,128647.0,62532.0,66115.0,119.0


# Rolling

We want to create a data frame that has the rolling sum of the cases in the past week. This is an important metric when deciding about necessary lockdown measures.

In [14]:
df_to_roll = df_cases.copy()[[str_date,str_c,'IdLandkreis']].\
        groupby([str_date,'IdLandkreis']).sum().reset_index()
df_to_roll.sort_values(by=str_date,inplace = True)
df_to_roll

Unnamed: 0,date_reported,IdLandkreis,cases
0,2020-01-28,9181,1
1,2020-01-28,9188,1
2,2020-01-29,9162,1
3,2020-01-29,9179,1
4,2020-01-31,9179,1
...,...,...,...
23286,2020-06-13,5754,17
23287,2020-06-13,5913,8
23288,2020-06-13,5914,9
23281,2020-06-13,5362,1


To compute sums over 7 day periods, we need to have one entry per day. Thus, we add zero rows for all districts and days that did not report any new cases.

In [15]:
# pad zero rows for days on which no new cases were reported
lkr_all = set(df_to_roll['IdLandkreis'].unique())
# sweep over all days in the data set
for date in pd.date_range(df_to_roll[str_date].min(), \
                          df_to_roll[str_date].max(), freq = '1D'):
    # add zero rows for all districts that didn't report cases on that day
    for id_lkr in lkr_all - \
        set(df_to_roll.loc[df_to_roll[str_date] == date ]['IdLandkreis']):
        df_to_roll = \
            df_to_roll.append({str_date:date, \
                               'IdLandkreis': id_lkr,\
                                str_c:0}, \
                                ignore_index=True)
    print('fixed date',date)

fixed date 2020-01-28 00:00:00
fixed date 2020-01-29 00:00:00
fixed date 2020-01-30 00:00:00
fixed date 2020-01-31 00:00:00
fixed date 2020-02-01 00:00:00
fixed date 2020-02-02 00:00:00
fixed date 2020-02-03 00:00:00
fixed date 2020-02-04 00:00:00
fixed date 2020-02-05 00:00:00
fixed date 2020-02-06 00:00:00
fixed date 2020-02-07 00:00:00
fixed date 2020-02-08 00:00:00
fixed date 2020-02-09 00:00:00
fixed date 2020-02-10 00:00:00
fixed date 2020-02-11 00:00:00
fixed date 2020-02-12 00:00:00
fixed date 2020-02-13 00:00:00
fixed date 2020-02-14 00:00:00
fixed date 2020-02-15 00:00:00
fixed date 2020-02-16 00:00:00
fixed date 2020-02-17 00:00:00
fixed date 2020-02-18 00:00:00
fixed date 2020-02-19 00:00:00
fixed date 2020-02-20 00:00:00
fixed date 2020-02-21 00:00:00
fixed date 2020-02-22 00:00:00
fixed date 2020-02-23 00:00:00
fixed date 2020-02-24 00:00:00
fixed date 2020-02-25 00:00:00
fixed date 2020-02-26 00:00:00
fixed date 2020-02-27 00:00:00
fixed date 2020-02-28 00:00:00
fixed da

Next, we sort by date and district to get the number of reported cases for each day of the pandemic.

In [16]:
df_to_roll.sort_values(by=str_date,inplace = True)
df_to_roll = df_to_roll.set_index(str_date).\
                groupby('IdLandkreis').rolling('7d').sum()
df_to_roll = df_to_roll.drop(columns = ['IdLandkreis']).reset_index()
df_to_roll

Unnamed: 0,IdLandkreis,date_reported,cases
0,1001,2020-01-28,0.0
1,1001,2020-01-29,0.0
2,1001,2020-01-30,0.0
3,1001,2020-01-31,0.0
4,1001,2020-02-01,0.0
...,...,...,...
55333,16077,2020-06-09,4.0
55334,16077,2020-06-10,4.0
55335,16077,2020-06-11,4.0
55336,16077,2020-06-12,4.0


Now we can merge this data with the population data

In [17]:
# merge data frame containing rolling sum with population data
df_cases_roll = pd.merge(df_to_roll, df_population,on='IdLandkreis')
df_cases_roll    

Unnamed: 0,IdLandkreis,date_reported,cases,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2
0,1001,2020-01-28,0.0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
1,1001,2020-01-29,0.0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
2,1001,2020-01-30,0.0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
3,1001,2020-01-31,0.0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
4,1001,2020-02-01,0.0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
...,...,...,...,...,...,...,...,...,...,...,...
55333,16077,2020-06-09,4.0,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55334,16077,2020-06-10,4.0,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55335,16077,2020-06-11,4.0,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55336,16077,2020-06-12,4.0,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0


Add a column for the normalized (per 100k inhabitants) cases

In [18]:
# add column for cases per 100k and compute normalized cases
df_cases_roll.insert(3,'AnzahlFall100k',0)
df_cases_roll['AnzahlFall100k'] = \
    df_cases_roll[str_c]/df_cases_roll['pop_tot']*(10**5)
df_cases_roll

Unnamed: 0,IdLandkreis,date_reported,cases,AnzahlFall100k,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2
0,1001,2020-01-28,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
1,1001,2020-01-29,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
2,1001,2020-01-30,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
3,1001,2020-01-31,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
4,1001,2020-02-01,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
...,...,...,...,...,...,...,...,...,...,...,...,...
55333,16077,2020-06-09,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55334,16077,2020-06-10,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55335,16077,2020-06-11,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0
55336,16077,2020-06-12,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0


Finally, we make sure that the district names are correct by pulling the original names based on the district id.

In [19]:
# set district ids to district names
df_cases_roll[str_dstrct] = df_cases_roll['IdLandkreis']
# make sure that the district name matches the original one from the cases
# data frame (this step is a bit slow, could likely be improved)
df_cases_roll[str_dstrct] = df_cases_roll[str_dstrct].apply(lambda x: \
        df_cases.loc[df_cases['IdLandkreis'] == x][str_dstrct].iloc[0])
df_cases_roll.rename(columns=\
        {"AnzahlFall":"7d_AnzahlFall",'AnzahlFall100k':'7d_AnzahlFall100k'})

Unnamed: 0,IdLandkreis,date_reported,cases,7d_AnzahlFall100k,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2,district
0,1001,2020-01-28,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
1,1001,2020-01-29,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
2,1001,2020-01-30,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
3,1001,2020-01-31,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
4,1001,2020-02-01,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55333,16077,2020-06-09,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55334,16077,2020-06-10,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55335,16077,2020-06-11,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55336,16077,2020-06-12,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land


# Long data format
Next, we transform the data frames for cases, deaths, and recovered into a long data format for better processing with Altair.

In [20]:
# Country
df_ctr_cases = df_cases.groupby([str_date])\
                            .sum()[[str_c]]
df_ctr_deaths = df_deaths.groupby([str_date])\
                            .sum()[[str_d]]
df_ctr_recovered = df_recovered.groupby([str_date])\
                            .sum()[[str_r]]
df_ctr_cases = pd.melt(df_ctr_cases.reset_index(), id_vars=[str_date],\
                                                       value_vars = [str_c],\
                                                       var_name = 'category',\
                                                       value_name = 'Number')
df_ctr_deaths = pd.melt(df_ctr_deaths.reset_index(), id_vars=[str_date],\
                                                       value_vars = [str_d],\
                                                       var_name = 'category',\
                                                       value_name = 'Number')
df_ctr_recovered = pd.melt(df_ctr_recovered.reset_index(), id_vars=[str_date],\
                                                       value_vars = [str_r],\
                                                       var_name = 'category',\
                                                       value_name = 'Number')
df_ctr = pd.concat([df_ctr_cases,df_ctr_deaths,df_ctr_recovered], axis = 0)
df_ctr['category'] = df_ctr['category']\
            .apply(lambda x: 'case' if x == str_c else\
                                  ('death' if x == str_d else 'recovered'))
df_ctr

Unnamed: 0,date_reported,category,Number
0,2020-01-28,case,2
1,2020-01-29,case,2
2,2020-01-31,case,3
3,2020-02-03,case,1
4,2020-02-04,case,4
...,...,...,...
117,2020-06-09,recovered,20
118,2020-06-10,recovered,17
119,2020-06-11,recovered,5
120,2020-06-12,recovered,7


In [21]:
# Country cumulative cases
df_ctr_cum = df_ctr.copy().sort_values(by=[str_date,'category'])
for el in list(df_ctr_cum['category'].unique()):
    df_ctr_cum.loc[df_ctr_cum['category']== el,'Number' ] = \
            np.cumsum(df_ctr_cum.loc[df_ctr_cum['category']== el,'Number' ])
df_ctr_cum    

Unnamed: 0,date_reported,category,Number
0,2020-01-28,case,2
0,2020-01-28,recovered,2
1,2020-01-29,case,4
1,2020-01-29,recovered,4
2,2020-01-31,case,7
...,...,...,...
120,2020-06-12,case,186122
97,2020-06-12,death,8787
120,2020-06-12,recovered,172206
121,2020-06-13,case,186269


### States

In [22]:
# states daily cases
df_sta = pd.concat([df_cases.groupby([str_date,'Bundesland']).sum().reset_index()\
                    [[str_date,'Bundesland',str_c]],\
                df_deaths.groupby([str_date,'Bundesland']).sum().reset_index()\
                                [[str_date,'Bundesland',str_d]],\
                df_recovered.groupby([str_date,'Bundesland']).sum().reset_index()\
                                [[str_date,'Bundesland',str_r]]])
df_sta = df_sta.fillna(0).groupby([str_date,'Bundesland']).sum().reset_index()
df_sta[[str_c,str_d,str_r]] = df_sta[[str_c,str_d,str_r]].astype('int64')
df_sta

Unnamed: 0,date_reported,Bundesland,cases,deaths,recovered
0,2020-01-28,Bayern,2,0,2
1,2020-01-29,Bayern,2,0,2
2,2020-01-31,Bayern,3,0,3
3,2020-02-03,Bayern,1,0,1
4,2020-02-04,Bayern,3,0,3
...,...,...,...,...,...
1606,2020-06-13,Niedersachsen,8,0,1
1607,2020-06-13,Nordrhein-Westfalen,80,0,0
1608,2020-06-13,Saarland,1,0,0
1609,2020-06-13,Sachsen-Anhalt,10,0,0


In [23]:
# states cumulative cases
df_sta_cum = df_sta.copy()
for state in list(df_sta['Bundesland'].unique()):
    for col in [str_c,str_d,str_r]:
        df_sta_cum.loc[df_sta_cum['Bundesland']==state,col] = \
            np.cumsum(df_sta_cum.loc[df_sta_cum['Bundesland']==state,col])
df_sta_cum        

Unnamed: 0,date_reported,Bundesland,cases,deaths,recovered
0,2020-01-28,Bayern,2,0,2
1,2020-01-29,Bayern,4,0,4
2,2020-01-31,Bayern,7,0,7
3,2020-02-03,Bayern,8,0,8
4,2020-02-04,Bayern,11,0,11
...,...,...,...,...,...
1606,2020-06-13,Niedersachsen,12874,614,11314
1607,2020-06-13,Nordrhein-Westfalen,39233,1645,36137
1608,2020-06-13,Saarland,2772,168,2553
1609,2020-06-13,Sachsen-Anhalt,1756,57,1639


### Districts

In [24]:
# districts daily
df_lkr = pd.concat([df_cases.groupby([str_date,str_dstrct]).sum().\
            reset_index()[[str_date,str_dstrct,str_c]],\
        df_deaths.groupby([str_date,str_dstrct]).sum().\
            reset_index()[[str_date,str_dstrct,str_d]],\
        df_recovered.groupby([str_date,str_dstrct]).sum().\
            reset_index()[[str_date,str_dstrct,str_r]]])
df_lkr = df_lkr.fillna(0).groupby([str_date,str_dstrct]).sum().reset_index()
df_lkr[[str_c,str_d,str_r]] = df_lkr[[str_c,str_d,str_r]].astype('int64')
df_lkr

Unnamed: 0,date_reported,district,cases,deaths,recovered
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-29,LK Fürstenfeldbruck,1,0,1
3,2020-01-29,SK München,1,0,1
4,2020-01-31,LK Fürstenfeldbruck,1,0,1
...,...,...,...,...,...
24217,2020-06-13,SK Hamm,3,0,0
24218,2020-06-13,SK Magdeburg,9,0,0
24219,2020-06-13,SK München,1,0,0
24220,2020-06-13,SK Münster,1,0,0


In [25]:
# districts cumulative
df_lkr_cum = df_lkr.copy()
for el in list(df_lkr[str_dstrct].unique()):
    for col in [str_c,str_d,str_r]:
        df_lkr_cum.loc[df_lkr_cum[str_dstrct]==el,col] = \
            np.cumsum(df_lkr_cum.loc[df_lkr_cum[str_dstrct]==el,col])
df_lkr_cum

Unnamed: 0,date_reported,district,cases,deaths,recovered
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-29,LK Fürstenfeldbruck,1,0,1
3,2020-01-29,SK München,1,0,1
4,2020-01-31,LK Fürstenfeldbruck,2,0,2
...,...,...,...,...,...
24217,2020-06-13,SK Hamm,481,36,437
24218,2020-06-13,SK Magdeburg,141,2,106
24219,2020-06-13,SK München,6575,212,6160
24220,2020-06-13,SK Münster,729,13,700


# Location Data

Merge reported cases with the location of the district so that
we can plot a heatmap of the number of cases on an actual map of Germany.
Data set is taken from
https://public.opendatasoft.com/explore/dataset/landkreise-in-germany/export/.

Pull the geographic data and perform basic cleaning.

In [26]:
geo_data = pd.read_csv('./data/landkreise-in-germany.csv', delimiter = ';',\
                           usecols=['Geo Point','Name 2','Cca 2', 'Type 2'])
# drop NaN row corresponding to a lake
geo_data.dropna(axis=0, inplace = True)
geo_data[['lat','lon']] = geo_data['Geo Point'].str.split(',', expand=True)
geo_data.drop(columns = 'Geo Point',inplace = True)
geo_data.rename(columns = \
    {'Name 2':'Name','Cca 2':'IdLandkreis','Type 2':'Type of District'}, \
    inplace =True)
geo_data

Unnamed: 0,Name,IdLandkreis,Type of District,lat,lon
0,Freiburg im Breisgau,8311.0,Stadtkreis,47.9925229956,7.81807596197
1,Dillingen an der Donau,9773.0,Landkreis,48.5964037974,10.527764168
2,Nürnberg,9564.0,Kreisfreie Stadt,49.4362114486,11.0827553426
3,Neumarkt in der Oberpfalz,9373.0,Landkreis,49.2159614099,11.5665579197
4,Rosenheim,9163.0,Kreisfreie Stadt,47.8443777181,12.1087247511
...,...,...,...,...,...
398,Meißen,14627.0,Landkreis,51.239397748,13.4829006825
399,Plön,1057.0,Kreis,54.2433885939,10.3636951573
400,Stormarn,1062.0,Kreis,53.7208005726,10.3316398811
401,Altenburger Land,16077.0,Landkreis,50.9564246614,12.3991313423


Merge case data and location data

In [27]:
df_cases_loc = pd.merge(df_cases, geo_data, \
            on='IdLandkreis')\
            [['IdLandkreis',str_date,str_c,'lat','lon']]
df_cases_loc

Unnamed: 0,IdLandkreis,date_reported,cases,lat,lon
0,1001,2020-03-14,1,54.7849933768,9.43852835486
1,1001,2020-03-19,1,54.7849933768,9.43852835486
2,1001,2020-03-19,1,54.7849933768,9.43852835486
3,1001,2020-03-21,1,54.7849933768,9.43852835486
4,1001,2020-03-27,1,54.7849933768,9.43852835486
...,...,...,...,...,...
149196,16077,2020-06-09,1,50.9564246614,12.3991313423
149197,16077,2020-05-06,1,50.9564246614,12.3991313423
149198,16077,2020-05-11,1,50.9564246614,12.3991313423
149199,16077,2020-05-28,2,50.9564246614,12.3991313423


Next, we merge this data with the population data so that we can normalize the cases per 100k inhabitants.

In [28]:
# normalize to cases per 100k inhabitants
df_cases_loc = pd.merge(df_cases_loc,df_population[['IdLandkreis','pop_tot']],on='IdLandkreis')
df_cases_loc[str_c] = df_cases_loc[str_c]/df_cases_loc['pop_tot']*10**5
df_cases_loc[str_c] = df_cases_loc[str_c].apply(np.round).astype(int)
df_cases_loc

Unnamed: 0,IdLandkreis,date_reported,cases,lat,lon,pop_tot
0,1001,2020-03-14,1,54.7849933768,9.43852835486,89504.0
1,1001,2020-03-19,1,54.7849933768,9.43852835486,89504.0
2,1001,2020-03-19,1,54.7849933768,9.43852835486,89504.0
3,1001,2020-03-21,1,54.7849933768,9.43852835486,89504.0
4,1001,2020-03-27,1,54.7849933768,9.43852835486,89504.0
...,...,...,...,...,...,...
149196,16077,2020-06-09,1,50.9564246614,12.3991313423,90118.0
149197,16077,2020-05-06,1,50.9564246614,12.3991313423,90118.0
149198,16077,2020-05-11,1,50.9564246614,12.3991313423,90118.0
149199,16077,2020-05-28,2,50.9564246614,12.3991313423,90118.0


When plotting the map, we only use the geographic coordinates of the reported
    case. Currently, each row contains information about how many cases were
    reported. Thus, we create a new row for every reported cases and copy the
    coordinates of the district.

In [29]:
df_cases_loc_long = df_cases_loc.loc[df_cases_loc[str_c] == 1]
for n_cases in sorted(df_cases_loc[str_c].unique())[1:]:
    for k in range(n_cases):
        df_cases_loc_long = \
                pd.concat([df_cases_loc_long, df_cases_loc.loc[df_cases_loc[str_c] == n_cases]])
df_cases_loc_long.drop(columns=str_c, inplace = True)
df_cases_loc_long

Unnamed: 0,IdLandkreis,date_reported,lat,lon,pop_tot
0,1001,2020-03-14,54.7849933768,9.43852835486,89504.0
1,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
2,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
3,1001,2020-03-21,54.7849933768,9.43852835486,89504.0
4,1001,2020-03-27,54.7849933768,9.43852835486,89504.0
...,...,...,...,...,...
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0


Finally, we create a data frame containing the locations of the cases in the last 7 days to plot on a map.

In [30]:
# data frame with only past week's cases
df_cases_7d = df_cases_loc_long.loc[(pd.Timestamp.today() - \
                                    df_cases_loc_long[str_date]).dt.days < 7]
df_cases_7d

Unnamed: 0,IdLandkreis,date_reported,lat,lon,pop_tot
31,1001,2020-06-10,54.7849933768,9.43852835486,89504.0
577,1051,2020-06-10,54.1329109614,9.10781447873,133210.0
578,1051,2020-06-12,54.1329109614,9.10781447873,133210.0
2406,1061,2020-06-08,53.9289451889,9.51938189615,131347.0
2419,1061,2020-06-11,53.9289451889,9.51938189615,131347.0
...,...,...,...,...,...
125108,9771,2020-06-10,48.4275701484,11.0527555565,133596.0
125108,9771,2020-06-10,48.4275701484,11.0527555565,133596.0
125108,9771,2020-06-10,48.4275701484,11.0527555565,133596.0
125108,9771,2020-06-10,48.4275701484,11.0527555565,133596.0


## Overview over created data frames
Below, all data frames used in the dashboard are displayed again

In [31]:
df_stats

Unnamed: 0,Cases,Recovered,Deaths,Unresolved
,,,,
Total,186269.0,172208.0,8787.0,5274.0
Today,247.0,276.0,6.0,-35.0


In [32]:
df_cases_roll

Unnamed: 0,IdLandkreis,date_reported,cases,AnzahlFall100k,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2,district
0,1001,2020-01-28,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
1,1001,2020-01-29,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
2,1001,2020-01-30,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
3,1001,2020-01-31,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
4,1001,2020-02-01,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55333,16077,2020-06-09,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55334,16077,2020-06-10,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55335,16077,2020-06-11,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
55336,16077,2020-06-12,4.0,4.438625,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land


In [33]:
df_ctr

Unnamed: 0,date_reported,category,Number
0,2020-01-28,case,2
1,2020-01-29,case,2
2,2020-01-31,case,3
3,2020-02-03,case,1
4,2020-02-04,case,4
...,...,...,...
117,2020-06-09,recovered,20
118,2020-06-10,recovered,17
119,2020-06-11,recovered,5
120,2020-06-12,recovered,7


In [34]:
df_ctr_cum.to_csv('df_ctr_cum.csv',index=False)
df_ctr_cum

Unnamed: 0,date_reported,category,Number
0,2020-01-28,case,2
0,2020-01-28,recovered,2
1,2020-01-29,case,4
1,2020-01-29,recovered,4
2,2020-01-31,case,7
...,...,...,...
120,2020-06-12,case,186122
97,2020-06-12,death,8787
120,2020-06-12,recovered,172206
121,2020-06-13,case,186269


In [35]:
df_sta

Unnamed: 0,date_reported,Bundesland,cases,deaths,recovered
0,2020-01-28,Bayern,2,0,2
1,2020-01-29,Bayern,2,0,2
2,2020-01-31,Bayern,3,0,3
3,2020-02-03,Bayern,1,0,1
4,2020-02-04,Bayern,3,0,3
...,...,...,...,...,...
1606,2020-06-13,Niedersachsen,8,0,1
1607,2020-06-13,Nordrhein-Westfalen,80,0,0
1608,2020-06-13,Saarland,1,0,0
1609,2020-06-13,Sachsen-Anhalt,10,0,0


In [36]:
df_sta_cum

Unnamed: 0,date_reported,Bundesland,cases,deaths,recovered
0,2020-01-28,Bayern,2,0,2
1,2020-01-29,Bayern,4,0,4
2,2020-01-31,Bayern,7,0,7
3,2020-02-03,Bayern,8,0,8
4,2020-02-04,Bayern,11,0,11
...,...,...,...,...,...
1606,2020-06-13,Niedersachsen,12874,614,11314
1607,2020-06-13,Nordrhein-Westfalen,39233,1645,36137
1608,2020-06-13,Saarland,2772,168,2553
1609,2020-06-13,Sachsen-Anhalt,1756,57,1639


In [37]:
df_lkr

Unnamed: 0,date_reported,district,cases,deaths,recovered
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-29,LK Fürstenfeldbruck,1,0,1
3,2020-01-29,SK München,1,0,1
4,2020-01-31,LK Fürstenfeldbruck,1,0,1
...,...,...,...,...,...
24217,2020-06-13,SK Hamm,3,0,0
24218,2020-06-13,SK Magdeburg,9,0,0
24219,2020-06-13,SK München,1,0,0
24220,2020-06-13,SK Münster,1,0,0


In [38]:
df_lkr_cum

Unnamed: 0,date_reported,district,cases,deaths,recovered
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-29,LK Fürstenfeldbruck,1,0,1
3,2020-01-29,SK München,1,0,1
4,2020-01-31,LK Fürstenfeldbruck,2,0,2
...,...,...,...,...,...
24217,2020-06-13,SK Hamm,481,36,437
24218,2020-06-13,SK Magdeburg,141,2,106
24219,2020-06-13,SK München,6575,212,6160
24220,2020-06-13,SK Münster,729,13,700


In [39]:
df_cases_loc_long

Unnamed: 0,IdLandkreis,date_reported,lat,lon,pop_tot
0,1001,2020-03-14,54.7849933768,9.43852835486,89504.0
1,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
2,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
3,1001,2020-03-21,54.7849933768,9.43852835486,89504.0
4,1001,2020-03-27,54.7849933768,9.43852835486,89504.0
...,...,...,...,...,...
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
72624,8136,2020-04-11,48.877525969,10.0901573582,314002.0
