In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ETL pipeline

Get covid cases data in germany. 
Data is taken from here: https://www.arcgis.com/home/item.html?id=f10774f1c63e40168479a1feb6c7ca74

In [2]:
df_rki = pd.read_csv('https://www.arcgis.com/sharing/rest/content/items/f10774f1c63e40168479a1feb6c7ca74/data')
#df_rki = pd.read_csv('data/RKI_COVID19-2.csv')
df_rki['Meldedatum'] = pd.to_datetime(df_rki['Meldedatum'], format='%Y/%m/%d')

## Stats overview

In [3]:
n_cases = df_rki.loc[df_rki['NeuerFall'].isin([0,1])]['AnzahlFall'].sum()
n_cases_new = df_rki.loc[df_rki['NeuerFall'].isin([-1,1])]['AnzahlFall'].sum()
n_deaths = df_rki.loc[df_rki['NeuerTodesfall'].isin([0,1])]['AnzahlTodesfall'].sum()
n_deaths_new = df_rki.loc[df_rki['NeuerTodesfall'].isin([-1,1])]['AnzahlTodesfall'].sum()
n_recovered = df_rki.loc[df_rki['NeuGenesen'].isin([0,1])]['AnzahlGenesen'].sum()
n_recovered_new = df_rki.loc[df_rki['NeuGenesen'].isin([-1,1])]['AnzahlGenesen'].sum()
n_active = n_cases - n_deaths - n_recovered
n_active_new = n_cases_new - n_deaths_new - n_recovered_new

df_stats = pd.DataFrame({' ':['Total','Today'],\
                    'Cases':[n_cases, n_cases_new],\
                    'Recovered':[n_recovered, n_recovered_new],\
                    'Deaths':[n_deaths, n_deaths_new],\
                    'Active':[n_active, n_active_new]})
df_stats.set_index(' ', inplace=True)
n_cases_new

745

In [4]:
df_cases = df_rki.loc[df_rki['NeuerFall'].isin([0,1])].copy()
df_deaths = df_rki.loc[df_rki['NeuerTodesfall'].isin([0,1])].copy()
df_recovered = df_rki.loc[df_rki['NeuGenesen'].isin([0,1])].copy()

In [5]:
df_deaths_stats = pd.get_dummies(df_deaths['Altersgruppe']).sum().reset_index()
df_deaths_stats.rename(columns={'index':'Age',0:'Count'}, inplace=True)
df_deaths_stats

Unnamed: 0,Age,Count
0,A00-A04,1
1,A15-A34,16
2,A35-A59,346
3,A60-A79,2516
4,A80+,4697


In [6]:
df_cases.drop(columns=['AnzahlTodesfall','NeuerTodesfall','AnzahlGenesen','NeuGenesen','Altersgruppe2'], inplace=True)
df_deaths.drop(columns=['AnzahlFall','NeuerFall','NeuGenesen','AnzahlGenesen','Altersgruppe2'], inplace=True)
df_recovered.drop(columns=['AnzahlFall','NeuerFall','AnzahlTodesfall','NeuerTodesfall','Altersgruppe2'], inplace=True)

Data cleaining:
- We'll later add population data to all districts. The data we pull is for all districts expect of the quaters of Berlin. We thus merge the Berlin data to view it as a single city.
- Converting the dates to proper datetime data types
- drop column `Altersgruppe2` since this is not published any more.
- drop column `Datenstand` as this is always the current date

In [7]:
# merge Berlin cases since we currently don't have population data for the individual districts
df_cases.loc[df_cases['IdLandkreis'].isin(np.arange(11000,11013,1)),'IdLandkreis'] = 11000
df_cases.loc[df_cases['IdLandkreis'].isin(np.arange(11000,11013,1)),'Landkreis'] = 'SK Berlin'
# convert columns to datetimes
#df_cases['Meldedatum'] = df_cases['Meldedatum'].apply(pd.to_datetime)
#df_cases['Datenstand'] = df_cases['Datenstand'].apply(lambda x: pd.to_datetime(x.split(',')[0]))
#df_cases['Refdatum'] = df_cases['Refdatum'].apply(pd.to_datetime)
# drop irrelevant columns
df_cases.drop(columns = ['Datenstand'], inplace=True)
df_cases.head()

Unnamed: 0,FID,IdBundesland,Bundesland,Landkreis,Altersgruppe,Geschlecht,AnzahlFall,Meldedatum,IdLandkreis,NeuerFall,Refdatum,IstErkrankungsbeginn
0,8249440,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-14,1001,0,2020/03/16 00:00:00,1
1,8249441,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-19,1001,0,2020/03/13 00:00:00,1
2,8249442,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-19,1001,0,2020/03/16 00:00:00,1
3,8249443,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-21,1001,0,2020/03/13 00:00:00,1
4,8249444,1,Schleswig-Holstein,SK Flensburg,A15-A34,M,1,2020-03-27,1001,0,2020/03/22 00:00:00,1


In [8]:
# one-hot encode age groups
df_cases = pd.concat([df_cases,pd.get_dummies(df_cases['Altersgruppe'], prefix='age')],axis=1).drop(columns='Altersgruppe')
df_cases.head()

Unnamed: 0,FID,IdBundesland,Bundesland,Landkreis,Geschlecht,AnzahlFall,Meldedatum,IdLandkreis,NeuerFall,Refdatum,IstErkrankungsbeginn,age_A00-A04,age_A05-A14,age_A15-A34,age_A35-A59,age_A60-A79,age_A80+,age_unbekannt
0,8249440,1,Schleswig-Holstein,SK Flensburg,M,1,2020-03-14,1001,0,2020/03/16 00:00:00,1,0,0,1,0,0,0,0
1,8249441,1,Schleswig-Holstein,SK Flensburg,M,1,2020-03-19,1001,0,2020/03/13 00:00:00,1,0,0,1,0,0,0,0
2,8249442,1,Schleswig-Holstein,SK Flensburg,M,1,2020-03-19,1001,0,2020/03/16 00:00:00,1,0,0,1,0,0,0,0
3,8249443,1,Schleswig-Holstein,SK Flensburg,M,1,2020-03-21,1001,0,2020/03/13 00:00:00,1,0,0,1,0,0,0,0
4,8249444,1,Schleswig-Holstein,SK Flensburg,M,1,2020-03-27,1001,0,2020/03/22 00:00:00,1,0,0,1,0,0,0,0


Next, we pull the population data

Data on inhabitants:
https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html
could be correct one

In [9]:
# needs packae xlrd
df_population = pd.read_excel('./data/04-kreise.xlsx', sheet_name='Kreisfreie Städte u. Landkreise',skiprows=6, skipfooter=16)
df_population.columns = ['IdLandkreis', 'Bezeichnung','Name','NUTS3','area','pop_tot','pop_male','pop_female','pop_per_sqkm2']
df_population.dropna(axis=0, how='any', inplace=True)
# set integer IdLandkreis
df_population['IdLandkreis'] = df_population['IdLandkreis'].astype(int)
df_population.head(10)

Unnamed: 0,IdLandkreis,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2
1,1001,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0
2,1002,Kreisfreie Stadt,"Kiel, Landeshauptstadt",DEF02,118.65,247548.0,120566.0,126982.0,2086.0
3,1003,Kreisfreie Stadt,"Lübeck, Hansestadt",DEF03,214.19,217198.0,104371.0,112827.0,1014.0
4,1004,Kreisfreie Stadt,"Neumünster, Stadt",DEF04,71.66,79487.0,39241.0,40246.0,1109.0
5,1051,Kreis,Dithmarschen,DEF05,1428.17,133210.0,65720.0,67490.0,93.0
6,1053,Kreis,Herzogtum Lauenburg,DEF06,1263.07,197264.0,96881.0,100383.0,156.0
7,1054,Kreis,Nordfriesland,DEF07,2083.55,165507.0,81099.0,84408.0,79.0
8,1055,Kreis,Ostholstein,DEF08,1393.02,200581.0,96765.0,103816.0,144.0
9,1056,Kreis,Pinneberg,DEF09,664.25,314391.0,154211.0,160180.0,473.0
10,1057,Kreis,Plön,DEF0A,1083.56,128647.0,62532.0,66115.0,119.0


# Rolling

Create a data frame that has the rolling sum of the cases in the past week

In [10]:
df_cases.copy()[['Meldedatum','AnzahlFall','IdLandkreis']].groupby(['Meldedatum','IdLandkreis']).sum().reset_index()

Unnamed: 0,Meldedatum,IdLandkreis,AnzahlFall
0,2020-01-28,5513,2
1,2020-01-28,9181,1
2,2020-01-28,9188,1
3,2020-01-29,9162,1
4,2020-01-29,9179,1
...,...,...,...
20399,2020-05-20,16067,1
20400,2020-05-20,16069,2
20401,2020-05-20,16070,1
20402,2020-05-20,16076,2


In [11]:
#df_cases.groupby('IdLandkreis').rolling('7d').sum()
df_to_roll = df_cases.copy()[['Meldedatum','AnzahlFall','IdLandkreis','Landkreis']].groupby(['Meldedatum','IdLandkreis','Landkreis']).sum().reset_index()
df_to_roll.sort_values(by='Meldedatum',inplace = True)
df_to_roll


Unnamed: 0,Meldedatum,IdLandkreis,Landkreis,AnzahlFall
0,2020-01-28,5513,SK Gelsenkirchen,2
1,2020-01-28,9181,LK Landsberg a.Lech,1
2,2020-01-28,9188,LK Starnberg,1
3,2020-01-29,9162,SK München,1
4,2020-01-29,9179,LK Fürstenfeldbruck,1
...,...,...,...,...
20319,2020-05-20,5562,LK Recklinghausen,1
20318,2020-05-20,5558,LK Coesfeld,3
20317,2020-05-20,5554,LK Borken,1
20329,2020-05-20,6433,LK Groß-Gerau,5


In [12]:
#lkr_all - set(df_to_roll.loc[df_to_roll['Meldedatum'] == date ]['IdLandkreis'])

In [13]:
#pd.date_range(df_to_roll['Meldedatum'].min(), df_to_roll['Meldedatum'].max(), freq = '1D')

To compute rolling 7-day totals of new cases, we need to make sure to have one row per day for every administrative district. Thus, we to add rows with zero cases for all days that are missing.

In [14]:
lkr_all = set(df_to_roll['IdLandkreis'].unique())
# sweep over all days in the data set
for date in pd.date_range(df_to_roll['Meldedatum'].min(), df_to_roll['Meldedatum'].max(), freq = '1D'):
    # add zero rows for all districts that didn't report cases on that day
    for id_lkr in lkr_all - set(df_to_roll.loc[df_to_roll['Meldedatum'] == date ]['IdLandkreis']):
        df_to_roll = df_to_roll.append({'Meldedatum':date, 'IdLandkreis': id_lkr, 'AnzahlFall':0}, ignore_index=True)
    print('fixed date',date)        

fixed date 2020-01-28 00:00:00
fixed date 2020-01-29 00:00:00
fixed date 2020-01-30 00:00:00
fixed date 2020-01-31 00:00:00
fixed date 2020-02-01 00:00:00
fixed date 2020-02-02 00:00:00
fixed date 2020-02-03 00:00:00
fixed date 2020-02-04 00:00:00
fixed date 2020-02-05 00:00:00
fixed date 2020-02-06 00:00:00
fixed date 2020-02-07 00:00:00
fixed date 2020-02-08 00:00:00
fixed date 2020-02-09 00:00:00
fixed date 2020-02-10 00:00:00
fixed date 2020-02-11 00:00:00
fixed date 2020-02-12 00:00:00
fixed date 2020-02-13 00:00:00
fixed date 2020-02-14 00:00:00
fixed date 2020-02-15 00:00:00
fixed date 2020-02-16 00:00:00
fixed date 2020-02-17 00:00:00
fixed date 2020-02-18 00:00:00
fixed date 2020-02-19 00:00:00
fixed date 2020-02-20 00:00:00
fixed date 2020-02-21 00:00:00
fixed date 2020-02-22 00:00:00
fixed date 2020-02-23 00:00:00
fixed date 2020-02-24 00:00:00
fixed date 2020-02-25 00:00:00
fixed date 2020-02-26 00:00:00
fixed date 2020-02-27 00:00:00
fixed date 2020-02-28 00:00:00
fixed da

In [15]:
df_to_roll.sort_values(by='Meldedatum',inplace = True)
df_to_roll

Unnamed: 0,Meldedatum,IdLandkreis,Landkreis,AnzahlFall
0,2020-01-28,5513,SK Gelsenkirchen,2
20675,2020-01-28,16062,,0
20674,2020-01-28,16061,,0
20673,2020-01-28,16056,,0
20672,2020-01-28,16055,,0
...,...,...,...,...
45441,2020-05-20,8222,,0
45440,2020-05-20,8221,,0
45439,2020-05-20,1051,,0
45449,2020-05-20,3102,,0


In [16]:
df_to_roll.sort_values(by='Meldedatum',inplace = True)
df_to_roll = df_to_roll.set_index('Meldedatum').groupby('IdLandkreis').rolling('7d').sum()

df_to_roll = df_to_roll.drop(columns = ['IdLandkreis']).reset_index()
#df_to_roll.loc[df_to_roll['IdLandkreis'] == 5558]

In [17]:
df_cases_roll = pd.merge(df_to_roll, df_population,on='IdLandkreis')
df_cases_roll.insert(3,'AnzahlFall100k',0)
df_cases_roll['AnzahlFall100k'] = df_cases_roll['AnzahlFall']/df_cases_roll['pop_tot']*(10**5)
df_cases_roll
df_cases_roll.loc[df_cases_roll['IdLandkreis']==5558].tail(25)

Unnamed: 0,IdLandkreis,Meldedatum,AnzahlFall,AnzahlFall100k,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2
10691,5558,2020-04-26,19.0,8.639152,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10692,5558,2020-04-27,18.0,8.18446,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10693,5558,2020-04-28,21.0,9.548536,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10694,5558,2020-04-29,22.0,10.003228,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10695,5558,2020-04-30,30.0,13.640766,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10696,5558,2020-05-01,27.0,12.276689,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10697,5558,2020-05-02,37.0,16.823611,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10698,5558,2020-05-03,46.0,20.915841,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10699,5558,2020-05-04,54.0,24.553379,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0
10700,5558,2020-05-05,81.0,36.830068,Kreis,Coesfeld,DEA35,1112.05,219929.0,108564.0,111365.0,198.0


In [18]:
df_cases_roll['Landkreis'] = df_cases_roll['IdLandkreis']
# this step is a bit slow, could likely be improved
df_cases_roll['Landkreis'] = df_cases_roll['Landkreis'].apply(lambda x: df_cases.loc[df_cases['IdLandkreis'] == x]['Landkreis'].iloc[0])

In [19]:
df_cases_roll
df_cases_roll.rename(columns={"AnzahlFall":"7d_AnzahlFall",'AnzahlFall100k':'7d_AnzahlFall100k'})

Unnamed: 0,IdLandkreis,Meldedatum,7d_AnzahlFall,7d_AnzahlFall100k,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2,Landkreis
0,1001,2020-01-28,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
1,1001,2020-01-29,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
2,1001,2020-01-30,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
3,1001,2020-01-31,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
4,1001,2020-02-01,0.0,0.000000,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,SK Flensburg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45709,16077,2020-05-16,5.0,5.548281,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
45710,16077,2020-05-17,5.0,5.548281,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
45711,16077,2020-05-18,1.0,1.109656,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land
45712,16077,2020-05-19,2.0,2.219312,Landkreis,Altenburger Land,DEG0M,569.40,90118.0,44138.0,45980.0,158.0,LK Altenburger Land


# Cases overview

In [20]:
df_rki.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                [['Meldedatum','Bundesland','AnzahlFall','AnzahlTodesfall']]

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall,AnzahlTodesfall
0,2020-01-28,Bayern,2,0
1,2020-01-28,Nordrhein-Westfalen,2,0
2,2020-01-29,Bayern,2,0
3,2020-01-31,Bayern,3,0
4,2020-02-03,Bayern,1,0
...,...,...,...,...
1263,2020-05-20,Saarland,3,0
1264,2020-05-20,Sachsen,5,0
1265,2020-05-20,Sachsen-Anhalt,2,0
1266,2020-05-20,Schleswig-Holstein,5,0


# Long data format

In [21]:
df_ctr_cases = df_cases.groupby(['Meldedatum'])\
                        .sum()[['AnzahlFall']]
df_ctr_deaths = df_deaths.groupby(['Meldedatum'])\
                        .sum()[['AnzahlTodesfall']]
df_ctr_recovered = df_recovered.groupby(['Meldedatum'])\
                        .sum()[['AnzahlGenesen']]
df_ctr_cases = pd.melt(df_ctr_cases.reset_index(), id_vars=['Meldedatum'],\
                                                   value_vars = ['AnzahlFall'],\
                                                   var_name = 'category',\
                                                   value_name = 'Number')
df_ctr_deaths = pd.melt(df_ctr_deaths.reset_index(), id_vars=['Meldedatum'],\
                                                   value_vars = ['AnzahlTodesfall'],\
                                                   var_name = 'category',\
                                                   value_name = 'Number')
df_ctr_recovered = pd.melt(df_ctr_recovered.reset_index(), id_vars=['Meldedatum'],\
                                                   value_vars = ['AnzahlGenesen'],\
                                                   var_name = 'category',\
                                                   value_name = 'Number')
df_ctr = pd.concat([df_ctr_cases,df_ctr_deaths,df_ctr_recovered], axis = 0)
df_ctr['category'] = df_ctr['category']\
        .apply(lambda x: 'case' if x == 'AnzahlFall' else\
                              ('death' if x == 'AnzahlTodesfall' else 'recovered'))
df_ctr

Unnamed: 0,Meldedatum,category,Number
0,2020-01-28,case,4
1,2020-01-29,case,2
2,2020-01-31,case,3
3,2020-02-03,case,1
4,2020-02-04,case,5
...,...,...,...
93,2020-05-16,recovered,22
94,2020-05-17,recovered,12
95,2020-05-18,recovered,17
96,2020-05-19,recovered,11


In [22]:
df_ctr_cum = df_ctr.copy().sort_values(by=['Meldedatum','category'])
for el in list(df_ctr_cum['category'].unique()):
    df_ctr_cum.loc[df_ctr_cum['category']== el,'Number' ] = \
        np.cumsum(df_ctr_cum.loc[df_ctr_cum['category']== el,'Number' ])
df_ctr_cum    

Unnamed: 0,Meldedatum,category,Number
0,2020-01-28,case,4
0,2020-01-28,recovered,4
1,2020-01-29,case,6
1,2020-01-29,recovered,6
2,2020-01-31,case,9
...,...,...,...
75,2020-05-19,death,8146
96,2020-05-19,recovered,158037
97,2020-05-20,case,176752
76,2020-05-20,death,8147


### States

In [23]:
df_sta = pd.concat([df_cases.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                [['Meldedatum','Bundesland','AnzahlFall']],\
            df_deaths.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                            [['Meldedatum','Bundesland','AnzahlTodesfall']],\
            df_recovered.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                            [['Meldedatum','Bundesland','AnzahlGenesen']]])
df_sta = df_sta.fillna(0).groupby(['Meldedatum','Bundesland']).sum().reset_index()
df_sta[['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']] = df_sta[['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']].astype('int64')
df_sta

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
0,2020-01-28,Bayern,2,0,2
1,2020-01-28,Nordrhein-Westfalen,2,0,2
2,2020-01-29,Bayern,2,0,2
3,2020-01-31,Bayern,3,0,3
4,2020-02-03,Bayern,1,0,1
...,...,...,...,...,...
1263,2020-05-20,Saarland,3,0,0
1264,2020-05-20,Sachsen,5,0,0
1265,2020-05-20,Sachsen-Anhalt,2,0,0
1266,2020-05-20,Schleswig-Holstein,5,0,0


In [24]:
df_sta_cum = df_sta.copy()
for state in list(df_sta['Bundesland'].unique()):
    for col in ['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']:
        df_sta_cum.loc[df_sta_cum['Bundesland']==state,col] = np.cumsum(df_sta_cum.loc[df_sta_cum['Bundesland']==state,col])
df_sta_cum

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
0,2020-01-28,Bayern,2,0,2
1,2020-01-28,Nordrhein-Westfalen,2,0,2
2,2020-01-29,Bayern,4,0,4
3,2020-01-31,Bayern,7,0,7
4,2020-02-03,Bayern,8,0,8
...,...,...,...,...,...
1263,2020-05-20,Saarland,2706,157,2479
1264,2020-05-20,Sachsen,5185,201,4604
1265,2020-05-20,Sachsen-Anhalt,1689,54,1558
1266,2020-05-20,Schleswig-Holstein,3021,134,2763


### Districts

In [25]:
df_recovered.groupby(['Meldedatum','Landkreis']).sum().reset_index()

Unnamed: 0,Meldedatum,Landkreis,FID,IdBundesland,IdLandkreis,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn
0,2020-01-28,LK Landsberg a.Lech,8343897,9,9181,0,1,1
1,2020-01-28,LK Starnberg,8347393,9,9188,0,1,1
2,2020-01-28,SK Gelsenkirchen,16564370,10,11026,0,2,1
3,2020-01-29,LK Fürstenfeldbruck,8342951,9,9179,0,1,1
4,2020-01-29,SK München,8336441,9,9162,0,1,1
...,...,...,...,...,...,...,...,...
18955,2020-05-20,LK Ilm-Kreis,8388710,16,16070,1,1,1
18956,2020-05-20,LK Kleve,8269943,5,5154,1,1,1
18957,2020-05-20,LK Rhön-Grabfeld,8365288,9,9673,1,1,1
18958,2020-05-20,LK Würzburg,8366485,9,9679,1,1,1


In [26]:
df_lkr = pd.concat([df_cases.groupby(['Meldedatum','Landkreis']).sum().reset_index()\
                [['Meldedatum','Landkreis','AnzahlFall']],\
            df_deaths.groupby(['Meldedatum','Landkreis']).sum().reset_index()\
                            [['Meldedatum','Landkreis','AnzahlTodesfall']],\
            df_recovered.groupby(['Meldedatum','Landkreis']).sum().reset_index()\
                            [['Meldedatum','Landkreis','AnzahlGenesen']]])
df_lkr = df_lkr.fillna(0).groupby(['Meldedatum','Landkreis']).sum().reset_index()
df_lkr[['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']] = df_lkr[['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']].astype('int64')
df_lkr

Unnamed: 0,Meldedatum,Landkreis,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-28,SK Gelsenkirchen,2,0,2
3,2020-01-29,LK Fürstenfeldbruck,1,0,1
4,2020-01-29,SK München,1,0,1
...,...,...,...,...,...
21116,2020-05-20,SK Schweinfurt,1,0,0
21117,2020-05-20,SK Stuttgart,5,0,0
21118,2020-05-20,SK Wiesbaden,6,0,0
21119,2020-05-20,SK Worms,1,0,0


In [27]:
df_lkr_cum = df_lkr.copy()
for el in list(df_lkr['Landkreis'].unique()):
    for col in ['AnzahlFall','AnzahlTodesfall','AnzahlGenesen']:
        df_lkr_cum.loc[df_lkr_cum['Landkreis']==el,col] = np.cumsum(df_lkr_cum.loc[df_lkr_cum['Landkreis']==el,col])
df_lkr_cum

Unnamed: 0,Meldedatum,Landkreis,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
0,2020-01-28,LK Landsberg a.Lech,1,0,1
1,2020-01-28,LK Starnberg,1,0,1
2,2020-01-28,SK Gelsenkirchen,2,0,2
3,2020-01-29,LK Fürstenfeldbruck,1,0,1
4,2020-01-29,SK München,1,0,1
...,...,...,...,...,...
21116,2020-05-20,SK Schweinfurt,171,18,149
21117,2020-05-20,SK Stuttgart,1436,57,1279
21118,2020-05-20,SK Wiesbaden,401,12,324
21119,2020-05-20,SK Worms,205,7,189


Location Data

Collected the coordinates of the individual districts on this webpage:
https://public.opendatasoft.com/explore/dataset/landkreise-in-germany/export/

In [28]:
pd.read_csv?

In [29]:
#geo_data = pd.read_csv('https://public.opendatasoft.com/explore/dataset/landkreise-in-germany/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B')
geo_data = pd.read_csv('./data/landkreise-in-germany.csv', delimiter = ';',\
                       usecols=['Geo Point','Name 2','Cca 2', 'Type 2'])
# drop NaN row corresponding to a lake
geo_data.dropna(axis=0, inplace = True)
geo_data[['lat','lon']] = geo_data['Geo Point'].str.split(',', expand=True)
geo_data.drop(columns = 'Geo Point',inplace = True)

In [30]:
geo_data.rename(columns = {'Name 2':'Name','Cca 2':'IdLandkreis','Type 2':'Type of District'}, inplace =True)
geo_data

Unnamed: 0,Name,IdLandkreis,Type of District,lat,lon
0,Freiburg im Breisgau,8311.0,Stadtkreis,47.9925229956,7.81807596197
1,Dillingen an der Donau,9773.0,Landkreis,48.5964037974,10.527764168
2,Nürnberg,9564.0,Kreisfreie Stadt,49.4362114486,11.0827553426
3,Neumarkt in der Oberpfalz,9373.0,Landkreis,49.2159614099,11.5665579197
4,Rosenheim,9163.0,Kreisfreie Stadt,47.8443777181,12.1087247511
...,...,...,...,...,...
398,Meißen,14627.0,Landkreis,51.239397748,13.4829006825
399,Plön,1057.0,Kreis,54.2433885939,10.3636951573
400,Stormarn,1062.0,Kreis,53.7208005726,10.3316398811
401,Altenburger Land,16077.0,Landkreis,50.9564246614,12.3991313423


In [31]:
df_cases_loc = pd.merge(df_cases, geo_data, on='IdLandkreis')[['IdLandkreis','Meldedatum','AnzahlFall','lat','lon']]
df_cases_loc

Unnamed: 0,IdLandkreis,Meldedatum,AnzahlFall,lat,lon
0,1001,2020-03-14,1,54.7849933768,9.43852835486
1,1001,2020-03-19,1,54.7849933768,9.43852835486
2,1001,2020-03-19,1,54.7849933768,9.43852835486
3,1001,2020-03-21,1,54.7849933768,9.43852835486
4,1001,2020-03-27,1,54.7849933768,9.43852835486
...,...,...,...,...,...
139668,16077,2020-03-24,1,50.9564246614,12.3991313423
139669,16077,2020-03-27,1,50.9564246614,12.3991313423
139670,16077,2020-05-07,1,50.9564246614,12.3991313423
139671,16077,2020-05-06,1,50.9564246614,12.3991313423


In [47]:
df_cases_loc = pd.merge(df_cases_loc,df_population[['IdLandkreis','pop_tot']],on='IdLandkreis')
df_cases_loc['AnzahlFall'] = df_cases_loc['AnzahlFall']/df_cases_loc['pop_tot']*10**5
df_cases_loc['AnzahlFall'] = df_cases_loc['AnzahlFall'].apply(np.round)

array([ 1.,  0.,  2.,  3.,  5., 10.,  7.,  4.,  8.,  6., 14.,  9., 18.,
       11., 20., 12., 13., 15., 17.])

In [51]:
df_cases_loc['AnzahlFall'] = df_cases_loc['AnzahlFall'].apply(np.round).astype(int)

When plotting the map, we only use the geological coordinates of the reported case. Currently, each row contains information about how many cases were reported. Thus, we create a new row for every reported cases and copy the coordinates of the district.

In [52]:
df_cases_loc_long = df_cases_loc.loc[df_cases_loc['AnzahlFall'] == 1]
for n_cases in sorted(df_cases_loc['AnzahlFall'].unique())[1:]:
    for k in range(n_cases):
        df_cases_loc_long = \
            pd.concat([df_cases_loc_long, df_cases_loc.loc[df_cases_loc['AnzahlFall'] == n_cases]])
df_cases_loc_long.drop(columns='AnzahlFall', inplace = True)

In [54]:
df_cases_loc_long

Unnamed: 0,IdLandkreis,Meldedatum,lat,lon,pop_tot
0,1001,2020-03-14,54.7849933768,9.43852835486,89504.0
1,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
2,1001,2020-03-19,54.7849933768,9.43852835486,89504.0
3,1001,2020-03-21,54.7849933768,9.43852835486,89504.0
4,1001,2020-03-27,54.7849933768,9.43852835486,89504.0
...,...,...,...,...,...
67609,8136,2020-04-11,48.877525969,10.0901573582,314002.0
67609,8136,2020-04-11,48.877525969,10.0901573582,314002.0
67609,8136,2020-04-11,48.877525969,10.0901573582,314002.0
67609,8136,2020-04-11,48.877525969,10.0901573582,314002.0


In [55]:
(pd.Timestamp.today() - df_cases_loc_long['Meldedatum'].head()).dt.days < 7

0    False
1    False
2    False
3    False
4    False
Name: Meldedatum, dtype: bool

In [56]:
df_cases_7d = df_cases_loc_long.loc[(pd.Timestamp.today() - df_cases_loc_long['Meldedatum']).dt.days < 7]

In [57]:
df_cases_7d

Unnamed: 0,IdLandkreis,Meldedatum,lat,lon,pop_tot
23,1001,2020-05-17,54.7849933768,9.43852835486,89504.0
466,1004,2020-05-15,54.0811244365,9.98448195474,79487.0
2009,1060,2020-05-20,53.9199857446,10.1409147199,276032.0
2112,1060,2020-05-15,53.9199857446,10.1409147199,276032.0
2511,1062,2020-05-19,53.7208005726,10.3316398811,243196.0
...,...,...,...,...,...
109517,9478,2020-05-15,50.1093305037,11.1169215721,66838.0
109517,9478,2020-05-15,50.1093305037,11.1169215721,66838.0
109517,9478,2020-05-15,50.1093305037,11.1169215721,66838.0
109517,9478,2020-05-15,50.1093305037,11.1169215721,66838.0


## Save Data

In [37]:
#df_cases.to_csv('data_cases.csv', index=False)
#df_deaths.to_csv('data_deaths.csv', index=False)
#df_recovered.to_csv('data_recovered.csv', index=False)
df_deaths_stats.to_csv('data_death_stats.csv', index=False)
df_stats.to_csv('data_stats.csv')
df_cases_roll.to_csv('data_cases_rolling.csv', index=False)
#df_population.to_csv('data_population.csv', index=False)
# country data
df_ctr.to_csv('data_ctr_long.csv', index=False)
df_ctr_cum.to_csv('data_ctr_cum_long.csv', index=False)

In [38]:
# state data
df_sta.to_csv('data_sta_long.csv', index=False)
df_sta_cum.to_csv('data_sta_cum_long.csv', index=False)

In [39]:
# district data
df_lkr.to_csv('data_lkr_long.csv', index=False)
df_lkr_cum.to_csv('data_lkr_cum_long.csv', index=False)

In [40]:
# location case data 
df_cases_loc_long.to_csv('data_loc_long.csv', index=False)