In [1]:
import pandas as pd
import altair as alt
import numpy as np

df_cases = pd.read_csv('data_cases.csv')
df_population = pd.read_csv('data_population.csv')

In [2]:
cases_cumulative = np.cumsum(df_cases.groupby(['Meldedatum']).sum()[['AnzahlFall','AnzahlTodesfall']])
cases_cumulative.head()

Unnamed: 0_level_0,AnzahlFall,AnzahlTodesfall
Meldedatum,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-28,4,0
2020-01-29,6,0
2020-01-31,9,0
2020-02-03,10,0
2020-02-04,14,0


In [65]:
df_ctr_tmp = df_cases.groupby(['Meldedatum'])\
                        .sum()[['AnzahlFall','AnzahlTodesfall']]
df_ctr = \
        pd.melt(df_ctr_tmp.reset_index(), id_vars=['Meldedatum'],\
                value_vars = ['AnzahlFall','AnzahlTodesfall'],\
                var_name = 'category',\
                value_name = 'Number')

# State level data
df_sta = df_cases.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                [['Meldedatum','Bundesland','AnzahlFall','AnzahlTodesfall']]
#
df_lkr = df_cases.groupby(['Meldedatum','Landkreis']).sum().reset_index()\
                [['Meldedatum','Landkreis','AnzahlFall','AnzahlTodesfall']]                


# Plots on country level

In [3]:
df_cumulative = \
    pd.melt(cases_cumulative.reset_index(), id_vars=['Meldedatum'],\
            value_vars = ['AnzahlFall','AnzahlTodesfall'],\
            var_name = 'category',\
            value_name = 'Number')
df_cumulative.Meldedatum = pd.to_datetime(df_cumulative.Meldedatum, format='%Y/%m/%d')
df_cumulative.tail()

Unnamed: 0,Meldedatum,category,Number
173,2020-05-06,AnzahlTodesfall,7402
174,2020-05-07,AnzahlTodesfall,7408
175,2020-05-08,AnzahlTodesfall,7409
176,2020-05-09,AnzahlTodesfall,7410
177,2020-05-10,AnzahlTodesfall,7411


In [4]:
# take care of log problems
df_cumulative.loc[df_cumulative['Number']<1,'Number'] = np.nan

In [5]:
alt.Chart(df_cumulative)\
        .mark_line(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(Number):Q', title='Cumulative Cases'),\
                color='category')\
        .properties(width=800, height=400, title='Number of Cases')

In [6]:
alt.Chart(df_cumulative)\
        .mark_line(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(Number):Q', title='Cumulative Cases',\
                        scale=alt.Scale(type='log')), color='category')\
        .properties(width=800, height=400, title='Number of Cases')

In [7]:
df_daily_tmp = df_cases.groupby(['Meldedatum']).sum()[['AnzahlFall','AnzahlTodesfall']]
df_daily = \
    pd.melt(df_daily_tmp.reset_index(), id_vars=['Meldedatum'],\
            value_vars = ['AnzahlFall','AnzahlTodesfall'],\
            var_name = 'category',\
            value_name = 'Number')
df_daily.head()

Unnamed: 0,Meldedatum,category,Number
0,2020-01-28,AnzahlFall,4
1,2020-01-29,AnzahlFall,2
2,2020-01-31,AnzahlFall,3
3,2020-02-03,AnzahlFall,1
4,2020-02-04,AnzahlFall,4


In [8]:
alt.Chart(df_daily)\
        .mark_bar(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(Number):Q', title='Cumulative Cases'),\
                color='category')\
        .properties(width=800, height=400, title='Number of Cases')

In [9]:
df_daily.loc[df_daily['Number']>0].Number.unique()

array([   4,    2,    3,    1,    7,   23,   47,   21,   37,   41,   85,
        157,  186,  180,  139,   98,  344,  587,  746,  988, 1430, 1304,
        962, 2040, 3022, 3563, 4030, 4086, 3312, 2310, 3700, 4778, 5612,
       5823, 6104, 4789, 3098, 4074, 5978, 6248, 6563, 6314, 4250, 2528,
       3675, 5162, 5281, 4924, 3308, 2929, 1837, 1595, 2437, 3276, 3387,
       3064, 2058, 1381, 1751, 2180, 2461, 2091, 1891, 1301,  688, 1131,
       1437, 1404, 1468,  872,  624,  420,  716, 1081, 1187, 1151,  870,
        572,  178,    5,   11,   14,   20,   12,   35,   40,   63,   59,
         94,   88,   75,  117,  161,  190,  249,  263,  235,  156,  250,
        328,  354,  374,  366,  220,  162,  269,  291,  305,  289,  171,
        147,  141,  191,  189,  145,   97,   96,  114,  111,  103,   64,
         54,   50,   39,   17,   15,    6])

In [10]:
alt.Chart(df_daily.loc[df_daily['Number']>0])\
        .mark_line(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(Number):Q', title='Cumulative Cases',\
                       scale=alt.Scale(type='log')),\
                color='category')\
        .properties(width=800, height=400, title='Number of Cases')

# State level plots

In [11]:
df_cases.groupby(['Meldedatum','Bundesland']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,FID,IdBundesland,AnzahlFall,AnzahlTodesfall,IdLandkreis,NeuerFall,NeuerTodesfall,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn,age_A00-A04,age_A05-A14,age_A15-A34,age_A35-A59,age_A60-A79,age_A80+,age_unbekannt
Meldedatum,Bundesland,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-01-28,Bayern,11796210,18,2,0,18369,0,-18,0,2,2,0,0,1,1,0,0,0
2020-01-28,Nordrhein-Westfalen,11675484,10,2,0,11026,0,-18,0,2,1,0,0,0,0,1,1,0
2020-01-29,Bayern,11785228,18,2,0,18341,0,-18,0,2,2,0,0,2,0,0,0,0
2020-01-31,Bayern,17695845,27,3,0,27557,0,-27,0,3,3,1,0,0,2,0,0,0
2020-02-03,Bayern,5889109,9,1,0,9162,0,-9,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-10,Niedersachsen,5819471,3,1,0,3456,1,-9,-9,0,0,0,0,0,1,0,0,0
2020-05-10,Nordrhein-Westfalen,274231165,235,77,0,255767,47,-423,-363,6,19,1,0,13,20,10,1,2
2020-05-10,Sachsen-Anhalt,5936575,15,1,0,15002,1,-9,-9,0,1,0,0,0,0,0,1,0
2020-05-10,Schleswig-Holstein,75515474,13,15,0,13775,13,-117,-117,0,5,0,0,3,4,1,5,0


In [12]:
df = df_cases.groupby(['Meldedatum','Bundesland']).sum().reset_index()[['Meldedatum','Bundesland','AnzahlFall']]
df

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall
0,2020-01-28,Bayern,2
1,2020-01-28,Nordrhein-Westfalen,2
2,2020-01-29,Bayern,2
3,2020-01-31,Bayern,3
4,2020-02-03,Bayern,1
...,...,...,...
1104,2020-05-10,Niedersachsen,1
1105,2020-05-10,Nordrhein-Westfalen,77
1106,2020-05-10,Sachsen-Anhalt,1
1107,2020-05-10,Schleswig-Holstein,15


In [13]:
df.loc[df.AnzahlFall <1,'AnzahlFall'] = np.nan

In [14]:
tuple(sorted(list(df['Bundesland'].unique())))

('Baden-Württemberg',
 'Bayern',
 'Berlin',
 'Brandenburg',
 'Bremen',
 'Hamburg',
 'Hessen',
 'Mecklenburg-Vorpommern',
 'Niedersachsen',
 'Nordrhein-Westfalen',
 'Rheinland-Pfalz',
 'Saarland',
 'Sachsen',
 'Sachsen-Anhalt',
 'Schleswig-Holstein',
 'Thüringen')

In [15]:
alt.Chart(df.loc[df['Bundesland'].isin(['Niedersachsen','Bayern'])]).mark_line(point=True).encode(x='Meldedatum',y='AnzahlFall', color='Bundesland')

In [267]:
df.loc[df['Bundesland'].isin(['Bremen'])]

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall
39,2020-03-01,Bremen,1.0
46,2020-03-02,Bremen,1.0
55,2020-03-03,Bremen,2.0
120,2020-03-09,Bremen,7.0
134,2020-03-10,Bremen,11.0
...,...,...,...
978,2020-05-02,Bremen,4.0
994,2020-05-03,Bremen,16.0
1008,2020-05-04,Bremen,17.0
1024,2020-05-05,Bremen,10.0


In [45]:
alt.Chart(df.loc[df['Bundesland'].isin(['Niedersachsen','Bremen'])])\
        .mark_bar(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(AnzahlFall):Q', title='Cumulative Cases'),\
                color='Bundesland')\
        .properties(width=800, height=400, title='Number of Cases')

In [36]:
alt.Chart(df).mark_line(point=True)\
    .encode(x='Meldedatum',y = alt.Y('AnzahlFall',scale=alt.Scale(type='log')), color='Bundesland')

In [244]:
state = 'Bayern'

In [245]:
df_plot = df.loc[df['Bundesland']==state]

In [250]:
alt.Chart(df_plot).mark_bar(point=True).encode(x='Meldedatum',y='AnzahlFall', color='Bundesland')

# Landkreise

In [277]:
df_lkr = df_cases.groupby(['Meldedatum','Landkreis']).sum().reset_index()[['Meldedatum','Landkreis','AnzahlFall']]
df_lkr

Unnamed: 0,Meldedatum,Landkreis,AnzahlFall
0,2020-01-03,LK Lippe,-1
1,2020-01-28,LK Landsberg a.Lech,1
2,2020-01-28,LK Starnberg,1
3,2020-01-29,LK Fürstenfeldbruck,1
4,2020-01-29,SK München,1
...,...,...,...
18353,2020-05-06,SK Stuttgart,3
18354,2020-05-06,SK Weiden i.d.OPf.,3
18355,2020-05-06,SK Wiesbaden,11
18356,2020-05-06,SK Worms,1


In [279]:
lkr = ['LK Celle','SK Flensburg']
df_plot = df_lkr.loc[df_lkr['Landkreis'].isin(lkr)]

In [142]:
df_cumulative['Meldedatum'].iloc[0]

Timestamp('2020-01-03 00:00:00')

In [288]:
df_cases[''.apply
#df_population

Unnamed: 0,FID,IdBundesland,Bundesland,Landkreis,Geschlecht,AnzahlFall,AnzahlTodesfall,Meldedatum,IdLandkreis,NeuerFall,...,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn,age_A00-A04,age_A05-A14,age_A15-A34,age_A35-A59,age_A60-A79,age_A80+,age_unbekannt
0,5280712,1,Schleswig-Holstein,SK Flensburg,M,1,0,2020-03-14,1001,0,...,0,1,1,0,0,1,0,0,0,0
1,5280713,1,Schleswig-Holstein,SK Flensburg,M,1,0,2020-03-19,1001,0,...,0,1,1,0,0,1,0,0,0,0
2,5280714,1,Schleswig-Holstein,SK Flensburg,M,1,0,2020-03-19,1001,0,...,0,1,1,0,0,1,0,0,0,0
3,5280715,1,Schleswig-Holstein,SK Flensburg,M,1,0,2020-03-21,1001,0,...,0,1,1,0,0,1,0,0,0,0
4,5280716,1,Schleswig-Holstein,SK Flensburg,M,1,0,2020-03-27,1001,0,...,0,1,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129861,5410573,16,Thüringen,LK Altenburger Land,W,1,0,2020-04-16,16077,0,...,-9,0,1,0,0,0,0,1,0,0
129862,5410574,16,Thüringen,LK Altenburger Land,W,1,0,2020-04-22,16077,0,...,0,1,1,0,0,0,0,1,0,0
129863,5410575,16,Thüringen,LK Altenburger Land,M,1,1,2020-03-24,16077,0,...,-9,0,1,0,0,0,0,0,1,0
129864,5410576,16,Thüringen,LK Altenburger Land,M,1,0,2020-03-27,16077,0,...,0,1,1,0,0,0,0,0,1,0


In [185]:
alt.Chart(df_cumulative).mark_line().encode(
    alt.X('month:T', axis=alt.Axis(format='%b')),
    y='mean(Number):Q',
    color='category'
).transform_timeunit(
    month='month(Meldedatum)'
)

In [188]:
alt.Chart(df_cumulative).mark_bar().encode(
    x='monthdate(Meldedatum):O',
    y='mean(Number):Q',
    color = 'category'
)

Display Total cases by state

In [22]:
df_cases.groupby('Bundesland').sum()[['AnzahlFall','AnzahlTodesfall']]

Unnamed: 0_level_0,AnzahlFall,AnzahlTodesfall
Bundesland,Unnamed: 1_level_1,Unnamed: 2_level_1
Baden-Württemberg,33039,1540
Bayern,44365,2154
Berlin,6272,165
Brandenburg,3106,134
Bremen,1055,34
Hamburg,4780,204
Hessen,9012,412
Mecklenburg-Vorpommern,728,19
Niedersachsen,10854,498
Nordrhein-Westfalen,35130,1437


In [62]:
df

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall
0,2020-01-28,Bayern,2.0
1,2020-01-28,Nordrhein-Westfalen,2.0
2,2020-01-29,Bayern,2.0
3,2020-01-31,Bayern,3.0
4,2020-02-03,Bayern,1.0
...,...,...,...
1104,2020-05-10,Niedersachsen,1.0
1105,2020-05-10,Nordrhein-Westfalen,77.0
1106,2020-05-10,Sachsen-Anhalt,1.0
1107,2020-05-10,Schleswig-Holstein,15.0


In [70]:
def plot_cases(cat = 'Bundesland', el_list = ['Bayern']):
    if cat == 'Bundesland':
        df = df_sta
    elif cat == 'Landkreis':
        df = df_lkr
    else: 
        print('Please choose either Bundesland or Landkreis')
    c = alt.Chart(df.loc[df[cat].isin(el_list)])\
            .mark_bar(point=True)\
            .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                    y=alt.Y('mean(AnzahlFall):Q', title='Cumulative Cases'),\
                    color=cat)\
            .properties(width=800, height=400, title='Number of Cases')
    c.display()
plot_cases('Landkreis',['LK Celle','Region Hannover']) 

In [58]:
cat = 'Bundesland'
el_list = ['Niedersachsen','Bayern']

alt.Chart(df.loc[df[cat].isin(el_list)])\
        .mark_bar(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(AnzahlFall):Q', title='Cumulative Cases'),\
                color=cat)\
        .properties(width=800, height=400, title='Number of Cases')

# State wise cases per 100k inhabitants

In [35]:
df_cases_pop = pd.merge(df_cases.groupby('IdLandkreis').sum().reset_index(),df_population, on='IdLandkreis')

In [37]:
df_cases_pop.columns

Index(['IdLandkreis', 'FID', 'IdBundesland', 'AnzahlFall', 'AnzahlTodesfall',
       'NeuerFall', 'NeuerTodesfall', 'NeuGenesen', 'AnzahlGenesen',
       'IstErkrankungsbeginn', 'age_A00-A04', 'age_A05-A14', 'age_A15-A34',
       'age_A35-A59', 'age_A60-A79', 'age_A80+', 'age_unbekannt',
       'Bezeichnung', 'Name', 'NUTS3', 'area', 'pop_tot', 'pop_male',
       'pop_female', 'pop_per_sqkm2'],
      dtype='object')

In [38]:
df_cases_pop['Cases_per_100k'] = df_cases_pop['AnzahlFall']/df_cases_pop['pop_tot']

In [39]:
df_cases_pop

Unnamed: 0,IdLandkreis,FID,IdBundesland,AnzahlFall,AnzahlTodesfall,NeuerFall,NeuerTodesfall,NeuGenesen,AnzahlGenesen,IstErkrankungsbeginn,...,age_unbekannt,Bezeichnung,Name,NUTS3,area,pop_tot,pop_male,pop_female,pop_per_sqkm2,Cases_per_100k
0,1001,191626347,33,33,2,0,-279,-27,30,33,...,0,Kreisfreie Stadt,"Flensburg, Stadt",DEF01,56.73,89504.0,44599.0,44905.0,1578.0,0.000369
1,1002,1515628566,261,276,9,1,-2268,-257,245,217,...,0,Kreisfreie Stadt,"Kiel, Landeshauptstadt",DEF02,118.65,247548.0,120566.0,126982.0,2086.0,0.001115
2,1003,882696300,152,165,1,0,-1359,-18,163,135,...,0,Kreisfreie Stadt,"Lübeck, Hansestadt",DEF03,214.19,217198.0,104371.0,112827.0,1014.0,0.000760
3,1004,412320004,71,75,2,1,-621,-117,62,55,...,0,Kreisfreie Stadt,"Neumünster, Stadt",DEF04,71.66,79487.0,39241.0,40246.0,1109.0,0.000944
4,1051,325213700,56,59,3,0,-477,-63,51,51,...,0,Kreis,Dithmarschen,DEF05,1428.17,133210.0,65720.0,67490.0,93.0,0.000443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,16073,374177664,1008,65,0,0,-567,-54,59,53,...,0,Landkreis,Saalfeld-Rudolstadt,DEG0I,1036.03,106356.0,52388.0,53968.0,103.0,0.000611
397,16074,344484533,928,66,2,3,-504,-117,52,33,...,0,Landkreis,Saale-Holzland-Kreis,DEG0J,815.24,83051.0,41360.0,41691.0,102.0,0.000795
398,16075,712737300,1920,133,12,1,-990,-233,105,107,...,0,Landkreis,Saale-Orla-Kreis,DEG0K,1151.30,80868.0,40119.0,40749.0,70.0,0.001645
399,16076,2833273152,7632,530,37,1,-3959,-1136,382,427,...,0,Landkreis,Greiz,DEG0L,845.98,98159.0,48326.0,49833.0,116.0,0.005399


# Bar plot cases by state

In [80]:
tmp = df_cases.groupby('Bundesland').sum()[['AnzahlFall','AnzahlTodesfall']]\
        .sort_values(by='AnzahlFall', ascending=False).reset_index()
tmp = tmp.melt(id_vars='Bundesland', value_vars = ['AnzahlFall', 'AnzahlTodesfall'], value_name=['Cases'])
tmp

Unnamed: 0,Bundesland,variable,value
0,Bayern,AnzahlFall,44365
1,Nordrhein-Westfalen,AnzahlFall,35130
2,Baden-Württemberg,AnzahlFall,33039
3,Niedersachsen,AnzahlFall,10854
4,Hessen,AnzahlFall,9012
5,Rheinland-Pfalz,AnzahlFall,6313
6,Berlin,AnzahlFall,6272
7,Sachsen,AnzahlFall,4915
8,Hamburg,AnzahlFall,4780
9,Brandenburg,AnzahlFall,3106


In [88]:
alt.Chart(tmp)\
            .mark_bar()\
            .encode(x=alt.X('Bundesland:O', title='Bundesland'),\
                    y=alt.Y('value:Q', title='Cumulative Cases'),\
                    color='variable',\
                   tooltip=['value','variable'])\
            .properties(width=800, height=400, title='Number of Cases')