In [3]:
import pandas as pd
import altair as alt
import numpy as np

def read_data_fix_date(file_name, date_col='Meldedatum', date_format='%Y/%m/%d'):
    """Read a CSV file and convert one of its columns to datetimes.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file; handed unchanged to ``pd.read_csv``.
    date_col : str, default 'Meldedatum'
        Name of the column that holds the dates.
    date_format : str, default '%Y/%m/%d'
        ``strftime``-style pattern passed to ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``date_col`` parsed as datetimes.

    Raises
    ------
    KeyError
        If the file has no column called ``date_col``.
    """
    df = pd.read_csv(file_name)
    if date_col not in df.columns:
        # Same exception type as the plain column lookup, but the message
        # says which file is affected and what columns it actually has.
        raise KeyError(
            f"{file_name!r} has no {date_col!r} column; found {list(df.columns)}"
        )
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)
    return df

# Tables with a 'Meldedatum' column that must be parsed as a date.
df_cases, df_deaths, df_recovered, df_cases_rolling = map(
    read_data_fix_date,
    ['data_cases.csv', 'data_deaths.csv', 'data_recovered.csv', 'data_cases_rolling.csv'],
)
# country
df_ctr, df_ctr_cum = map(
    read_data_fix_date, ['data_ctr_long.csv', 'data_ctr_cum_long.csv']
)
# states
df_sta, df_sta_cum = map(
    read_data_fix_date, ['data_sta_long.csv', 'data_sta_cum_long.csv']
)
# districts
df_lkr, df_lkr_cum = map(
    read_data_fix_date, ['data_lkr_long.csv', 'data_lkr_cum_long.csv']
)

# Summary tables are read as-is, without any date parsing.
df_stats, df_death_stats, df_population = map(
    pd.read_csv, ['data_stats.csv', 'data_death_stats.csv', 'data_population.csv']
)


In [4]:
df_stats

Unnamed: 0,Unnamed: 1,Cases,Recovered,Deaths,Active
0,Total,173772,152608,7881,13283
1,Today,620,906,57,-343


In [17]:
# Base figure size; the chart below uses two thirds of the width and half the height.
_width, _height = 800, 400
(
    alt.Chart(df_death_stats)
    .mark_bar()
    .encode(
        x=alt.X('Age'),
        # log-scaled y axis
        y=alt.Y('Count', scale=alt.Scale(type='log')),
        tooltip=['Age', 'Count'],
    )
    .properties(
        title='Log Number of Deaths in Different Age Groups',
        width=_width * 2 / 3,
        height=_height / 2,
    )
)

In [23]:
pd.Timestamp.today().strftime('%B %d, %Y')

'May 16, 2020'

In [19]:
# Case counts per reporting date and Bundesland.  The value column is selected
# *before* aggregating so that sum() is never applied to the other columns of
# df_cases; the resulting frame (Meldedatum, Bundesland, AnzahlFall) is the same.
df_cases.groupby(['Meldedatum', 'Bundesland'])['AnzahlFall'].sum().reset_index()

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall
0,2020-01-28,Bayern,2
1,2020-01-28,Nordrhein-Westfalen,2
2,2020-01-29,Bayern,2
3,2020-01-31,Bayern,3
4,2020-02-03,Bayern,1
...,...,...,...
1125,2020-05-11,Saarland,1
1126,2020-05-11,Sachsen,2
1127,2020-05-11,Sachsen-Anhalt,2
1128,2020-05-11,Schleswig-Holstein,4


In [18]:
# Country level: sum the counts per reporting date, then reshape to long format
# (one row per date and category).
# NOTE(review): the recorded run of this cell failed with
# KeyError "['AnzahlTodesfall'] not in index", so df_cases (data_cases.csv)
# apparently has no 'AnzahlTodesfall' column -- confirm which table holds deaths.
df_ctr_cases = df_cases.groupby(['Meldedatum'])\
                        .sum()[['AnzahlFall','AnzahlTodesfall']]
df_ctr_cases_long = \
        pd.melt(df_ctr_cases.reset_index(), id_vars=['Meldedatum'],\
                value_vars = ['AnzahlFall','AnzahlTodesfall'],\
                var_name = 'category',\
                value_name = 'Number')

# State level data
# NOTE(review): this rebinds df_sta, which the first cell loaded from
# data_sta_long.csv; later cells see whichever assignment ran last.
df_sta = df_cases.groupby(['Meldedatum','Bundesland']).sum().reset_index()\
                [['Meldedatum','Bundesland','AnzahlFall','AnzahlTodesfall']]
# District (Landkreis) level data
# NOTE(review): likewise rebinds df_lkr, loaded from data_lkr_long.csv in the first cell.
df_lkr = df_cases.groupby(['Meldedatum','Landkreis']).sum().reset_index()\
                [['Meldedatum','Landkreis','AnzahlFall','AnzahlTodesfall']]                


KeyError: "['AnzahlTodesfall'] not in index"

# EDA

In [None]:
def _flag_total(flag_col, flags, value_col):
    """Sum `value_col` over the df_cases rows whose `flag_col` is one of `flags`."""
    return df_cases.loc[df_cases[flag_col].isin(flags)][value_col].sum()


# Flag sets used below.
# NOTE(review): presumably the RKI convention where 0/1 mark records counted in
# the current total and -1/1 mark the change since the previous publication --
# TODO confirm against the data source documentation.
_TOTAL_FLAGS = [0, 1]
_DELTA_FLAGS = [-1, 1]

n_cases = _flag_total('NeuerFall', _TOTAL_FLAGS, 'AnzahlFall')
n_cases_new = _flag_total('NeuerFall', _DELTA_FLAGS, 'AnzahlFall')
n_deaths = _flag_total('NeuerTodesfall', _TOTAL_FLAGS, 'AnzahlTodesfall')
n_deaths_new = _flag_total('NeuerTodesfall', _DELTA_FLAGS, 'AnzahlTodesfall')
n_recovered = _flag_total('NeuGenesen', _TOTAL_FLAGS, 'AnzahlGenesen')
n_recovered_new = _flag_total('NeuGenesen', _DELTA_FLAGS, 'AnzahlGenesen')
# Active = everything reported that has neither died nor recovered.
n_active = n_cases - n_deaths - n_recovered


In [None]:
# Today's net change in active cases, derived the same way n_active is derived
# from the totals.  The previous ' ' placeholder made the 'Active' column a
# mix of numbers and text; the data_stats.csv table shown near the top of the
# notebook carries a numeric value in that position.
n_active_new = n_cases_new - n_deaths_new - n_recovered_new

# Two-row summary: overall totals and today's changes.
df_stats = pd.DataFrame({
    ' ': ['Total', 'Today'],
    'Cases': [n_cases, n_cases_new],
    'Recovered': [n_recovered, n_recovered_new],
    'Deaths': [n_deaths, n_deaths_new],
    'Active': [n_active, n_active_new],
})
# Display only: set_index returns a new frame, df_stats itself keeps the ' ' column.
df_stats.set_index(' ')


# Plots on country level

In [44]:
# Country-level bars: one bar per calendar day, coloured by category.
(
    alt.Chart(df_ctr)
    .mark_bar(point=True)
    .encode(
        alt.X('monthdate(Meldedatum):O', title='Date'),
        alt.Y('mean(Number):Q', title='Cumulative Cases'),
        color='category',
    )
    .properties(title='Number of Cases', width=800, height=400)
)

In [50]:
df_ctr.groupby(['Meldedatum','category']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Number
Meldedatum,category,Unnamed: 2_level_1
2020-01-28,case,4
2020-01-29,case,6
2020-01-31,case,9
2020-02-03,case,10
2020-02-04,case,14
...,...,...
2020-05-09,death,177349
2020-05-10,case,177652
2020-05-10,death,177654
2020-05-11,case,178040


In [None]:
# Reshape the cumulative country table to long format: one row per date and
# category ('AnzahlFall' / 'AnzahlTodesfall'), with the value in 'Number'.
# NOTE(review): cases_cumulative is not assigned anywhere in the part of the
# notebook visible here, so this cell fails with NameError on a fresh kernel --
# restore its definition or build it from one of the loaded tables.
df_cumulative = \
    pd.melt(cases_cumulative.reset_index(), id_vars=['Meldedatum'],\
            value_vars = ['AnzahlFall','AnzahlTodesfall'],\
            var_name = 'category',\
            value_name = 'Number')
# Make sure the date column is a datetime (same format as read_data_fix_date uses).
df_cumulative.Meldedatum = pd.to_datetime(df_cumulative.Meldedatum, format='%Y/%m/%d')
df_cumulative.tail()

In [None]:
# take care of log problems: values below 1 cannot be drawn on a log axis, so
# they are replaced by NaN.  Series.mask builds a new float column instead of
# writing NaN into an existing (possibly integer) column through .loc.
df_cumulative['Number'] = df_cumulative['Number'].mask(df_cumulative['Number'] < 1)

In [None]:
# Cumulative numbers as lines on a linear y axis.
(
    alt.Chart(df_cumulative)
    .mark_line(point=True)
    .encode(
        alt.X('monthdate(Meldedatum):O', title='Date'),
        alt.Y('mean(Number):Q', title='Cumulative Cases'),
        color='category',
    )
    .properties(title='Number of Cases', width=800, height=400)
)

In [None]:
# Same line chart as the linear version, but with a log-scaled y axis.
(
    alt.Chart(df_cumulative)
    .mark_line(point=True)
    .encode(
        alt.X('monthdate(Meldedatum):O', title='Date'),
        alt.Y(
            'mean(Number):Q',
            title='Cumulative Cases',
            scale=alt.Scale(type='log'),
        ),
        color='category',
    )
    .properties(title='Number of Cases', width=800, height=400)
)

In [None]:
# Per-date totals.  The two count columns are selected before aggregating so
# that sum() is not applied to every other column of df_cases first.
df_daily_tmp = df_cases.groupby(['Meldedatum'])[['AnzahlFall', 'AnzahlTodesfall']].sum()
# Long format: one row per date and category, value in 'Number'.
df_daily = df_daily_tmp.reset_index().melt(
    id_vars=['Meldedatum'],
    value_vars=['AnzahlFall', 'AnzahlTodesfall'],
    var_name='category',
    value_name='Number',
)
df_daily.head()

In [None]:
# df_daily holds per-date sums (not running totals), so the y axis is labelled
# as a daily count rather than 'Cumulative Cases'.  The `point` mark option was
# dropped because this is a bar mark.
(
    alt.Chart(df_daily)
    .mark_bar()
    .encode(
        x=alt.X('monthdate(Meldedatum):O', title='Date'),
        y=alt.Y('mean(Number):Q', title='Daily Count'),
        color='category',
    )
    .properties(width=800, height=400, title='Number of Cases')
)

In [None]:
df_daily.loc[df_daily['Number']>0].Number.unique()

In [None]:
# Log-scale line chart of the per-date sums; non-positive values are filtered
# out because they cannot be placed on a log axis.  df_daily is not cumulative,
# so the axis is labelled as a daily count.
(
    alt.Chart(df_daily.loc[df_daily['Number'] > 0])
    .mark_line(point=True)
    .encode(
        x=alt.X('monthdate(Meldedatum):O', title='Date'),
        y=alt.Y(
            'mean(Number):Q',
            title='Daily Count',
            scale=alt.Scale(type='log'),
        ),
        color='category',
    )
    .properties(width=800, height=400, title='Number of Cases')
)

# State level plots

In [None]:
df_cases.groupby(['Meldedatum','Bundesland']).sum()


In [None]:
# Case counts per reporting date and Bundesland.  Selecting 'AnzahlFall' before
# aggregating gives the same three-column frame without summing every other
# column of df_cases first.
df = df_cases.groupby(['Meldedatum', 'Bundesland'])['AnzahlFall'].sum().reset_index()
df

In [None]:
# Blank out counts below 1 (they cannot be shown on a log axis).  Series.mask
# builds a new float column instead of writing NaN into an integer column via .loc.
df['AnzahlFall'] = df['AnzahlFall'].mask(df['AnzahlFall'] < 1)

In [None]:
tuple(sorted(list(df['Bundesland'].unique())))

In [None]:
alt.Chart(df.loc[df['Bundesland'].isin(['Niedersachsen','Bayern'])]).mark_line(point=True).encode(x='Meldedatum',y='AnzahlFall', color='Bundesland')

In [None]:
df.loc[df['Bundesland'].isin(['Bremen'])]

In [None]:
# Bars for two selected states.  `df` as built earlier in this section holds
# per-date sums, so the y axis is labelled 'Daily Cases' instead of
# 'Cumulative Cases'; the `point` option was dropped because this is a bar mark.
(
    alt.Chart(df.loc[df['Bundesland'].isin(['Niedersachsen', 'Bremen'])])
    .mark_bar()
    .encode(
        x=alt.X('monthdate(Meldedatum):O', title='Date'),
        y=alt.Y('mean(AnzahlFall):Q', title='Daily Cases'),
        color='Bundesland',
    )
    .properties(width=800, height=400, title='Number of Cases')
)

In [None]:
# All states as lines on a log-scaled y axis.
(
    alt.Chart(df)
    .mark_line(point=True)
    .encode(
        x='Meldedatum',
        y=alt.Y('AnzahlFall', scale=alt.Scale(type='log')),
        color='Bundesland',
    )
)

In [None]:
state = 'Bayern'

In [None]:
df_plot = df.loc[df['Bundesland']==state]

In [None]:
alt.Chart(df_plot).mark_bar(point=True).encode(x='Meldedatum',y='AnzahlFall', color='Bundesland')

In [66]:
df_sta_cum

Unnamed: 0,Meldedatum,Bundesland,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
0,2020-01-28,Bayern,2,0,2
1,2020-01-28,Nordrhein-Westfalen,2,0,2
2,2020-01-29,Bayern,4,0,4
3,2020-01-31,Bayern,7,0,7
4,2020-02-03,Bayern,8,0,8
...,...,...,...,...,...
1125,2020-05-11,Saarland,2665,144,2363
1126,2020-05-11,Sachsen,4947,190,4287
1127,2020-05-11,Sachsen-Anhalt,1648,50,1453
1128,2020-05-11,Schleswig-Holstein,2956,125,2526


In [67]:
states_sel = ['Niedersachsen', 'Bayern']
# NOTE(review): the y channel plots AnzahlTodesfall while the axis/chart titles
# and the tooltip refer to cases (AnzahlFall) -- confirm which one is intended.
(
    alt.Chart(df_sta_cum.loc[df_sta_cum['Bundesland'].isin(states_sel)])
    .mark_line(point=True)
    .encode(
        alt.X('monthdate(Meldedatum):O', title='Date'),
        alt.Y('mean(AnzahlTodesfall):Q', title='Cumulative Cases'),
        color='Bundesland',
        tooltip=['Bundesland', 'AnzahlFall'],
    )
    .properties(title='Number of Cases', width=800, height=400)
    .interactive()
)

In [38]:
def longify_df_cum(df_cum,cat,sel):
    """Return the rows of one `cat` value reshaped from wide to long format.

    Keeps the rows of `df_cum` where column `cat` equals `sel`, drops that
    column, and melts the three count columns so that each output row holds
    'Meldedatum', a 'category' (the former column name) and its 'Number'.
    The input frame is not modified.
    """
    count_cols = ['AnzahlFall', 'AnzahlTodesfall', 'AnzahlGenesen']
    selected = df_cum[df_cum[cat] == sel]
    return selected.drop(columns=[cat]).melt(
        id_vars=['Meldedatum'],
        value_vars=count_cols,
        var_name='category',
        value_name='Number',
    )
    

In [24]:
# Rows of the selected state from the cumulative state table, without the
# 'Bundesland' column.
# NOTE(review): state_sel_cum is only assigned in cells further down the
# notebook, so this cell depends on out-of-order execution.
tmp = df_sta_cum.loc[df_sta_cum['Bundesland']==state_sel_cum].drop(columns=['Bundesland'])
tmp

In [32]:
# Wide -> long: one row per date and count column, value stored in 'Number'.
# NOTE(review): tmp is overwritten with its own melted form, so the cell is not
# idempotent -- running it a second time melts a frame that no longer has the
# three count columns.  longify_df_cum performs the same steps without this issue.
tmp = pd.melt(tmp, id_vars = ['Meldedatum'], var_name= 'category',\
        value_vars = ['AnzahlFall','AnzahlTodesfall','AnzahlGenesen'],\
       value_name = 'Number')

In [42]:
state_sel_cum = 'Niedersachsen'
# Overlaid (unstacked) semi-transparent areas, one per category, for one state.
(
    alt.Chart(longify_df_cum(df_sta_cum, 'Bundesland', state_sel_cum))
    .mark_area(point=True, opacity=0.5)
    .encode(
        alt.X('monthdate(Meldedatum):O', title='Date'),
        alt.Y('mean(Number):Q', title='Cumulative Cases', stack=None),
        color='category',
        tooltip=['monthdate(Meldedatum)', 'category', 'Number'],
    )
    .properties(
        title='Number of Cases in ' + state_sel_cum,
        width=800,
        height=400,
    )
)

# Landkreise

In [None]:
# Case counts per reporting date and Landkreis.  Selecting 'AnzahlFall' before
# aggregating yields the same three-column frame without summing every other
# column of df_cases first.
# NOTE(review): this rebinds df_lkr, which the first cell loaded from data_lkr_long.csv.
df_lkr = df_cases.groupby(['Meldedatum', 'Landkreis'])['AnzahlFall'].sum().reset_index()
df_lkr

In [None]:
lkr = ['LK Celle','SK Flensburg']
df_plot = df_lkr.loc[df_lkr['Landkreis'].isin(lkr)]

In [None]:
df_cumulative['Meldedatum'].iloc[0]

In [None]:
# Lines aggregated per month: a 'month' field is derived from Meldedatum by the
# time-unit transform and shown with abbreviated month names on the x axis.
(
    alt.Chart(df_cumulative)
    .mark_line()
    .encode(
        x=alt.X('month:T', axis=alt.Axis(format='%b')),
        y='mean(Number):Q',
        color='category',
    )
    .transform_timeunit(month='month(Meldedatum)')
)

In [None]:
# Bars per calendar day, coloured by category.
(
    alt.Chart(df_cumulative)
    .mark_bar()
    .encode(
        alt.X('monthdate(Meldedatum):O'),
        alt.Y('mean(Number):Q'),
        alt.Color('category'),
    )
)

# Display total cases by state

In [None]:
df_cases.groupby('Bundesland').sum()[['AnzahlFall','AnzahlTodesfall']]

In [None]:
df

In [9]:
def plot_cases(cat = 'Bundesland', el_list = ['Bayern']):
    """Display a bar chart of case counts for selected states or districts.

    Parameters
    ----------
    cat : {'Bundesland', 'Landkreis'}
        Aggregation level; selects the module-level frame ``df_sta`` or
        ``df_lkr`` and is also the column used for filtering and colouring.
    el_list : list of str
        Values of ``cat`` to include.  The default list is only read, never
        modified.

    Raises
    ------
    ValueError
        If ``cat`` is neither 'Bundesland' nor 'Landkreis'.  (Previously a
        message was printed and the function then failed on the unassigned
        local ``df``.)
    """
    if cat == 'Bundesland':
        df = df_sta
    elif cat == 'Landkreis':
        df = df_lkr
    else:
        raise ValueError(
            f'Please choose either Bundesland or Landkreis (got {cat!r})'
        )
    c = (
        alt.Chart(df.loc[df[cat].isin(el_list)])
        .mark_bar()
        .encode(
            x=alt.X('monthdate(Meldedatum):O', title='Date'),
            # The df_lkr rows shown elsewhere in the notebook are per-date
            # counts, so the axis is labelled 'Cases' (as in plot_cases_bar)
            # rather than 'Cumulative Cases'.
            y=alt.Y('mean(AnzahlFall):Q', title='Cases'),
            color=cat,
        )
        .properties(width=800, height=400, title='Number of Cases')
    )
    c.display()
plot_cases('Landkreis',['LK Celle','Region Hannover']) 

In [12]:
df_lkr_cum.loc[df_lkr_cum['Bundesland']==state_sel_cum]

KeyError: 'Bundesland'

In [11]:
state_sel_cum = 'Bayern'
alt.Chart(df_lkr_cum.loc[df_lkr_cum['Bundesland']==state_sel_cum])\
            .mark_area(point=False, opacity=0.6)\
            .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                    y=alt.Y('mean(Number):Q',\
                            title='Cumulative Cases',\
                            stack=None),\
                    color='category',\
                    tooltip=['monthdate(Meldedatum)','category','Number'])\
            .properties(width=800, height=400, title='Number of Cases')

KeyError: 'Bundesland'

In [3]:
# Column to filter/colour by and the values to keep.
cat = 'Bundesland'
el_list = ['Niedersachsen','Bayern']

# NOTE(review): `df` is not created in this cell; the recorded run failed with
# NameError because the cell that assigns it had not been executed.  Pass the
# intended frame explicitly instead of relying on a leftover global.
alt.Chart(df.loc[df[cat].isin(el_list)])\
        .mark_bar(point=True)\
        .encode(x=alt.X('monthdate(Meldedatum):O', title='Date'),\
                y=alt.Y('mean(AnzahlFall):Q', title='Cumulative Cases'),\
                color=cat)\
        .properties(width=800, height=400, title='Number of Cases')

NameError: name 'df' is not defined

# State wise cases per 100k inhabitants

In [None]:
df_cases_rolling.head()

In [44]:
# Long format of the rolling table: the 'AnzahlFall100k' column becomes a
# ('category', 'Number') pair per date and Landkreis.
df_lkr_roll = df_cases_rolling.melt(
    id_vars=['Meldedatum', 'Landkreis'],
    value_vars=['AnzahlFall100k'],
    var_name='category',
    value_name='Number',
)
df_lkr_roll

Unnamed: 0,Meldedatum,Landkreis,category,Number
0,2020-01-28,SK Flensburg,AnzahlFall100k,0.000000
1,2020-01-29,SK Flensburg,AnzahlFall100k,0.000000
2,2020-01-30,SK Flensburg,AnzahlFall100k,0.000000
3,2020-01-31,SK Flensburg,AnzahlFall100k,0.000000
4,2020-02-01,SK Flensburg,AnzahlFall100k,0.000000
...,...,...,...,...
42501,2020-05-08,LK Altenburger Land,AnzahlFall100k,2.219312
42502,2020-05-09,LK Altenburger Land,AnzahlFall100k,2.219312
42503,2020-05-10,LK Altenburger Land,AnzahlFall100k,2.219312
42504,2020-05-11,LK Altenburger Land,AnzahlFall100k,7.767594


In [45]:
df_lkr.loc[df_lkr['Landkreis']=='LK Coesfeld']

Unnamed: 0,Meldedatum,Landkreis,AnzahlFall,AnzahlTodesfall,AnzahlGenesen
151,2020-03-04,LK Coesfeld,10,0,10
204,2020-03-05,LK Coesfeld,1,0,1
266,2020-03-06,LK Coesfeld,1,0,1
375,2020-03-08,LK Coesfeld,3,0,3
425,2020-03-09,LK Coesfeld,2,0,2
...,...,...,...,...,...
18755,2020-05-08,LK Coesfeld,21,0,0
18986,2020-05-09,LK Coesfeld,37,0,0
19164,2020-05-10,LK Coesfeld,31,0,1
19299,2020-05-11,LK Coesfeld,27,0,1


In [46]:
df = df_lkr_roll
cat = 'Landkreis'
el_list = ['LK Coesfeld','Region Hannover']

# 'Number' in df_lkr_roll comes from the 'AnzahlFall100k' column, so the y axis
# is labelled per 100k instead of 'Cumulative Cases'.  The `point` option was
# dropped because this is a bar mark.
c = (
    alt.Chart(df.loc[df[cat].isin(el_list)])
    .mark_bar()
    .encode(
        x=alt.X('monthdate(Meldedatum):O', title='Date'),
        y=alt.Y('mean(Number):Q', title='Cases per 100k'),
        color=cat,
    )
    .properties(width=800, height=400, title='Number of Cases')
)
c.display()

In [47]:
df_lkr_roll

Unnamed: 0,Meldedatum,Landkreis,category,Number
0,2020-01-28,SK Flensburg,AnzahlFall100k,0.000000
1,2020-01-29,SK Flensburg,AnzahlFall100k,0.000000
2,2020-01-30,SK Flensburg,AnzahlFall100k,0.000000
3,2020-01-31,SK Flensburg,AnzahlFall100k,0.000000
4,2020-02-01,SK Flensburg,AnzahlFall100k,0.000000
...,...,...,...,...
42501,2020-05-08,LK Altenburger Land,AnzahlFall100k,2.219312
42502,2020-05-09,LK Altenburger Land,AnzahlFall100k,2.219312
42503,2020-05-10,LK Altenburger Land,AnzahlFall100k,2.219312
42504,2020-05-11,LK Altenburger Land,AnzahlFall100k,7.767594


In [48]:
def plot_cases_bar(df, cat , el_list, numeric_col, title_str):
    """Display a bar chart for the selected elements of a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Meldedatum' and 'Number' (both are used by
        the encodings below) as well as the column named by ``cat``.
    cat : str
        Column used to filter rows and to colour the bars.
    el_list : list
        Values of ``cat`` to keep.
    numeric_col : str
        Altair shorthand for the y channel, e.g. 'mean(Number):Q'.
    title_str : str
        Chart title.
    """
    subset = df.loc[df[cat].isin(el_list)]
    c = (
        alt.Chart(subset)
        .mark_bar()
        .encode(
            x=alt.X('monthdate(Meldedatum):O', title='Date'),
            y=alt.Y(numeric_col, title='Cases'),
            color=cat,
            # Use the grouping column that was passed in; a hard-coded
            # 'Landkreis' would reference a missing field for any other `cat`.
            tooltip=[cat, 'Meldedatum', 'Number'],
        )
        .properties(width=800, height=400, title=title_str)
    )
    c.display()
plot_cases_bar(df_lkr_roll, 'Landkreis',['LK Celle','Region Hannover'],'mean(Number):Q','Rolling 7-day sum of cases per 100k' ) 

In [49]:
def plot_cases_line(df, cat , el_list, numeric_col, title_str, threshold=50):
    """Display a line chart with a horizontal threshold rule layered on top.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Meldedatum' and 'Number' as well as the
        column named by ``cat``.
    cat : str
        Column used to filter rows and to colour the lines.
    el_list : list
        Values of ``cat`` to keep.
    numeric_col : str
        Altair shorthand for the y channel, e.g. 'mean(Number):Q'.
    title_str : str
        Chart title.
    threshold : number, default 50
        y position of the red rule (the value that used to be hard-coded).
    """
    c1 = (
        alt.Chart(df.loc[df[cat].isin(el_list)])
        .mark_line(point=True)
        .encode(
            x=alt.X('monthdate(Meldedatum):O', title='Date'),
            y=alt.Y(numeric_col, title='Cases'),
            color=cat,
            # Use the grouping column that was passed in instead of a
            # hard-coded 'Landkreis'.
            tooltip=[cat, 'Meldedatum', 'Number'],
        )
    )

    # One-row frame that only carries the y position of the rule.
    data = pd.DataFrame({'a': [threshold]})
    c2 = alt.Chart(data).mark_rule(strokeWidth=10).encode(
        y='a:Q',
        opacity=alt.value(0.2),
        color=alt.value('red'),
    )
    (c1 + c2).properties(width=800, height=400, title=title_str).display()
#    c.display()
plot_cases_line(df_lkr_roll, 'Landkreis',['LK Celle','Region Hannover','LK Coesfeld'],'mean(Number):Q','Rolling 7-day sum of cases per 100k' ) 

In [50]:
def plot_cases_line(df, cat , el_list, numeric_col, title_str):
    """Display a plain line chart for the selected elements of a long-format frame.

    NOTE(review): this definition shadows the plot_cases_line defined in the
    preceding cell (the variant that layers a threshold rule on top); consider
    giving one of the two a distinct name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Meldedatum' and 'Number' as well as the
        column named by ``cat``.
    cat : str
        Column used to filter rows and to colour the lines.
    el_list : list
        Values of ``cat`` to keep.
    numeric_col : str
        Altair shorthand for the y channel, e.g. 'mean(Number):Q'.
    title_str : str
        Chart title.
    """
    chart = (
        alt.Chart(df.loc[df[cat].isin(el_list)])
        .mark_line(point=False)
        .encode(
            x=alt.X('monthdate(Meldedatum):O', title='Date'),
            y=alt.Y(numeric_col, title='Cases'),
            color=cat,
            # Use the grouping column that was passed in instead of a
            # hard-coded 'Landkreis'.
            tooltip=[cat, 'Meldedatum', 'Number'],
        )
        .properties(width=800, height=400, title=title_str)
    )
    # display() is called for its side effect only; its return value is not kept.
    chart.display()
plot_cases_line(tmp, 'Landkreis',['LK Celle','Region Hannover','LK Coesfeld','Lockdown Threshold'],'mean(Number):Q','Rolling 7-day sum of cases per 100k' ) 

KeyError: 'Landkreis'

In [None]:
# Two synthetic rows (first and last date) for a pseudo-district named
# 'Lockdown Threshold' with a constant value of 50, so that it can be drawn as
# a flat line next to the real districts.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
# NOTE(review): the plot_cases_line(tmp, ...) call that uses this frame sits in
# an earlier cell, so this cell must run before it.
threshold_rows = pd.DataFrame({
    'Meldedatum': [df_lkr_roll['Meldedatum'].min(), df_lkr_roll['Meldedatum'].max()],
    'Landkreis': 'Lockdown Threshold',
    'category': 'AnzahlFall100k',
    'Number': 50,
})
tmp = pd.concat([df_lkr_roll, threshold_rows], ignore_index=True)
tmp

In [None]:
# Per-date, per-Landkreis totals of 'Case_per_100k'.
daily_per_100k = df_cases_pop.groupby(['Meldedatum', 'Landkreis'])[['Case_per_100k']].sum()
# Rolling 7-row sum computed separately for every Landkreis.  Calling
# .rolling(7) directly on the (Meldedatum, Landkreis)-indexed frame slides the
# window over consecutive rows of *different* districts.
# The first six rows of each Landkreis are NaN (window not yet full).
# NOTE(review): the window counts rows, not calendar days -- if a district has
# no row for some date the window spans more than 7 days; confirm whether the
# input is dense.
# NOTE(review): this rebinds df_cases_rolling, which the first cell loaded from
# data_cases_rolling.csv, and df_cases_pop is not defined in the visible notebook.
df_cases_rolling = (
    daily_per_100k
    .groupby(level='Landkreis')
    .transform(lambda s: s.rolling(7).sum())
    .reset_index()
)

In [None]:
tmp = df_cases_pop.groupby(['Meldedatum','Landkreis']).sum()[['Case_per_100k']].reset_index()
tmp.loc[tmp['Landkreis']=='LK Coesfeld' ][['Case_per_100k']].rolling(7).sum()

In [None]:
df2['AnzahlFall'].rolling('7d').sum()

In [None]:
keep_col = ['AnzahlFall','AnzahlTodesfall']#,'Bundesland','Landkreis']
df3 = df_cases_pop.groupby('IdLandkreis').rolling('7d').sum()[keep_col].reset_index()
#df3.loc[df3['Landkreis']=='LK Coesfeld']

In [None]:
df2 = df_cases.sort_values(by='Meldedatum').set_index('Meldedatum')

In [None]:
# Rolling per-100k values for two districts.  The y axis is labelled to match
# the plotted field and the chart title instead of 'Cumulative Cases'; the
# `point` option was dropped because this is a bar mark.
(
    alt.Chart(
        df_cases_rolling.loc[
            df_cases_rolling['Landkreis'].isin(['LK Coesfeld', 'LK Celle'])
        ]
    )
    .mark_bar()
    .encode(
        x=alt.X('monthdate(Meldedatum):O', title='Date'),
        y=alt.Y('mean(Case_per_100k):Q', title='Cases per 100k'),
        color='Landkreis',
        tooltip=['Landkreis', 'Case_per_100k'],
    )
    .properties(
        width=800,
        height=400,
        title='Rolling number of cases per week per 100k inhabitant',
    )
)

# Bar plot cases by state

In [4]:
df_cases_pop['Meldedatum'].dtype

NameError: name 'df_cases_pop' is not defined

In [5]:
# Totals per Bundesland, sorted by case count (largest first), then reshaped to
# long format with one row per state and variable; the value lands in 'Cases'.
# NOTE(review): the recorded run failed with
# KeyError "['AnzahlTodesfall'] not in index", so df_cases apparently has no
# 'AnzahlTodesfall' column -- confirm which table holds the death counts.
# NOTE(review): `tmp` is reused for unrelated frames throughout the notebook.
tmp = df_cases.groupby('Bundesland').sum()[['AnzahlFall','AnzahlTodesfall']]\
        .sort_values(by='AnzahlFall', ascending=False).reset_index()
tmp = tmp.melt(id_vars='Bundesland', value_vars = ['AnzahlFall', 'AnzahlTodesfall'], value_name='Cases')
# State names in their sorted order.
list(tmp['Bundesland'].unique())

KeyError: "['AnzahlTodesfall'] not in index"

In [6]:
# Bars per Bundesland, coloured by variable; the x axis follows the order in
# which the states appear in tmp.
(
    alt.Chart(tmp)
    .mark_bar()
    .encode(
        alt.X(
            'Bundesland:O',
            title='Bundesland',
            sort=list(tmp['Bundesland'].unique()),
        ),
        alt.Y('Cases:Q', title='Cumulative Cases'),
        color='variable',
        tooltip=['Cases', 'variable'],
    )
    .properties(title='Number of Cases', width=800, height=400)
)

NameError: name 'tmp' is not defined