#  Update Data

In [1]:
# %load /media/sem/HDD/Home_Programming/Git/ads_covid-19-sem/src/data/get_data.py
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_johns_hopkins():
    ''' Update the local Johns Hopkins COVID-19 repository clone via `git pull`

        The repository has to be cloned beforehand; this function only pulls
        new commits into the existing working copy. The CSV files tracked in
        that repository are updated in place by git.

        The captured stderr and stdout of the git call are printed (as bytes
        representations); nothing is returned and a failing pull does not raise.
    '''
    repo_dir = os.path.dirname('/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/raw/COVID-19/')

    # Pass the command as an argument list so no intermediate shell is spawned.
    completed = subprocess.run(['/usr/bin/git', 'pull'],
                               cwd=repo_dir,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    print("Error : " + str(completed.stderr))
    print("out : " + str(completed.stdout))


def get_current_data_germany():
    ''' Download the current German county (Landkreis) data and store it as CSV

        The ArcGIS feature service is queried for all features; the
        'attributes' dictionary of every feature becomes one row of a
        pd.DataFrame, which is written as a semicolon-separated CSV file.
        The number of rows is printed.

        Attention: the API endpoint is not too stable.

        Raises:
        ----------
        requests.exceptions.RequestException
            if the request times out or the server answers with an HTTP error
    '''
    # Alternative endpoint with the 16 federal states instead of the counties:
    # https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json

    # 400 regions / Landkreise
    # A timeout keeps an unresponsive endpoint from blocking the script forever.
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                      timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later on.
    data.raise_for_status()

    json_object=json.loads(data.content)
    full_list=[each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/raw/NPGEO/GER_state_data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

# Script entry point: refresh both raw data sources when run directly.
if __name__ == '__main__':
    # Pull the latest commits of the Johns Hopkins repository clone.
    get_johns_hopkins()
    # Download the German county data and store it as CSV.
    get_current_data_germany()


Error : b''
out : b'Already up to date.\n'
 Number of regions rows: 412


#  Process Pipeline

In [None]:
# %load /media/sem/HDD/Home_Programming/Git/ads_covid-19-sem/src/data/process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Transform the Johns Hopkins confirmed-cases table into a relational data set

        Reads the global time-series CSV (date columns side by side), renames
        the region columns to 'country' and 'state', replaces missing states
        by 'no', drops the coordinates and reshapes the table to the long
        format with the columns date, state, country and confirmed.

        The result is written as a semicolon-separated CSV file and the
        number of stored rows is printed.
    '''

    source_csv='/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    target_csv='/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/processed/COVID_relational_confirmed.csv'

    raw_frame=pd.read_csv(source_csv)

    # harmonise the column names and give rows without a province a marker value
    base_frame=raw_frame.rename(columns={'Country/Region':'country',
                                         'Province/State':'state'})
    base_frame['state']=base_frame['state'].fillna('no')
    base_frame=base_frame.drop(columns=['Lat','Long'])

    # dates become the row index after transposing; stacking both column
    # levels then moves state and country into the rows as well
    by_date=base_frame.set_index(['state','country']).T
    long_frame=by_date.stack(level=[0,1]).reset_index()
    long_frame=long_frame.rename(columns={'level_0':'date',
                                          0:'confirmed'})

    long_frame['date']=long_frame['date'].astype('datetime64[ns]')

    long_frame.to_csv(target_csv,sep=';',index=False)
    print(' Number of rows stored: '+str(long_frame.shape[0]))

# Script entry point: rebuild the relational CSV when run directly.
if __name__ == '__main__':

    store_relational_JH_data()


#  Filter and Doubling Rate Calculation

In [None]:
# %load /media/sem/HDD/Home_Programming/Git/ads_covid-19-sem/src/features/build_features.py
import numpy as np
from sklearn import linear_model
# Shared estimator instance; get_doubling_time_via_regression refits it on every call.
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        A straight line is fitted through the three values at the positions
        -1, 0 and 1 with the module-level estimator `reg`; the result is the
        fitted intercept divided by the fitted slope.

        Parameters:
        ----------
        in_array : pandas.series
            exactly three consecutive observations

        Returns:
        ----------
        Doubling rate: double
            intercept/slope as a plain float; a slope of zero gives inf or nan
    '''

    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    # coef_ holds one entry per feature; unwrap the single slope so that a
    # scalar is returned instead of a one-element array (rolling.apply
    # expects its callback to produce a single number).
    return float(intercept/slope[0])


def savgol_filter(df_input,column='confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function (data structure kept)

        Missing values in the selected column are replaced by 0 before a
        Savitzky-Golay filter of polynomial degree 1 is applied.

        parameters:
        ----------
        df_input : pd.DataFrame
            frame (or group of a groupby) containing `column`
        column : str
            name of the column to be filtered
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input with the additional column `column`+'_filtered';
            the index of the df_input is preserved in result
    '''

    degree=1
    # Work on a copy: writing the new column into the caller's frame (often a
    # groupby slice) would mutate data that does not belong to this function.
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree)
    df_result[column+'_filtered']=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    ''' Rolling regression over three consecutive rows to approximate the doubling time

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            defines the used column
        Returns:
        ----------
        result: pd.Series
            one value per row of df_input[col]; rows without a complete
            window of three observations hold NaN
    '''
    window_length=3
    # only complete windows are evaluated, hence min_periods equals the window
    windows=df_input[col].rolling(window=window_length,
                                  min_periods=window_length)
    return windows.apply(get_doubling_time_via_regression,raw=False)




def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        The filter is applied separately for every (state, country) group.

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and `filter_on`, plus
            an 'index' column that is used as merge key
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column `filter_on`+'_filtered'
            on the input data frame

        NOTE(review): the merge key on the filtered side is the row label of
        df_input (turned into a column by reset_index), on the input side it
        is the 'index' column; both have to agree for the join to be correct
        -- confirm against the caller.
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    # group_keys=False keeps the original row labels as the only index level,
    # so reset_index() reliably yields the 'index' column used for the merge.
    # filter_on is forwarded so that the requested column is the one filtered.
    pd_filtered_result=(df_input[['state','country',filter_on]]
                        .groupby(['state','country'],group_keys=False)
                        .apply(savgol_filter,filter_on)
                        .reset_index())
    df_output=pd.merge(df_input,pd_filtered_result[['index',filter_on+'_filtered']],on=['index'],how='left')

    return df_output





def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        The rolling regression is applied separately for every
        (state, country) group.

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and `filter_on`, plus
            an 'index' column that is used as merge key
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column `filter_on`+'_DR' on
            the input data frame
    '''

    must_contain=set(['state','country',filter_on])
    # the message names this function (it used to point at calc_filtered_data)
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_doubling_rate not all columns in data frame'

    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()
    # after reset_index the two group keys come first; the third, unnamed
    # index level ('level_2') carries the row labels of df_input
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    df_output=pd.merge(df_input,pd_DR_result[['index',filter_on+'_DR']],on=['index'],how='left')
    return df_output


# Script entry point: small self-test followed by the full feature pipeline.
if __name__ == '__main__':
    # Quick check of the regression helper; the printed value is the
    # intercept/slope ratio it returns, although the label says 'slope'.
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))
    
    # Load the relational data set; the first column is parsed as dates.
    pd_JH_data=pd.read_csv('/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    # reset_index() adds the 'index' column that calc_filtered_data and
    # calc_doubling_rate use as their merge key.
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).reset_index().copy()

    # Add the filtered series, then the doubling rate of the raw and of the
    # filtered series.
    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')
    print(pd_result_larg.head())


# Dashboard Implementation 

In [None]:
# %load /media/sem/HDD/Home_Programming/Git/ads_covid-19-sem/src/visualization/dashboard.py
import pandas as pd
import numpy as np
import os
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
from datetime import datetime
from scipy import optimize
from scipy import integrate
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Both processed data sets are read once at import time; the files are
# semicolon-separated.
df_input_large = pd.read_csv('/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/processed/COVID_final_set.csv',
                             sep=';')
df_analyse = pd.read_csv(
    '/mnt/368AE7F88AE7B313/Files_Programming/Git/ads_covid-19-sem/data/processed/COVID_small_flat_table.csv', sep=';')

# Colour scheme used for the plot background, paper background and font.
colors = {'background': '#111111', 'text': '#7FDBFF'}

# SIR parameters; N0 is read as a global by SIR_model and SIR_model_t.
N0 = 1000000  # max susceptible population
beta = 0.4  # infection spread dynamics
gamma = 0.1  # recovery rate


def SIR_model(SIR, beta, gamma):
    ''' Simple SIR model: changes of the three compartments for one step

        SIR: sequence of three values (S, I, R)
            S: susceptible population
            I: infected people
            R: recovered people
        beta: infection rate
        gamma: recovery rate

        The population size is taken from the module-level N0.

        The changes sum up to 0 (dS+dI+dR=0), so S+I+R stays constant.

        Returns the list [dS_dt, dI_dt, dR_dt].
    '''

    susceptible, infected, _recovered = SIR
    # flow from susceptible to infected
    new_infections = beta * susceptible * infected / N0
    # flow from infected to recovered
    new_recoveries = gamma * infected
    return [-new_infections, new_infections - new_recoveries, new_recoveries]


def SIR_model_t(SIR, t, beta, gamma):
    ''' Simple SIR model with the signature expected by integrate.odeint

        SIR: sequence of three values (S, I, R)
            S: susceptible population
            I: infected people
            R: recovered people
        t: time step, mandatory for integrate.odeint (not used in the formulas)
        beta: infection rate
        gamma: recovery rate

        The population size is taken from the module-level N0.

        The changes sum up to 0 (dS+dI+dR=0), so S+I+R stays constant.

        Returns the tuple (dS_dt, dI_dt, dR_dt).
    '''

    susceptible, infected, _recovered = SIR
    # flow from susceptible to infected
    new_infections = beta * susceptible * infected / N0
    # flow from infected to recovered
    new_recoveries = gamma * infected
    return -new_infections, new_infections - new_recoveries, new_recoveries


def fit_odeint(x, beta, gamma):
    '''
    helper function for the integration, used as model function for curve_fit

    x: time points at which the solution is evaluated
    beta: infection rate
    gamma: recovery rate

    The initial conditions are taken from the module-level S0, I0 and R0.

    Returns the infected compartment (column 1 of the solution) at the
    time points x.
    '''
    # Integrate over the time points that were passed in instead of the
    # global t, so the function honours its own argument.
    return integrate.odeint(SIR_model_t, (S0, I0, R0), x, args=(beta, gamma))[:, 1]


# Series for Germany from row 35 of the flat table onwards, with a matching
# time axis 0..len-1.
ydata = np.array(df_analyse.Germany[35:])
t = np.arange(len(ydata))

# Initial conditions of the SIR integration; fit_odeint reads S0, I0 and R0
# as globals.
I0 = ydata[0]
S0 = N0 - I0
R0 = 0

# Empty figure shown until the first callback has delivered a real one.
fig = go.Figure()

app = dash.Dash()

# First tab: country selection, choice of the plotted metric and a short
# explanation of the methods. The ids of both drop-downs are inputs of the
# update_figure callback.
tab_1 = dcc.Tab(label='Analysis of Rate of infection', value='tab_1', children=[

    # one option per distinct country of the large data set
    dcc.Dropdown(
        id='country_drop_down',
        options=[{'label': each, 'value': each} for each in df_input_large['country'].unique()],
        value=['US', 'Germany', 'Italy'],  # which are pre-selected
        multi=True
    ),
    # the option values are used as column names of df_input_large
    dcc.Dropdown(
        id='doubling_time',
        options=[
            {'label': 'Timeline Confirmed ', 'value': 'confirmed'},
            {'label': 'Timeline Confirmed Filtered', 'value': 'confirmed_filtered'},
            {'label': 'Timeline Doubling Rate', 'value': 'doubling_rate'},
            {'label': 'Timeline Doubling Rate Filtered', 'value': 'doubling_rate_filtered'},
        ],
        value='confirmed',
        multi=False
    ),
    dcc.Markdown('''
            Regarding the filtration of data and doubling rate calculation, the following techniques are used.
                * The savgol signal filtration was used to filter the data mainly to smoothen reporting delays and 
                 human errors in reporting.A window  size of five data points was used.
                *  The doubling rate was calculated via rolling regression with a window size of 3 days back. 
    ''')
]
                )

# Second tab: only explanatory text; the SIR plots themselves are drawn by
# update_figure into the graph shared by both tabs.
tab_2 = dcc.Tab(label='SIR Model Demonstration For Germany', value='tab_2',children=[
            dcc.Markdown('''
                * For the static case we have that:
                    * beta is approximately 0.35424
                    * gamma is approximately 0.1604
                * For the dynamic case:
                    * gamma is held constant at 0.1
                    * with beta allowed to have a maximum value of 0.4 and a minimum of 0.11            
            ''')
])

# Page layout: headline, the two tabs (tab_1 selected initially) and one
# graph below them whose figure is replaced by the update_figure callback.
app.layout = html.Div(
    [html.Center(html.H1('Covid19 Data Analysis')), dcc.Tabs(id='my_tabs', value='tab_1', children=[tab_1, tab_2]),
     html.Div(html.Center([dcc.Graph(figure=fig, id='main_window_slope')]))])


def _build_timeline_figure(country_list, show_doubling):
    ''' Build the per-country timeline chart of the first tab

        country_list: list of country names to plot (may be empty or None)
        show_doubling: name of the column of df_input_large to plot
            (None when the drop-down was cleared)

        Returns a dict with the keys 'data' (one scatter trace per country)
        and 'layout'. Without a selected metric or country the chart is empty.
    '''
    # A cleared drop-down delivers None (or an empty list): plot nothing then.
    countries = country_list if (country_list and show_doubling) else []
    is_doubling_rate = bool(show_doubling) and 'doubling_rate' in show_doubling

    if is_doubling_rate:
        my_yaxis = {'type': "log",
                    'title': 'Approximated doubling rate over 3 days (larger numbers are better)'}
    else:
        my_yaxis = {'type': "log",
                    'title': 'Confirmed infected people (source johns hopkins csse, log-scale)'
                    }

    traces = []
    for each in countries:
        df_country = df_input_large[df_input_large['country'] == each]

        # Aggregate only the plotted column over all states of the country;
        # the text column 'state' is left out on purpose because it cannot be
        # averaged.
        per_date = df_country.groupby(['country', 'date'])[show_doubling]
        if is_doubling_rate:
            # doubling rates (raw and filtered) are averaged over the states,
            # adding them up would not give a doubling rate
            df_plot = per_date.mean().reset_index()
        else:
            # case counts are added up over the states
            df_plot = per_date.sum().reset_index()

        traces.append(go.Scatter(x=df_plot['date'],
                                 y=df_plot[show_doubling],
                                 mode='markers+lines',
                                 opacity=0.9,
                                 name=each)
                      )

    # The layout does not depend on the countries, so it is built once and
    # exists even when no country is selected.
    layout = go.Layout(
        width=1280,
        height=720,
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        font={'color': colors['text']},
        xaxis={'title': 'Timeline',
               'tickangle': -45,
               'nticks': 20,
               'tickfont': dict(size=14, color="#7f7f7f"),
               },
        yaxis=my_yaxis
    )

    return dict(data=traces, layout=layout)


def _build_sir_figure():
    ''' Build the two SIR plots of the second tab

        Upper plot: series for Germany together with the SIR curve fitted by
        curve_fit (constant beta and gamma).
        Lower plot: step-wise SIR simulation with constant gamma and a beta
        that is lowered and raised again, on top of the series as bars.

        Returns a plotly figure with two stacked subplots.
    '''
    # Same values as the module-level ydata/t/S0/I0/R0 that fit_odeint reads.
    ydata = np.array(df_analyse.Germany[35:])
    t = np.arange(len(ydata))
    I0 = ydata[0]
    S0 = N0 - I0
    R0 = 0

    # static case: fit beta and gamma to the series
    popt, _ = optimize.curve_fit(fit_odeint, t, ydata)
    fitted = fit_odeint(t, *popt)

    # dynamic case: length of the four phases in days
    t_initial = 28
    t_intro_measures = 14
    t_hold = 21
    t_relax = 21

    beta_max = 0.4
    beta_min = 0.11
    gamma = 0.1
    # beta per day: constant high, linear decrease, constant low, linear increase
    pd_beta = np.concatenate((np.array(t_initial * [beta_max]),
                              np.linspace(beta_max, beta_min, t_intro_measures),
                              np.array(t_hold * [beta_min]),
                              np.linspace(beta_min, beta_max, t_relax),
                              ))

    # Collect the simulated states in a list and build the frame once;
    # DataFrame.append is no longer available in current pandas versions.
    SIR = np.array([S0, I0, R0])
    simulated_rows = []
    for each_beta in pd_beta:
        SIR = SIR + SIR_model(SIR, each_beta, gamma)
        simulated_rows.append({'susceptible': SIR[0],
                               'infected': SIR[1],
                               'recovered': SIR[2]})
    propagation_rates = pd.DataFrame(simulated_rows,
                                     columns=['susceptible', 'infected', 'recovered'])

    # both subplots span the full width of the 2x2 grid
    fig = make_subplots(rows=2, cols=2, specs=[[{"colspan": 2}, None], [{"colspan": 2}, None]], subplot_titles=(
        "Fit of SIR model for Germany cases with fixed beta and gamma",
        'Szenario SIR simulations with fixed gamma and dynamic beta')
                        )
    trace11 = go.Scatter(x=t, y=ydata, mode='markers',name = 'True infected number')
    trace22 = go.Scatter(x=t, y=fitted, mode='lines',name='fitted infected number')
    trace111 = go.Scatter(x=propagation_rates.index, y=propagation_rates.infected, name='simlated infected', mode='lines',
                          line=dict(width=5))
    trace222 = go.Bar(x=np.arange(len(ydata)), y=ydata, name='current infected germany')

    fig.add_trace(trace11, row=1, col=1)
    fig.add_trace(trace22, row=1, col=1)
    fig.add_trace(trace111, row=2, col=1)
    fig.add_trace(trace222, row=2, col=1)

    fig.update_yaxes(type='log', row=1, col=1,title_text='population infected')
    fig.update_yaxes(type='log', row=2, col=1,title_text ='population infected')

    fig.update_xaxes(row=1,col=1,title_text = 'time in days')
    fig.update_xaxes(row=2,col=1,title_text = 'time in days')

    fig.update_layout(plot_bgcolor=colors['background'],
                      paper_bgcolor=colors['background'],
                      font={'color': colors['text']})

    return fig


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('my_tabs', 'value'),
     Input('country_drop_down', 'value'),
     Input('doubling_time', 'value')])
def update_figure(tab, country_list, show_doubling):
    ''' Dash callback: redraw the main graph when the tab or a drop-down changes

        tab: value of the selected tab
        country_list: selected countries of the first tab
        show_doubling: selected column of the first tab

        'tab_1' shows the per-country timeline, every other tab value the
        SIR plots for Germany.
    '''
    if tab == 'tab_1':
        return _build_timeline_figure(country_list, show_doubling)

    return _build_sir_figure()

# Script entry point: start the Dash server in debug mode with the
# auto-reloader switched off.
if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)

Running on http://127.0.0.1:8050/
Debugger PIN: 713-910-554
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on



Excess work done on this call (perhaps wrong Dfun type). Run with full_output = 1 to get quantitative information.

