## One run full walkthrough

* Do the full walkthrough on the large data set
* Refactor the source code and move it into individual scripts
* Ensure a full run with one click

## 1 Update all data

In [1]:
# %load C:/Users/jitin/ads_covid-19/src/data/get_data_for_SIR.py

import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime
from bs4 import BeautifulSoup

import requests
import json

def get_johns_hopkins():
    ''' Refresh the local Johns Hopkins COVID-19 repository clone via ``git pull``.

        The repository has to be cloned beforehand into data/raw/COVID-19;
        this function only pulls the latest commits into that clone.
        The captured stderr and stdout of the git call are printed (as bytes).
    '''
    # The argument list is executed directly, without shell=True: on POSIX a
    # list combined with shell=True passes only "git" to the shell and the
    # "pull" argument is silently dropped.
    git_pull = subprocess.Popen( ["git", "pull"],
                         cwd = os.path.dirname( 'C:/Users/jitin/ads_covid-19/data/raw/COVID-19/' ),
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE )
    # communicate() waits for git to finish and returns both streams.
    (out, error) = git_pull.communicate()


    print("Error : " + str(error))
    print("out : " + str(out))


def get_current_data_germany():
    ''' Download the current German per-region (Landkreis) data from the RKI ArcGIS service.

        The 'attributes' dict of every returned feature becomes one row of a
        pd.DataFrame, which is written as a ';'-separated CSV to
        data/raw/NPGEO/GER_State_Data.csv. The number of rows is printed.

        Attention: the API endpoint is not too stable. The request therefore
        uses a timeout and raises requests.HTTPError on an HTTP error status
        instead of trying to parse an error page.
    '''
    # 16 states
    #data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

    # 400 regions / Landkreise
    data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
                      timeout=60)  # do not hang forever on an unresponsive endpoint
    data.raise_for_status()

    json_object=json.loads(data.content)
    # One record per feature; only the attribute payload is kept.
    full_list=[each_dict['attributes'] for each_dict in json_object['features']]

    pd_full_list=pd.DataFrame(full_list)
    pd_full_list.to_csv('C:/Users/jitin/ads_covid-19/data/raw/NPGEO/GER_State_Data.csv',sep=';')
    print(' Number of regions rows: '+str(pd_full_list.shape[0]))

def get_world_population_data():
    ''' Scrape the population-by-country table from worldometers.info.

        The first <table> of the page is read row by row; table columns 1 and 2
        are kept as 'country' and 'population'. Country names are mapped to the
        spelling used in the Johns Hopkins data set, a few entries missing from
        the scraped table are added by hand, and the result is sorted
        alphabetically by country and written as a ';'-separated CSV to
        data/raw/world_population_data.csv. The number of rows is printed.

        Raises requests.HTTPError when the page request returns an error status.
    '''
    page = requests.get("https://www.worldometers.info/world-population/population-by-country/")
    # Stop on an HTTP error instead of failing later on a missing table.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    html_table_pop = soup.find('table')
    all_rows_pop = html_table_pop.find_all('tr')
    final_pop_data_list = [[each_col.get_text(strip=True) for each_col in rows.find_all('td')]
                           for rows in all_rows_pop]
    # Rows without any <td> cell (presumably the header row) become all-NaN
    # rows in the frame and are removed by dropna().
    reqd_pop_list = pd.DataFrame(final_pop_data_list).dropna()\
                    .rename(columns={1:'country', 2:'population'})
    # copy() so the column assignment below works on an independent frame
    # and not on a slice of the scraped table.
    reqd_pop_list = reqd_pop_list[['country','population']].copy()
    # Replace country names that differ from the Johns Hopkins naming
    reqd_pop_list["country"]= reqd_pop_list["country"].replace({'Myanmar':'Burma', 'Czech Republic (Czechia)': 'Czechia', 'DR Congo': 'Congo (Kinshasa)', 'Congo': 'Congo (Brazzaville)', 'South Korea': 'Korea, South', 'St. Vincent & Grenadines': 'Saint Vincent and the Grenadines', 'Taiwan': 'Taiwan*', 'United States': 'US','State of Palestine': 'West Bank and Gaza', 'Côte d\'Ivoire': 'Cote d\'Ivoire'})
    # Entries that are added by hand (e.g. Diamond Princess, Kosovo)
    new_country_rows = pd.DataFrame([['Diamond Princess', 3711],
                                     ['Kosovo', 1845000],
                                     ['MS Zaandam', 1432],
                                     ['Saint Kitts and Nevis', 52441],
                                     ['Sao Tome and Principe', 211028]],
                                    columns = reqd_pop_list.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Afterwards the rows are re-ordered from the scraped order to alphabetical order.
    reqd_pop_list = pd.concat([reqd_pop_list, new_country_rows], ignore_index=True)\
                                .sort_values('country')\
                                .reset_index(drop=True)
    reqd_pop_list.to_csv('C:/Users/jitin/ads_covid-19/data/raw/world_population_data.csv',sep=';',index=False)
    print('Number of rows: '+str(reqd_pop_list.shape[0]))

if __name__ == '__main__':
    # Refresh every raw data source, one after the other.
    for update_step in (get_johns_hopkins,
                        get_current_data_germany,
                        get_world_population_data):
        update_step()


Error : b''
out : b'Already up to date.\n'
 Number of regions rows: 412
Number of rows: 240


## 2 Process pipeline

In [2]:
# %load C:/Users/jitin/ads_covid-19/src/data/process_JH_data_SIR.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data(data_path='C:/Users/jitin/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                             out_path='C:/Users/jitin/ads_covid-19/data/processed//COVID_relational_confirmed.csv'):
    ''' Transform the wide Johns Hopkins confirmed-cases table into a relational data set.

        The raw CSV has one row per (Province/State, Country/Region) and one
        column per date. The result has one row per date/state/country with
        the columns 'date', 'state', 'country' and 'confirmed'.

        Parameters
        ----------
        data_path : str
            Raw time-series CSV to read; defaults to the project location.
        out_path : str
            Destination of the ';'-separated relational CSV; defaults to the
            project location.
    '''

    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'country',
                      'Province/State':'state'})

    # Rows without a province get the placeholder 'no' so that the state can
    # be used as an index level below.
    pd_data_base['state']=pd_data_base['state'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    # Dates move into the row index (transpose), then both column levels
    # (state, country) are stacked into rows.
    pd_relational_model=pd_data_base.set_index(['state','country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'confirmed'},
                                                  )

    # Column headers look like 1/22/20; parse them with the explicit
    # month/day/year format instead of relying on format inference.
    pd_relational_model['date']=pd.to_datetime(pd_relational_model['date'], format='%m/%d/%y')

    pd_relational_model.to_csv(out_path,sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))

def prepare_all_country_data(data_path='C:/Users/jitin/ads_covid-19/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
                             out_path='C:/Users/jitin/ads_covid-19/data/processed/all_country_data.csv'):
    ''' Build one confirmed-cases time series per country and store it as CSV.

        All provinces/states of a country are summed up. The stored table has
        a 'date' column followed by one column per country, in the order in
        which the countries first appear in the raw file.

        Parameters
        ----------
        data_path : str
            Raw Johns Hopkins time-series CSV; defaults to the project location.
        out_path : str
            Destination of the ';'-separated CSV; defaults to the project location.
    '''
    df_raw_data = pd.read_csv(data_path)
    df_raw_data = df_raw_data.drop(['Lat','Long'],axis=1)
    df_raw_data = df_raw_data.rename(columns={'Country/Region':'country',
                                   'Province/State':'state'})

    # One grouped sum replaces the per-country filter loop; sort=False keeps
    # the countries in first-appearance order, like unique() did.
    country_totals = df_raw_data.drop(['state'],axis=1) \
                                .groupby('country', sort=False) \
                                .sum()

    # Transpose so that every date is a row and every country a column.
    df_analyse = country_totals.T.rename_axis(columns=None)

    # Column headers look like 1/22/20; convert them to datetime objects so
    # the CSV stores dates as YYYY-MM-DD.
    time_idx=[datetime.strptime(each,"%m/%d/%y") for each in df_analyse.index]
    df_analyse.insert(0, 'date', time_idx)
    df_analyse = df_analyse.reset_index(drop=True)
    df_analyse.to_csv(out_path,sep=';',index=False)

    print(' Number of rows stored in all country data: '+str(df_analyse.shape[0]))

if __name__ == '__main__':
    # Run both processing steps: first the relational table, then the
    # per-country wide table.
    for processing_step in (store_relational_JH_data, prepare_all_country_data):
        processing_step()


 Number of rows stored: 62776
 Number of rows stored in all country data: 236


## 3 SIR Model 

In [3]:
# %load C:/Users/jitin/ads_covid-19/src/features/build_features_SIR.py

import pandas as pd
import numpy as np

from datetime import datetime

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns


sns.set(style="darkgrid")

mpl.rcParams['figure.figsize'] = (16, 9)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

from scipy import optimize
from scipy import integrate


def data_gathering():
    ''' Fit the SIR model to the confirmed cases of every country and store the results.

        Reads the population table (data/raw/world_population_data.csv) and the
        per-country time series (data/processed/all_country_data.csv) and writes
        three ';'-separated CSV files into data/processed/SIR/:

        * SIR_data.csv           - the time series from row 75 onwards (incl. 'date')
        * optimized_SIR_data.csv - fitted beta, gamma and their standard deviations
                                   per country (written without row labels; the row
                                   order is opt_beta, opt_gamma, std_dev_error_beta,
                                   std_dev_error_gamma)
        * fitted_SIR_data.csv    - the fitted infected curve per country

        The start values I0, N0, S0 and R0 are published as module-level
        globals because fit_odeint reads them from there.
    '''
    # Row offset at which the fitted time window starts (value kept from the
    # original implementation).
    start_row = 75

    population_df = pd.read_csv('C:/Users/jitin/ads_covid-19/data/raw/world_population_data.csv',sep=';', thousands=',')
    # After the transpose every country is a column and 'population' the only row.
    population_df = population_df.set_index(['country']).T
    df_analyse = pd.read_csv('C:/Users/jitin/ads_covid-19/data/processed/all_country_data.csv',sep=';')

    # Same rows for every column (including 'date'), re-indexed from 0 so that
    # .loc[0] below addresses the first row of the window.
    infected_df = df_analyse.iloc[start_row:].reset_index(drop=True)
    infected_df.to_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/SIR_data.csv',sep=';',index=False)
    print('Number of rows in Infected df '+str(infected_df.shape[0]))

    optimized_df = pd.DataFrame(columns = df_analyse.columns[1:],
                     index = ['opt_beta', 'opt_gamma', 'std_dev_error_beta', 'std_dev_error_gamma'])

    fitted_final_data = []

    global I0, N0, S0, R0
    # The first column is 'date'; every other column is one country.
    for column in infected_df.columns[1:]:
        I0 = infected_df[column].loc[0]
        N0 = population_df[column].loc['population']
        S0 = N0-I0
        R0 = 0
        t  = np.arange(len(infected_df[column]))

        # NOTE(review): the original code defined an initial guess [0.4, 0.1]
        # and evaluated fit_odeint with it, but never passed it to curve_fit.
        # The fit therefore starts from curve_fit's default values, which is
        # kept here so the stored results do not change; consider passing
        # p0 explicitly after checking convergence for all countries.
        popt, pcov = optimize.curve_fit(fit_odeint, t, infected_df[column], maxfev=5000)
        # Standard deviations of the parameters: square root of the covariance diagonal.
        perr = np.sqrt(np.diag(pcov))

        optimized_df.at['opt_beta', column] = popt[0]
        optimized_df.at['opt_gamma', column] = popt[1]
        optimized_df.at['std_dev_error_beta', column] = perr[0]
        optimized_df.at['std_dev_error_gamma', column] = perr[1]

        fitted = fit_odeint(t, *popt)
        fitted_final_data.append(np.array(fitted))

    optimized_df.to_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/optimized_SIR_data.csv',sep=';',index=False)
    # One list entry per country -> transpose so that every country is a column.
    fitted_SIR_data_df = pd.DataFrame(fitted_final_data,index=df_analyse.columns[1:]).T
    fitted_SIR_data_df.to_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/fitted_SIR_data.csv',sep=';',index=False)
    print(' Number of rows stored in optimized df: '+str(optimized_df.shape[0]))
    print(' Number of rows stored in fitted SIR data: '+str(fitted_SIR_data_df.shape[0]))


def SIR_model_t(SIRN,t,beta,gamma):
    ''' Right-hand side of the simple SIR model in the form integrate.odeint expects.

        SIRN:  state as a 4-tuple (S, I, R, N)
               S: susceptible population
               I: infected people
               R: recovered people
               N: population size, carried along as a constant state entry
        t:     time step, mandatory for integrate.odeint (not used in the equations)
        beta:  factor of the infection term beta*S*I/N
        gamma: factor of the recovery term gamma*I

        Returns the derivatives (dS/dt, dI/dt, dR/dt, dN/dt).
        The changes of S, I and R sum up to 0 and dN/dt is always 0,
        so S+I+R stays at its start value.
    '''

    susceptible, infected, _recovered, population = SIRN
    # Flow S -> I and flow I -> R; each appears once with each sign below.
    new_infections = beta*susceptible*infected/population
    new_recoveries = gamma*infected
    return -new_infections, new_infections-new_recoveries, new_recoveries, 0


def fit_odeint(t, beta, gamma):

    '''
    Helper for the curve fit: integrate SIR_model_t over the time points t
    with the given beta and gamma and return the infected compartment.

    The start state is read from the module-level globals S0, I0, R0 and N0,
    which therefore have to be set before this function is called.
    '''
    start_state = (S0, I0, R0, N0)
    trajectory = integrate.odeint(SIR_model_t, start_state, t, args=(beta, gamma))
    # Column order follows the state tuple (S, I, R, N); column 1 is I.
    return trajectory[:,1]









if __name__ == '__main__':
    # Run the SIR fitting stage. The commented-out doubling-rate pipeline that
    # used to sit here called helpers (calc_filtered_data, calc_doubling_rate,
    # get_doubling_time_via_regression) that are not defined in this script.
    data_gathering()


Number of rows in Infected df 161


  dI_dt=beta*S*I/N-gamma*I
  dR_dt=gamma*I


 Number of rows stored in optimized df: 4
 Number of rows stored in fitted SIR data: 161


## 4 Visual Board

In [None]:
# %load C:/Users/jitin/ads_covid-19/src/visualization/visualize_SIR.py
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())

# Initial reading of required data (each file is read exactly once)
fitted_final_data_dash_df = pd.read_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/fitted_SIR_data.csv',sep=';')
# Independent copy of the fitted curves, kept under its original name.
yd = fitted_final_data_dash_df.copy()

optimized_dash_df = pd.read_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/optimized_SIR_data.csv',sep=';')
ydata_dash_df = pd.read_csv('C:/Users/jitin/ads_covid-19/data/processed/SIR/SIR_data.csv',sep=';')



# Empty figure shown until the callback delivers the first real one.
fig = go.Figure()

app = dash.Dash()

# Page header text.
intro_markdown = dcc.Markdown('''
    #  Applied Data Science on COVID-19 data
    Goal of the project is to learn data science by applying a cross industry standard process,
    it covers the full walkthrough of: automated data gathering, data transformations,
    filtering and machine learning to approximating the doubling time, and
    (static) deployment of responsive dashboard.
    ''')

country_heading = dcc.Markdown('''
    ## Multi-Select Country for visualization
    ''')

# Iterating the fitted frame yields its column labels: one option per column.
country_options = [{'label': each,'value':each} for each in fitted_final_data_dash_df]
country_selector = dcc.Dropdown(
    id='country_drop_down',
    options=country_options,
    value=['US', 'Germany','Italy'], # which are pre-selected
    multi=True
)

curve_heading = dcc.Markdown('''
        ## Select SIR Model and/or fitted SIR Model
        ''')

# Single-choice selector; it currently offers exactly one entry.
curve_selector = dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'SIR curve and fitted SIR curve ', 'value': 'SIR_value'},
    ],
    value='SIR_value',
    multi=False
)

main_graph = dcc.Graph(figure=fig, id='main_window_slope')

# Components appear on the page in the order of this list.
app.layout = html.Div([
    intro_markdown,
    country_heading,
    country_selector,
    curve_heading,
    curve_selector,
    main_graph,
])


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(full_country_list,show_doubling):
    ''' Build the figure for the selected countries.

        For every selected country two traces are drawn over the date axis:
        the stored case numbers (named after the country) and the fitted SIR
        curve (named '<country>_fitted').

        full_country_list: value of the country drop-down (list of column names)
        show_doubling:     value of the second drop-down; accepted because it is
                           wired as a callback input, but not used for plotting

        Returns a figure dict with the keys 'data' and 'layout'.
    '''

    traces = []
    # An empty selection may arrive as None instead of []; treat both as
    # "no country selected" rather than failing on iteration.
    for each in full_country_list or []:
        curves = ((ydata_dash_df[each], each),
                  (fitted_final_data_dash_df[each], each+'_fitted'))
        for y_values, trace_name in curves:
            traces.append(dict(x=ydata_dash_df.date,
                                y=y_values,
                                mode='markers+lines',
                                opacity=0.9,
                                # Nested dicts instead of line_width/marker_size:
                                # the underscore shorthand belongs to the plotly
                                # Python graph objects and is not a valid
                                # attribute name inside a plain dict figure.
                                line={'width': 2},
                                marker={'size': 4},
                                name=trace_name
                        )
                )


    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                # The former 'range' entry was the string '[1.1,5.5]', which is
                # not a valid axis range (a two-element list is expected), so
                # the axis was auto-ranged. The entry is dropped to keep that
                # auto-ranging explicit instead of relying on an invalid value.
                yaxis={'type': 'log',
                        'title':'Population infected'
                        }
        )
    }

if __name__ == '__main__':
    # Start the Dash development server; the reloader is switched off.
    server_options = dict(debug=True, port=8051, use_reloader=False)
    app.run_server(**server_options)


C:\Users\jitin\ads_covid-19\notebooks
Dash is running on http://127.0.0.1:8051/

 in production, use a production WSGI server like gunicorn instead.

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
