# Walkthrough

In [1]:
import os
# NOTE(review): machine-specific absolute path. The relative 'data/...' paths used
# by the cells below are resolved against this working directory.
os.chdir('/Users/rae616188919/covid-19-analysis/')

## 1 Update all data

In [3]:
import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

from pandas.io.json import json_normalize

def get_johns_hopkins():
    """Pull the latest commits into the local Johns Hopkins data checkout.

    Runs ``/usr/bin/git pull`` through the shell with ``data/raw/COVID-19``
    as working directory, waits for it to finish, and prints the captured
    stderr and stdout (as ``bytes`` representations).
    """
    repo_dir = os.path.dirname('data/raw/COVID-19/')
    pull_proc = subprocess.Popen("/usr/bin/git pull",
                                 cwd=repo_dir,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
    out, error = pull_proc.communicate()

    # Report stderr first, then stdout.
    for label, payload in (("Error : ", error), ("out : ", out)):
        print(label + str(payload))

def get_current_data_China(timeout=30):
    ''' Get current data from China

        Downloads the JSON payload from the 163.com endpoint, flattens the
        selected ``areaTree`` entry's children into a table and prints the
        number of rows.

        Parameters:
        ----------
        timeout: float
            seconds to wait for the HTTP response before giving up
            (without a timeout the request could block forever)

        Returns:
        ----------
        sf: pd.DataFrame
            one row per region with the columns 'name', 'total.confirm',
            'total.dead', 'total.heal', 'today.storeConfirm', 'lastUpdateTime'

        Raises:
        ----------
        requests.HTTPError
            if the server answers with an error status code
    '''
    url='https://c.m.163.com/ug/api/wuhan/app/data/list-total'
    headers= {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:76.0) Gecko/20100101 Firefox/76.0'}
    ret=requests.get(url,headers=headers,timeout=timeout)
    ret.raise_for_status()  # fail loudly instead of parsing an error page
    result=json.loads(ret.content)
    # NOTE(review): position 2 of 'areaTree' is presumably the entry for China;
    # this depends on the ordering of the API response - TODO confirm.
    t=result['data']['areaTree'][2]['children']
    sf=json_normalize(t)
    sf=sf[['name','total.confirm','total.dead','total.heal','today.storeConfirm','lastUpdateTime']]
    print('Number of regions rows:%i' %len(sf))
    # Hand the table back to the caller instead of discarding it.
    return sf
    
    
if __name__ == '__main__':
    # Refresh the local Johns Hopkins checkout, then query the current figures for China.
    get_johns_hopkins()
    get_current_data_China()

Error : b''
out : b'Already up to date.\n'
Number of regions rows:34


## 2 Process pipeline

In [4]:
import pandas as pd
import numpy as np

from datetime import datetime

def store_relational_JH_data():
    """Convert the wide Johns Hopkins confirmed-cases table to a long table.

    Reads the raw global time-series CSV, renames the region columns to
    'state' and 'country', fills missing states with 'no', drops the
    coordinates, and reshapes the per-date columns into rows with the
    columns date / state / country / confirmed. The result is written to
    'data/processed/COVID_relational_confirmed.csv' (';'-separated), and the
    row count and the latest date are printed.
    """
    source_csv='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    target_csv='data/processed/COVID_relational_confirmed.csv'

    wide=pd.read_csv(source_csv).rename(columns={'Country/Region':'country',
                                                 'Province/State':'state'})
    wide['state']=wide['state'].fillna('no')
    wide=wide.drop(['Lat','Long'],axis=1)

    # Transpose so dates become the index, then stack both column levels
    # (state, country) to get one row per date and region.
    long_table=(wide.set_index(['state','country'])
                    .T
                    .stack(level=[0,1])
                    .reset_index()
                    .rename(columns={'level_0':'date',0:'confirmed'}))
    long_table['date']=long_table.date.astype('datetime64[ns]')
    long_table.confirmed=long_table.confirmed.astype(int)
    long_table.to_csv(target_csv,sep=';',index=False)

    print(' Number of rows stored: '+str(long_table.shape[0]))
    print(' Latest date is: '+str(max(long_table.date)))

if __name__ == '__main__':
    # Rebuild the long-format confirmed-cases CSV from the raw Johns Hopkins time series.

    store_relational_JH_data()

 Number of rows stored: 33782
 Latest date is: 2020-05-27 00:00:00


## 3 Filter and Doubling Rate Calculation

In [9]:
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal

def get_doubling_time_via_regression(in_array):
    """Approximate the doubling time from three consecutive values.

    A straight line is fitted with the module-level regressor ``reg``
    through the three values placed at x = -1, 0, 1; the ratio
    intercept / slope is returned.

    Parameters:
    ----------
    in_array: array-like of length 3

    Returns:
    ----------
    intercept divided by the fitted coefficient array
    (the notebook's test call prints it as a one-element array)

    Raises:
    ----------
    AssertionError
        if in_array does not hold exactly three values
    """
    targets = np.array(in_array)
    support = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(support,targets)

    return reg.intercept_/reg.coef_

def savgol_filter(df_input,column='confirmed',window=5,degree=1):
    ''' Apply a Savitzky-Golay filter to one column of a data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
        column: str
            name of the column to be filtered
        window: int
            window size used for filtering (passed on to scipy)
        degree: int
            polynomial order used for filtering (passed on to scipy)

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of df_input with the additional column <column>_filtered;
            df_input itself is left untouched
    '''
    # Work on a copy: the original aliased df_input and therefore silently
    # added the new column to the caller's frame (or group) as well.
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree) # polynomial order
    df_result[str(column+'_filtered')]=result
    return df_result

def rolling_reg(df_input,col='confirmed'):
    """Rolling doubling-time estimate over one column of a data frame.

    The input has to be a data frame; the return value is a single series
    (mandatory for group by apply). Every window of three consecutive rows
    is passed to get_doubling_time_via_regression; windows with fewer than
    three rows produce no value.
    """
    days_back=3
    windows=df_input[col].rolling(window=days_back,
                                  min_periods=days_back)
    return windows.apply(get_doubling_time_via_regression,raw=False)

def calc_filtered_data(df_input,filter_on='confirmed'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and filter_on
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column <filter_on>_filtered
            on a copy of the input data frame

        Raises:
        ----------
        AssertionError
            if one of the required columns is missing
    '''

    must_contain=set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    # group_keys=False keeps the original row labels on the result, so the
    # index-based merge below still lines up on pandas versions that would
    # otherwise prepend the group keys to the index.
    # column=filter_on forwards the requested column; without it the filter
    # always looked for 'confirmed' and failed for any other filter_on.
    pd_filtered_result=(df_output[['state','country',filter_on]]
                        .groupby(['state','country'],group_keys=False)
                        .apply(savgol_filter,column=filter_on))

    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')
    return df_output


def calc_doubling_rate(df_input,filter_on='confirmed'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'state', 'country' and filter_on
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column <filter_on>_DR
            on the input data frame

        Raises:
        ----------
        AssertionError
            if one of the required columns is missing
    '''

    must_contain=set(['state','country',filter_on])
    # The message used to name calc_filtered_data (copy-paste), which pointed
    # to the wrong function when this check failed.
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'

    # One rolling regression per (state, country) group; reset_index turns the
    # group keys and the row labels into ordinary columns.
    pd_DR_result= df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()

    # NOTE(review): 'level_2' is the name reset_index gives to the third index
    # level; this assumes the input frame's index is unnamed - TODO confirm.
    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])

    return df_output


if __name__ == '__main__':
    # Quick sanity check of the regression helper on a perfectly linear series.
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    # Load the long table, order it by date, then add the filtered column and
    # the doubling rates of both the raw and the filtered series.
    pd_JH_data=pd.read_csv('data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_JH_data=pd_JH_data.sort_values('date',ascending=True).copy()
    pd_result_larg=calc_filtered_data(pd_JH_data)
    pd_result_larg=calc_doubling_rate(pd_result_larg)
    pd_result_larg=calc_doubling_rate(pd_result_larg,'confirmed_filtered')

    # Blank out the filtered doubling rate wherever there are at most 100 confirmed cases.
    # np.nan replaces np.NaN: the capitalised alias was removed in NumPy 2.0.
    mask=pd_result_larg['confirmed']>100
    pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)
    pd_result_larg.to_csv('data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_larg[pd_result_larg['country']=='US'].tail())


the test slope is: [2.]
            date state country  confirmed  confirmed_filtered  confirmed_DR  \
32126 2020-05-23    no      US    1622612           1621248.8     70.394017   
32127 2020-05-24    no      US    1643246           1642002.0     76.686521   
32128 2020-05-25    no      US    1662302           1661649.8     82.777526   
32129 2020-05-26    no      US    1680913           1680729.3     88.255166   
32130 2020-05-27    no      US    1699176           1699808.8     91.164343   

       confirmed_filtered_DR  
32126              71.021115  
32127              75.662454  
32128              81.266975  
32129              85.803057  
32130              88.090846  


## 4 Visual Board

In [10]:
import pandas as pd
import numpy as np

import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
# Show the working directory that the relative data paths are resolved against.
print(os.getcwd())
# Load the final table written by the doubling-rate step
# (not referenced by the cells visible in this notebook).
df_input_large=pd.read_csv('data/processed/COVID_final_set.csv',sep=';')

/Users/rae616188919/covid-19-analysis


In [17]:
def country_compare():
    """Plot confirmed cases for a fixed set of countries and serve a Dash board.

    Steps:
      1. Read the raw Johns Hopkins time series and sum all rows per country
         into one column each.
      2. Store the result as 'data/processed/COVID_small_flat_table.csv'.
      3. Show a static plotly figure (log-scaled y axis, range slider).
      4. Start a Dash app on 127.0.0.1:8000 with a multi-select dropdown
         that redraws the graph for the chosen countries. The call to
         run_server blocks until the server is stopped.
    """
    data_path='data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)
    time_index=pd_raw.columns[4:]  # the date columns start at position 4
    df_plot=pd.DataFrame({'date':time_index})
    country_list=['Italy','US','Spain','Brazil','United Kingdom','India','China','Russia']
    for each in country_list:
        df_plot[each]=np.array(pd_raw[pd_raw['Country/Region']==each].iloc[:,4::].sum(axis=0))

    # Parse the 'm/d/yy' column headers with pandas, so this cell does not
    # depend on `datetime` having been imported by an earlier cell.
    df_plot['date']=pd.to_datetime(df_plot['date'],format="%m/%d/%y")
    df_plot.to_csv('data/processed/COVID_small_flat_table.csv',sep=';',index=False)

    fig = go.Figure()
    for each in country_list:
        fig.add_trace(go.Scatter(x=df_plot.date,
                            y=df_plot[each],
                            mode='markers+lines',
                            opacity=0.9,
                            line_width=2,
                            marker_size=4,
                            name=each
                            )
                     )
    fig.update_layout(
        width=1024,
        height=900,
        xaxis_title='Times',
        yaxis_title="Confirmed infected people (source johns hopkins csse, log-scale)",
    )
    fig.update_yaxes(type='log',range=[1.1,5.5])
    fig.update_layout(xaxis_rangeslider_visible=True)
    # The figure is shown twice on purpose of the original author: once in
    # Firefox and once with the default renderer.
    fig.show(renderer='firefox')
    fig.show()

    app = dash.Dash()
    app.layout = html.Div([

        html.Label('Multi-Select Country'),

        dcc.Dropdown(
            id='country_drop_down',
            # Built from country_list so the options cannot drift from the plotted columns.
            options=[{'label': each, 'value': each} for each in country_list],
            value=['US', 'China'], # which are pre-selected
            multi=True
        ),

        dcc.Graph(figure=fig, id='main_window_slope')
    ])

    @app.callback(
        Output('main_window_slope', 'figure'),
        [Input('country_drop_down', 'value')])
    def update_figure(selected_countries):
        """Rebuild the figure for the countries chosen in the dropdown."""
        # Raw dicts are passed to plotly.js as-is; the magic-underscore
        # shortcuts (line_width, xaxis_title, ...) are only expanded by the
        # plotly.py constructors, so nested keys are spelled out here.
        traces = [
            dict(x=df_plot.date,
                 y=df_plot[each],
                 mode='markers+lines',
                 opacity=0.9,
                 line={'width':2},
                 marker={'size':4},
                 name=each
                 )
            # an emptied dropdown may deliver None or an empty list
            for each in (selected_countries or [])
        ]

        return {
                'data': traces,
                'layout': dict (
                    width=1280,
                    height=720,
                    xaxis={'title':{'text':"Time"},
                           'tickangle':-45,
                           'nticks':20,
                           'tickfont':dict(size=14,color="#7f7f7f"),
                          },
                    yaxis={'title':{'text':"Confirmed infected people (source johns hopkins csse, log-scale)"},
                           'type':"log",
                           # a real list; the former string '[1.1,5.5]' is not a valid range
                           'range':[1.1,5.5]
                          }
            )
        }
    app.run_server(debug=True, use_reloader=False,port=8000, host='127.0.0.1')
    
if __name__ == '__main__':
    # Build the comparison table, show the figure and start the Dash server.
    country_compare()
    

Running on http://127.0.0.1:8000/
Running on http://127.0.0.1:8000/
Debugger PIN: 104-824-992
Debugger PIN: 104-824-992
 * Tip: There are .env files present. Do "pip install python-dotenv" to use them.
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


In [25]:
def quick_plot(x_in, df_input,y_scale='log',slider=False):
    """Draw every column of a data frame as one line in a plotly figure.

    Parameters:
    ----------
    x_in: array-like
        x values shared by all traces
    df_input: pd.DataFrame
        each column becomes one trace named after the column
    y_scale: str
        axis type handed to update_yaxes (default 'log')
    slider: bool
        when True a range slider is added below the x axis
    """
    grey="#7f7f7f"
    fig = go.Figure()

    for col_name in df_input.columns:
        trace=go.Scatter(x=x_in,
                         y=df_input[col_name],
                         name=col_name,
                         opacity=0.8)
        fig.add_trace(trace)

    base_font=dict(family="PT Sans, monospace",
                   size=18,
                   color=grey)
    fig.update_layout(autosize=True,
                      width=1024,
                      height=768,
                      font=base_font)
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14,color=grey))
    if slider==True:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

def doubling_rate(N_0,t,T_d):
    """Exponential growth curve with a fixed doubling time.

    Parameters:
    ----------
    N_0: number
        starting value at t = 0
    t: number or np.ndarray
        elapsed time(s)
    T_d: number
        doubling time, in the same unit as t

    Returns:
    ----------
    N_0 * 2**(t/T_d), element-wise for array input
    """
    return N_0*np.power(2,t/T_d)


# These two definitions used to be indented below the `return` above, which
# made them unreachable, so `norm_slopes` never existed at module level.
max_days=120

# Reference curves starting at 100 cases. The first key used to duplicate
# 'doubling every two days', so the one-day curve was silently overwritten.
norm_slopes={
    'doubling every day':doubling_rate(100,np.arange(max_days),1),
    'doubling every two days':doubling_rate(100,np.arange(max_days),2),
    'doubling every 4 days':doubling_rate(100,np.arange(max_days),4),
    'doubling every 10 days':doubling_rate(100,np.arange(max_days),10),
}

def pd_sync_timelines_w_slope():
    """Align the country timelines at 100 cases and compare with reference slopes.

    Reads 'data/processed/COVID_small_flat_table.csv', keeps for every
    country only the values above the threshold of 100 so that all series
    start at the same point, plots them, adds the module-level reference
    curves `norm_slopes`, writes the combined table to
    'data/processed/COVID_small_sync_timeline_table.csv' and plots the
    reference curves.
    """
    df_analyse=pd.read_csv('data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])
    threshold=100
    country_columns=df_analyse.columns[1:]  # the first column is the date
    compare_list=[np.array(df_analyse[country][df_analyse[country]>threshold])
                  for country in country_columns]
    pd_sync_timelines=pd.DataFrame(compare_list,index=country_columns).T
    # 'date' is a running day counter here, not a calendar date
    pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])
    quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
           y_scale='log',
           slider=True)

    # A separate local name: the original reused the function's own name here.
    df_slopes=pd.DataFrame(norm_slopes)
    df_with_slopes=pd.concat([df_slopes,pd_sync_timelines], axis=1)
    df_with_slopes.to_csv('data/processed/COVID_small_sync_timeline_table.csv',sep=';',index=False)
    # Pick the reference curves by name rather than by the positional slice
    # 0:3, which only matched while there were exactly three of them.
    quick_plot(df_with_slopes.date,
           df_with_slopes[list(df_slopes.columns)],
           y_scale='log',
           slider=True)

if __name__ == '__main__':
    # Plot the synchronised timelines and write the combined table with the reference slopes.
    pd_sync_timelines_w_slope()

## 5 Regression

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
def Linear():
    """Fit a quadratic polynomial to the log of the US case numbers and plot it.

    The model is a linear regression on polynomial features of degree 2 of
    the day index, trained on log(confirmed) with the first five days left
    out. The prediction is made for every day, transformed back with exp
    and plotted together with the data on a log axis.
    """
    # Relative path like every other cell in this notebook (resolved against
    # the working directory set at the top).
    df_analyse=pd.read_csv('data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])
    skip_days=5  # leading days excluded from the fit
    l_vec=len(df_analyse['US'])
    # The training abscissa starts at skip_days so that x is the row number
    # for both fit and prediction. Before, training used 0..n-6 for rows
    # 5..n-1 while the prediction used 0..n-1, which shifted the fitted
    # curve by five days against the data.
    X=np.arange(skip_days,l_vec).reshape(-1, 1)
    y=np.log(np.array(df_analyse['US'][skip_days:]))
    quadratic_featurizer = PolynomialFeatures(degree=2)
    X_train = quadratic_featurizer.fit_transform(X)
    model2 = LinearRegression()
    model2.fit(X_train, y)
    X_hat=quadratic_featurizer.transform(np.arange(l_vec).reshape(-1, 1))
    Y_hat=model2.predict(X_hat)
    LR_inspect=df_analyse[['date','US']].copy()
    LR_inspect['prediction']=np.exp(Y_hat)
    quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

if __name__ == '__main__':
    # Fit and plot the polynomial regression for the US series.
    Linear()