In [73]:
import pandas as  pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt


# Notebook-wide display defaults: allow up to 500 DataFrame rows to be shown
# and make matplotlib figures 16x9 by default.
pd.set_option("display.max_rows", 500)
mpl.rcParams["figure.figsize"] = (16, 9)

import plotly.graph_objects as go

# Data Load

In [74]:
# Absolute path to the local copy of the CSSE "confirmed global" time-series CSV
# (as named in the path itself).
# NOTE(review): user-specific Windows path; adjust it to where the data lives on your machine.
RAW_DATA_PATH = r"C:\Users\LATITUDE\ads_covid-19\data\raw\COVID-19\csse_covid_19_data\csse_covid_19_time_series\time_series_covid19_confirmed_global.csv"
pd_raw = pd.read_csv(RAW_DATA_PATH)

# Column labels from position 4 onwards are used as the time axis and become
# the 'date' column of the analysis table (kept as the raw header strings).
time_idx = pd_raw.columns[4:]
df_analyse = pd.DataFrame({'date': time_idx})

In [75]:
# Countries to extract; each one becomes a column of df_analyse.
country_list = ['Italy', 'US', 'Spain', 'Germany', 'Brazil']

for country in country_list:
    # Select every raw row for this country and add them up per date column
    # (columns from position 4 onwards), giving one value per date.
    country_rows = pd_raw[pd_raw['Country/Region'] == country]
    totals_per_date = country_rows.iloc[:, 4:].sum(axis=0)
    df_analyse[country] = np.array(totals_per_date)

In [76]:
# Preview the first five rows of the assembled table.
df_analyse.head(n=5)

Unnamed: 0,date,Italy,US,Spain,Germany,Brazil
0,1/22/20,0,1,0,0,0
1,1/23/20,0,1,0,0,0
2,1/24/20,0,2,0,0,0
3,1/25/20,0,2,0,0,0
4,1/26/20,0,5,0,0,0


# Helper function

In [77]:
def quick_plot(x_in, df_input, y_scale='log', slider=False):
    """Draw every column of a DataFrame as a line trace in a single Plotly figure.

    Parameters
    ----------
    x_in : array-like
        x-axis values shared by all traces.
    df_input : pandas.DataFrame
        One trace is added per column; the column label is used as the trace name.
    y_scale : str, default 'log'
        Passed to ``fig.update_yaxes(type=...)``, e.g. 'log' or 'linear'.
    slider : bool, default False
        When truthy, a range slider is shown below the x axis.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = go.Figure()

    for column_name in df_input.columns:
        fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[column_name],
                        name=column_name,
                        ))

    # Styling and rendering are done once, after all traces have been added;
    # doing this inside the loop would display one figure per column.
    fig.update_layout(autosize=True,
        width=900,
        height=768,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color='#7f7f7f'))
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45,
                    nticks=20,
                    tickfont=dict(size=14, color='#7f7f7f'))
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()


In [78]:
# Linear-scale plot of every column after the first ('date') against the date column.
quick_plot(
    df_analyse.date,
    df_analyse.iloc[:, 1:],
    y_scale='linear',
    slider=True,
)

In [79]:
# Only values strictly above this count are kept when the timelines are synchronized below.
threshold = 100

In [80]:
# For every column after 'date', keep only the values above `threshold`,
# so each series starts at the point where it first exceeds it.
compare_list = [
    np.array(df_analyse[country][df_analyse[country] > threshold])
    for country in df_analyse.columns[1:]
]

In [81]:
# Build one row per series, then transpose so each series becomes a column.
# NOTE(review): the series have different lengths; presumably pandas pads the
# shorter ones with NaN - confirm.
series_as_rows = pd.DataFrame(compare_list, index=df_analyse.columns[1:])
pd_sync_timelines = series_as_rows.T

In [82]:
# Synthetic x axis: 0, 1, 2, ... counted in rows since the threshold was exceeded.
pd_sync_timelines['date'] = np.arange(len(pd_sync_timelines))

In [83]:
# Log-scale plot of the synchronized timelines; the last column ('date') is the x axis
# and is therefore excluded from the traces.
quick_plot(
    pd_sync_timelines.date,
    pd_sync_timelines.iloc[:, :-1],
    y_scale='log',
    slider=True,
)

$N(t)=N_0 \cdot 2^{t/T}$

In [84]:
def doubling_rate(N_0,t,T_d):
    """Evaluate exponential growth N_0 * 2**(t / T_d).

    Parameters
    ----------
    N_0 : number
        Value at t = 0.
    t : number or numpy array
        Time(s) at which to evaluate the curve.
    T_d : number
        Time needed for the value to double, in the same unit as ``t``.

    Returns
    -------
    number or numpy array
        ``N_0`` multiplied by two raised to the number of elapsed doublings.
    """
    doublings_elapsed = t / T_d
    growth_factor = np.power(2, doublings_elapsed)
    return N_0 * growth_factor

In [85]:
# Reference growth curves starting at 100 for several fixed doubling times.
max_days = 180
day_axis = np.arange(max_days)

norm_slopes = {
    'doubling every day': doubling_rate(100, day_axis, 1),
    'doubling every 2 days': doubling_rate(100, day_axis, 2),
    'doubling every 4 days': doubling_rate(100, day_axis, 4),
    'doubling every 10 days': doubling_rate(100, day_axis, 10),
}

In [86]:
# Put the reference curves side by side with the synchronized timelines (column-wise concat).
reference_curves = pd.DataFrame(norm_slopes)
pd_sync_timelines_w_slope = pd.concat([reference_curves, pd_sync_timelines], axis=1)

In [87]:
# Compare all synchronized timelines with the reference doubling curves.
# Every column except the trailing 'date' column is plotted; a fixed 0:5 slice
# would only pick up the four reference curves plus the first country.
quick_plot(pd_sync_timelines_w_slope.date,
          pd_sync_timelines_w_slope.iloc[:,:-1],
          y_scale='log',
          slider=True)

# Machine Learning

* Understanding Linear Regression

In [88]:
from sklearn import linear_model

# Ordinary least-squares estimator with an intercept term, used by the cells below.
reg = linear_model.LinearRegression(fit_intercept=True)

In [89]:
l_vec=len(df_analyse['Germany'])

# The first rows are skipped: in the preview above Germany is 0 there, and log(0) is -inf.
fit_start=5

# Use the row index itself as regressor so that a later prediction on
# np.arange(l_vec) lines up row-by-row with df_analyse (starting x at 0 would
# shift the fitted curve by `fit_start` rows relative to the data).
x=np.arange(fit_start,l_vec).reshape(-1,1)
# Fit in log space: exponential growth becomes a straight line.
y=np.log((np.array(df_analyse['Germany'][fit_start:])))

In [90]:
# Fit the straight line to the log-transformed counts; the fitted estimator is the cell output.
reg.fit(X=x, y=y)

LinearRegression()

In [91]:
# Evaluate the fitted line for inputs 0 .. l_vec-1 (as a column vector), one value per table row.
x_hat = np.arange(l_vec)[:, np.newaxis]
y_hat = reg.predict(x_hat)

In [92]:
# Independent copy holding only the date and the Germany series for inspection.
LR_inspect = df_analyse.loc[:, ['date', 'Germany']].copy()

In [93]:
# The model was fitted on log values, so exponentiate to get back to the original scale.
predicted_counts = np.exp(y_hat)
LR_inspect['prediction'] = predicted_counts

In [94]:
# Plot the Germany series together with the fitted prediction on a log axis.
quick_plot(
    LR_inspect.date,
    LR_inspect.iloc[:, 1:],
    y_scale='log',
    slider=True,
)

# Doubling rate - Piecewise Linear Regression

In [95]:
from sklearn import linear_model

# Rebind `reg` to a fresh least-squares estimator (with intercept) for the rolling-window fits.
reg = linear_model.LinearRegression(fit_intercept=True)

In [96]:
from scipy import signal

In [97]:
# Smooth every country series with a Savitzky-Golay filter
# (window of 3 samples, polynomial order 1) and store it as '<country>_filter'.
for country in country_list:
    smoothed = signal.savgol_filter(df_analyse[country], window_length=3, polyorder=1)
    df_analyse[country + '_filter'] = smoothed

In [98]:
# Filtered columns used in the plots and rolling fits below ('Italy_filter' is not part of this list).
filter_cols = [
    'US_filter',
    'Spain_filter',
    'Germany_filter',
    'Brazil_filter',
]


In [99]:
# Plot the smoothed series from row `start_pos` onwards on a log axis.
start_pos = 5
quick_plot(
    df_analyse.date[start_pos:],
    df_analyse[filter_cols].iloc[start_pos:, :],
    y_scale='log',
    slider=True,
)

In [100]:
def get_doubling_time_via_regression(in_array):
    """Fit a straight line through three consecutive values and return intercept / slope.

    The three samples are placed at x = -1, 0, 1, so the intercept is the fitted
    value at the centre of the window and the slope is the fitted change per step.
    The fit uses the module-level scikit-learn estimator ``reg``.

    Parameters
    ----------
    in_array : array-like
        Exactly three numeric values (one rolling window).

    Returns
    -------
    float
        Fitted intercept divided by fitted slope, as a scalar so it can be used
        directly as the result of ``Series.rolling(...).apply``.

    Raises
    ------
    AssertionError
        If ``in_array`` does not contain exactly three values.
    """
    y=np.array(in_array)
    x=np.arange(-1,2).reshape(-1,1)

    # x is hard-wired to three points, so any other window length is a caller error.
    assert len(y)==3, 'get_doubling_time_via_regression expects exactly 3 values'
    reg.fit(x,y)
    intercept=reg.intercept_
    # coef_ is a length-1 array for a single feature; take the scalar so the
    # function returns a plain number instead of a shape-(1,) array.
    slope=reg.coef_[0]
    # NOTE(review): a flat window gives slope 0; presumably numpy then yields inf/nan
    # with a RuntimeWarning rather than raising - confirm.
    return intercept/slope


In [101]:
def doubling_time(in_array):
    """Doubling time implied by the first and last value of a window.

    Assumes exponential growth N(t) = N_0 * 2**(t / T) between the first and the
    last sample. ``n`` samples span ``n - 1`` time steps, hence
    T = (n - 1) * log(2) / log(last / first).

    Parameters
    ----------
    in_array : array-like
        Consecutive values, one per time step.

    Returns
    -------
    float
        Doubling time in units of one time step.
    """
    y=np.array(in_array)
    # Elapsed time between first and last sample is len(y)-1 steps, not len(y).
    elapsed_steps=len(y)-1
    return elapsed_steps*np.log(2)/np.log(y[-1]/y[0])

In [102]:
# calculate slope of regression of last x days
days_back=3 #this gives smoothing effect
for pos, country in enumerate(country_list):
    df_analyse[country+'_DR']=df_analyse[country].rolling(window=days_back,
                                 min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [103]:
#run on all filtered data
days_back=3 #this gives smoothing effect
for pos, country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(window=days_back,
                                 min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [104]:
#cross check the math
df_analyse['Germany_DR_math']=df_analyse['Germany'].rolling(window=days_back,
                             min_periods=days_back).apply(doubling_time, raw=False)

In [105]:
#run on all filtered data
days_back=3 #this gives smoothing effect
for pos, country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country].rolling(window=days_back,
                                 min_periods=days_back).apply(get_doubling_time_via_regression, raw=False)

In [106]:
# List all column labels; the positions shown here are what the positional
# selections in the following plot cells refer to.
df_analyse.columns

Index(['date', 'Italy', 'US', 'Spain', 'Germany', 'Brazil', 'Italy_filter',
       'US_filter', 'Spain_filter', 'Germany_filter', 'Brazil_filter',
       'Italy_DR', 'US_DR', 'Spain_DR', 'Germany_DR', 'Brazil_DR',
       'US_filter_DR', 'Spain_filter_DR', 'Germany_filter_DR',
       'Brazil_filter_DR', 'Germany_DR_math'],
      dtype='object')

In [107]:
start_pos=40
# Select the columns by name instead of by position (these are the labels at
# positions 11-14 in the column listing above), so the plot does not silently
# change when columns are added or reordered.
# NOTE(review): 'Brazil_DR' is not part of this selection - confirm that is intended.
dr_cols=['Italy_DR','US_DR','Spain_DR','Germany_DR']
quick_plot(df_analyse.date[start_pos:], df_analyse[dr_cols].iloc[start_pos:], y_scale='linear',slider=True)

In [108]:
start_pos=40
# Select the columns by name instead of by position (these are the labels at
# positions 16-19 in the column listing above), so the plot does not silently
# change when columns are added or reordered.
filter_dr_cols=['US_filter_DR','Spain_filter_DR','Germany_filter_DR','Brazil_filter_DR']
quick_plot(df_analyse.date[start_pos:], df_analyse[filter_dr_cols].iloc[start_pos:], y_scale='linear',slider=True)