### Importing Libraries

In [48]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline
mpl.rcParams['figure.figsize'] = (16,10)
pd.set_option('display.max_rows', 500)

import plotly.graph_objects as go

### Data Load

In [49]:
# parse_dates=[0] asks pandas to parse the first CSV column (the dates) as datetimes
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_flat_table.csv',
    sep=';',
    parse_dates=[0],
)

# Show the last rows after ordering the table chronologically
df_analyse.sort_values('date', ascending=True).tail()

Unnamed: 0,date,Italy,US,Spain,Germany,India
881,2022-06-21,17959329,86452232,12613634,27454225,43331645
882,2022-06-22,18014202,86636306,12613634,27573585,43344958
883,2022-06-23,18071634,86757621,12613634,27681775,43362294
884,2022-06-24,18128044,86909716,12681820,27771111,43378234
885,2022-06-25,18184917,86949088,12681820,27771112,43389973


In [50]:
# Every column after the first one ('date') holds the case series of one country
country_list = df_analyse.columns[1:]

### Helper function

In [51]:
def quick_plot(x_in, df_input, y_scale= 'log', slider = False):
    '''Draw every column of a DataFrame as one line in an interactive Plotly figure.

    Parameters
    ----------
    x_in : array-like
        Values for the shared x axis, one per row of ``df_input``.
    df_input : pandas.DataFrame
        Each column becomes one trace, named after the column label.
    y_scale : str, default 'log'
        Plotly y-axis type, e.g. 'log' or 'linear'.
    slider : bool, default False
        When truthy, a range slider is shown below the x axis.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    '''
    fig = go.Figure()

    # One scatter trace per column, all sharing the same x values
    for column in df_input.columns:
        fig.add_trace(go.Scatter(
            x=x_in,
            y=df_input[column],
            name=column,
            opacity=0.8))

    fig.update_layout(
        autosize=True,
        width=1007,
        height=770,
        font=dict(
            family="PT Sans, monospace",
            size=16,
            color="#7f7f7f",
        ),
    )

    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(
        tickangle=-45,
        nticks=20,
        tickfont=dict(size=14, color="#7f7f7f"),
    )

    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

In [52]:
# All country columns against the calendar date, on a linear y axis
quick_plot(
    x_in=df_analyse['date'],
    df_input=df_analyse.iloc[:, 1:],
    y_scale='linear',
    slider=True,
)

In [53]:
# Case count a value must exceed to be kept when the country timelines are aligned
threshold = 100

In [54]:
# For every country keep only the values greater than `threshold`,
# stored as one NumPy array per country (arrays may differ in length)
compare_list = [
    np.array(df_analyse[country][df_analyse[country] > threshold])
    for country in df_analyse.columns[1:]
]

In [55]:
compare_list

[array([     155,      229,      322,      453,      655,      888,
            1128,     1694,     2036,     2502,     3089,     3858,
            4636,     5883,     7375,     9172,    10149,    12462,
           15113,    17660,    21157,    24747,    27980,    31506,
           35713,    41035,    47021,    53578,    59138,    63927,
           69176,    74386,    80589,    86498,    92472,    97689,
          101739,   105792,   110574,   115242,   119827,   124632,
          128948,   132547,   135586,   139422,   143626,   147577,
          152271,   156363,   159516,   162488,   165155,   168941,
          172434,   175925,   178972,   181228,   183957,   187327,
          189973,   192994,   195351,   197675,   199414,   201505,
          203591,   205463,   207428,   209328,   210717,   211938,
          213013,   214457,   215858,   217185,   218268,   219070,
          219814,   221216,   222104,   223096,   223885,   224760,
          225435,   225886,   226699,   227364, 

In [56]:
# Build one row per country (shorter series are padded with NaN),
# then transpose so that each country becomes a column
sync_by_country = pd.DataFrame(compare_list, index=df_analyse.columns[1:])
pd_sync_timelines = sync_by_country.T

In [57]:
# Running counter 0..n-1 (n = number of rows) used as the common x axis
pd_sync_timelines['date'] = np.arange(len(pd_sync_timelines))

In [58]:
# The 'date' column added above is a running row counter (0, 1, 2, ...), not a calendar date
pd_sync_timelines.tail()

Unnamed: 0,Italy,US,Spain,Germany,India,date
849,17959329.0,,,,,849
850,18014202.0,,,,,850
851,18071634.0,,,,,851
852,18128044.0,,,,,852
853,18184917.0,,,,,853


In [59]:
# Aligned timelines: every column except the trailing 'date' counter
quick_plot(
    x_in=pd_sync_timelines['date'],
    df_input=pd_sync_timelines.iloc[:, :-1],
    y_scale='linear',
    slider=True,
)

### Doubling Rate

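$N(t) = N_0 \cdot 2^{t/T_d}$, where $N_0$ is the initial case count, $t$ is the elapsed time and $T_d$ is the doubling time (in the same unit as $t$).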

In [60]:
def doubling_rate(N_0, t, T_d):
    '''Evaluate the exponential growth curve N_0 * 2**(t / T_d).

    Parameters
    ----------
    N_0 : number
        Value at t = 0.
    t : number or array-like
        Elapsed time; an array yields one value per entry.
    T_d : number
        Doubling time, in the same unit as ``t``.

    Returns
    -------
    number or numpy.ndarray
        The curve value(s) at ``t``.
    '''
    doublings_elapsed = t / T_d
    growth_factor = np.power(2, doublings_elapsed)
    return N_0 * growth_factor

In [72]:
max_days = 800

# Reference curves that start at 100 and double every 25, 35 and 45 days.
# (A 1-day doubling curve is deliberately left out; add 1 to the tuple to include it.)
norm_slopes = {
    f'doubling every {period} days': doubling_rate(100, np.arange(max_days), period)
    for period in (25, 35, 45)
}


In [73]:
# Put the reference curves side by side with the aligned country timelines
slope_frame = pd.DataFrame(norm_slopes)
pd_sync_timelines_w_slope = pd.concat([slope_frame, pd_sync_timelines], axis='columns')
pd_sync_timelines_w_slope

Unnamed: 0,doubling every 25 days,doubling every 35 days,doubling every 45 days,Italy,US,Spain,Germany,India,date
0,100.000000,100.000000,100.000000,155.0,107.0,120.0,117.0,102.0,0
1,102.811383,102.000161,101.552251,229.0,184.0,165.0,150.0,113.0,1
2,105.701804,104.040328,103.128597,322.0,237.0,222.0,188.0,119.0,2
3,108.673486,106.121302,104.729412,453.0,403.0,259.0,240.0,142.0,3
4,111.728714,108.243899,106.355076,655.0,519.0,400.0,349.0,156.0,4
...,...,...,...,...,...,...,...,...,...
849,,,,17959329.0,,,,,849
850,,,,18014202.0,,,,,850
851,,,,18071634.0,,,,,851
852,,,,18128044.0,,,,,852


In [74]:
# Countries and reference curves together on a logarithmic y axis
quick_plot(
    x_in=pd_sync_timelines_w_slope['date'],
    df_input=pd_sync_timelines_w_slope.iloc[:, :-1],
    y_scale='log',
    slider=True,
)

In [75]:
# Write the combined table to disk; the row index is not stored
pd_sync_timelines_w_slope.to_csv(
    '../data/processed/COVID_small_sync_timeline_table.csv',
    sep=';',
    index=False,
)

### Understanding linear regression

In [65]:
from sklearn import linear_model
# fit_intercept=False: no intercept term is estimated, so the fitted line is
# forced through the origin (intercept fixed at 0)
reg = linear_model.LinearRegression(fit_intercept= False)

In [19]:
# Number of rows in the Germany series
l_vec = len(df_analyse['Germany'])

# The first five rows are skipped (the displayed table shows zero cases there,
# and log(0) is -inf); the remaining counts are log-transformed
germany_cases = np.array(df_analyse['Germany'][5:])
y = np.log(germany_cases)

# Matching regressor: 0..l_vec-6 as a single-column matrix
X = np.arange(l_vec - 5).reshape(-1, 1)

In [20]:
# Fit the log-transformed case counts against the day counter
reg.fit(X,y)

LinearRegression(fit_intercept=False)

In [21]:
# Predict for every row of the table (0..l_vec-1), again as a column vector
X_hat = np.arange(l_vec)[:, np.newaxis]
Y_hat = reg.predict(X_hat)

In [22]:
# Independent copy of the two columns needed to inspect the fit
LR_inspect = df_analyse.loc[:, ['date', 'Germany']].copy()

In [23]:
# The model was fitted on log(cases); exp() maps the predictions back to case counts
LR_inspect['prediction'] = np.exp(Y_hat)
LR_inspect

Unnamed: 0,date,Germany,prediction
0,2020-01-22,0,1.000000e+00
1,2020-01-23,0,1.026476e+00
2,2020-01-24,0,1.053652e+00
3,2020-01-25,0,1.081548e+00
4,2020-01-26,0,1.110182e+00
...,...,...,...
881,2022-06-21,27454225,9.956519e+09
882,2022-06-22,27573585,1.022012e+10
883,2022-06-23,27681775,1.049071e+10
884,2022-06-24,27771111,1.076845e+10


In [24]:
# Observed Germany counts next to the regression prediction, log y axis
quick_plot(
    x_in=LR_inspect['date'],
    df_input=LR_inspect.iloc[:, 1:],
    y_scale='log',
    slider=True,
)

### Doubling Rate - Piecewise Linear Regression

To predict future cases using doubling rate

In [25]:
from sklearn import linear_model
# Re-create the shared regressor; fit_intercept=False keeps its intercept fixed at 0
reg = linear_model.LinearRegression(fit_intercept= False)

In [26]:
from scipy import signal

In [27]:
# Reload the flat table so this section starts from the unmodified data;
# the first column is parsed as dates
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_flat_table.csv',
    sep=';',
    parse_dates=[0],
)

# Country columns are everything after the leading 'date' column
country_list = df_analyse.columns[1:]

In [28]:
# Smooth each country's series with a Savitzky-Golay filter
# (window of 5 samples, polynomial of order 1) and store it as '<country>_filter'
for country in country_list:
    df_analyse[country + '_filter'] = signal.savgol_filter(
        df_analyse[country],
        window_length=5,
        polyorder=1,
    )

In [29]:
# Derive the smoothed-column names from country_list so they always match the
# '<country>_filter' columns written by the filtering loop, instead of
# repeating the country names by hand
filter_cols = [country + '_filter' for country in country_list]


In [30]:
# Skip the first rows before plotting the smoothed series on a log axis
start_pos = 5
quick_plot(
    x_in=df_analyse['date'][start_pos:],
    df_input=df_analyse[filter_cols].iloc[start_pos:, :],
    y_scale='log',
    slider=True,
)

In [31]:
df_analyse

Unnamed: 0,date,Italy,US,Spain,Germany,India,Italy_filter,US_filter,Spain_filter,Germany_filter,India_filter
0,2020-01-22,0,1,0,0,0,0.0,0.4,0.0,0.0,0.0
1,2020-01-23,0,1,0,0,0,0.0,1.3,0.0,0.0,0.0
2,2020-01-24,0,2,0,0,0,0.0,2.2,0.0,0.0,0.0
3,2020-01-25,0,2,0,0,0,0.0,3.0,0.0,0.2,0.0
4,2020-01-26,0,5,0,0,0,0.0,3.8,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
881,2022-06-21,17959329,86452232,12613634,27454225,43331645,17964078.0,86477868.2,12593540.0,27451294.8,43333553.2
882,2022-06-22,18014202,86636306,12613634,27573585,43344958,18013854.8,86610591.2,12617224.2,27563137.8,43347305.4
883,2022-06-23,18071634,86757621,12613634,27681775,43362294,18071625.2,86740992.6,12640908.4,27650361.6,43361420.8
884,2022-06-24,18128044,86909716,12681820,27771111,43378234,18128127.0,86867704.8,12661364.2,27733491.6,43376414.0


### Doubling rate

In [32]:
def get_doubling_time_via_regression(in_array):
    '''Approximate the doubling time of three consecutive values by linear regression.

    A straight line is fitted through the three points at x = -1, 0, 1. The
    intercept is the fitted value at the window centre and the slope is the
    fitted increase per step, so intercept / slope is the number of steps needed
    to add the current value once more at the current rate.

    The fit is done locally and always estimates an intercept. It does not
    depend on the module-level ``reg`` object, whose ``fit_intercept=False``
    setting would pin the intercept, and therefore the result, to 0.

    Parameters
    ----------
    in_array : array-like of length 3
        Consecutive observations (list, NumPy array or pandas Series).

    Returns
    -------
    float
        intercept / slope; ``inf`` or ``nan`` when the fitted slope is zero.

    Raises
    ------
    AssertionError
        If ``in_array`` does not contain exactly three values.
    '''
    assert len(in_array) == 3 # to restrict the input data points to 3

    y = np.asarray(in_array, dtype=float)
    x = np.arange(-1, 2)

    # Degree-1 least-squares fit; polyfit returns the highest power first
    slope, intercept = np.polyfit(x, y, 1)

    # A flat window has slope 0: let the division yield inf/nan without warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.divide(intercept, slope))

In [33]:
def doubling_time(in_array):
    '''Classical doubling time T_d = (t_end - t_start) * ln(2) / ln(N_end / N_start).

    The samples are assumed to be equally spaced by one time step, so n samples
    span n - 1 steps between the first and the last value.

    Parameters
    ----------
    in_array : array-like
        Consecutive observations; only the first and last value are used.

    Returns
    -------
    float
        Doubling time in time steps; ``inf`` or ``nan`` when the first and last
        value are equal or the first value is zero.
    '''
    y = np.asarray(in_array, dtype=float)

    # n samples cover n - 1 intervals; len(y) would overestimate the elapsed time
    elapsed_steps = len(y) - 1

    # Flat or zero-valued windows divide by zero; return inf/nan without warnings
    with np.errstate(divide='ignore', invalid='ignore'):
        growth_ratio = y[-1] / y[0]
        return float(elapsed_steps * np.log(2) / np.log(growth_ratio))

In [34]:
# Doubling rate of the raw series, estimated from a regression over the last few days.
# The window is kept short (three points) so the local slope is approximated
# from a small triangle; the window length also smooths the result.
days_back = 3

for country in country_list:
    # Slide a full window of `days_back` values over the series ...
    windowed = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    # ... and evaluate the regression-based doubling time on each window
    df_analyse[country + '_DR'] = windowed.apply(get_doubling_time_via_regression, raw=False)

In [35]:
# Same rolling estimate, now applied to the smoothed ('_filter') columns
days_back = 3

for filtered_col in filter_cols:
    windowed = df_analyse[filtered_col].rolling(window=days_back, min_periods=days_back)
    df_analyse[filtered_col + '_DR'] = windowed.apply(get_doubling_time_via_regression, raw=False)

In [36]:
# Cross-check for Germany: closed-form doubling time on the same rolling window
germany_windows = df_analyse['Germany'].rolling(window=days_back, min_periods=days_back)
df_analyse['Germany_DR_math'] = germany_windows.apply(doubling_time, raw=False)

In [37]:
# Rolling regression-based doubling time for every smoothed column
# (writes the '<country>_filter_DR' columns)
days_back = 3

for filtered_col in filter_cols:
    windowed = df_analyse[filtered_col].rolling(window=days_back, min_periods=days_back)
    df_analyse[filtered_col + '_DR'] = windowed.apply(get_doubling_time_via_regression, raw=False)

In [38]:
df_analyse.head().T

Unnamed: 0,0,1,2,3,4
date,2020-01-22 00:00:00,2020-01-23 00:00:00,2020-01-24 00:00:00,2020-01-25 00:00:00,2020-01-26 00:00:00
Italy,0,0,0,0,0
US,1,1,2,2,5
Spain,0,0,0,0,0
Germany,0,0,0,0,0
India,0,0,0,0,0
Italy_filter,0.0,0.0,0.0,0.0,0.0
US_filter,0.4,1.3,2.2,3.0,3.8
Spain_filter,0.0,0.0,0.0,0.0,0.0
Germany_filter,0.0,0.0,0.0,0.2,1.0


In [39]:
# Rows before start_pos are left out of the plot
start_pos = 40

# Select the doubling-rate columns by name. The former positional selection
# [10..14] started one column too early: it picked up 'India_filter' (a case
# count, not a doubling rate) and left out 'India_DR'.
dr_cols = [country + '_DR' for country in country_list]

quick_plot(df_analyse.date[start_pos:],
          df_analyse[dr_cols].iloc[start_pos:, :],
          y_scale = 'linear',
          slider = True)

In [40]:
# Rows before start_pos are left out of the plot
start_pos = 40

# Select the doubling-rate columns of the smoothed series by name. The former
# positional selection [16..19] covered only four of the five countries and
# dropped 'India_filter_DR'.
filter_dr_cols = [filtered_col + '_DR' for filtered_col in filter_cols]

quick_plot(df_analyse.date[start_pos:],
          df_analyse[filter_dr_cols].iloc[start_pos:, :],
          y_scale = 'linear',
          slider = True)