In [23]:
import pandas as pd
import numpy as np

from datetime import datetime

%matplotlib inline
# Show up to 500 rows when a frame is rendered in the notebook.
pd.options.display.max_rows = 500

In [24]:
# Read the relational data set (first column parsed as dates), order it
# chronologically and renumber the rows from 0.
pd_JH_data=(
    pd.read_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
      .sort_values('date',ascending=True)
      .reset_index(drop=True)
      .copy()
)
pd_JH_data.head()

Unnamed: 0,date,state,country,confirmed
0,2020-01-22,Alberta,Canada,0.0
1,2020-01-22,no,Kosovo,0.0
2,2020-01-22,no,Kuwait,0.0
3,2020-01-22,no,Kyrgyzstan,0.0
4,2020-01-22,no,Laos,0.0


In [25]:
# Small sample for trying things out: two countries, rows dated after 2020-03-20.
test_data=pd_JH_data.loc[
    pd_JH_data['country'].isin(['Italy','Kazakhstan'])
    & (pd_JH_data['date']>'2020-03-20')
]

In [26]:
# Peek at the first five rows of the sample.
test_data.head(n=5)

Unnamed: 0,date,state,country,confirmed
16868,2020-03-21,no,Kazakhstan,53.0
16872,2020-03-21,no,Italy,53578.0
17135,2020-03-22,no,Kazakhstan,61.0
17157,2020-03-22,no,Italy,59138.0
17438,2020-03-23,no,Kazakhstan,62.0


In [27]:
# Per-country maximum of every remaining column.
# The string alias 'max' is used instead of np.max: passing NumPy callables
# to groupby.agg is deprecated in recent pandas and emits a FutureWarning.
test_data.groupby(['country']).agg('max')

Unnamed: 0_level_0,date,state,confirmed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,2022-06-17,no,17809934.0
Kazakhstan,2022-06-17,no,1395123.0


In [28]:
import numpy as np
from sklearn import linear_model
# Shared ordinary-least-squares estimator; it is refit on every call of
# get_doubling_time_via_regression.
reg = linear_model.LinearRegression(fit_intercept=True)

def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        A straight line is fitted through the three values of in_array at
        x = -1, 0, 1. The intercept is the fitted value at the window centre
        and the slope is the fitted change per step; the function returns
        intercept/slope.

        Parameters:
        ----------
        in_array : array-like of length 3
            values of one rolling window

        Returns:
        ----------
        float
            intercept/slope as a plain scalar; inf (or -inf) when the slope
            is 0 and the intercept is not, NaN when both are 0

        Raises:
        ----------
        AssertionError
            if in_array does not contain exactly three values
    '''
    # validate first, so a wrong window size fails before anything is fitted
    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    # coef_ is a length-1 array for a single feature; take the scalar so the
    # function returns a float instead of a one-element ndarray
    slope=reg.coef_[0]

    # a flat window has slope 0: keep the inf/NaN result, but do not emit
    # a RuntimeWarning for every such window
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(intercept/slope)

In [29]:
# Maximum of every remaining column per (state, country) group.
# The string alias 'max' is used instead of np.max: passing NumPy callables
# to groupby.agg is deprecated in recent pandas and emits a FutureWarning.
test_data.groupby(['state','country']).agg('max')

Unnamed: 0_level_0,Unnamed: 1_level_0,date,confirmed
state,country,Unnamed: 2_level_1,Unnamed: 3_level_1
no,Italy,2022-06-17,17809934.0
no,Kazakhstan,2022-06-17,1395123.0


In [30]:
def rolling_reg(df_input,col='confirmed'):
    ''' Apply get_doubling_time_via_regression over a rolling window of one column.

        input has to be a data frame
        return is single series (mandatory for group by apply)

        Parameters:
        ----------
        df_input : pandas.DataFrame
        col : str
            name of the column the rolling window runs over

        Returns:
        ----------
        pandas.Series
            one value per row; NaN until a full window is available
    '''
    window_len=3  # the regression helper asserts exactly three values per window
    windows=df_input[col].rolling(window=window_len,min_periods=window_len)
    return windows.apply(get_doubling_time_via_regression,raw=False)

In [31]:
# Try the rolling regression on the small sample, one (state, country) group at a time.
(
    test_data[['state','country','confirmed']]
    .groupby(['state','country'])
    .apply(rolling_reg,'confirmed')
)

state  country           
no     Italy       16872               NaN
                   17157               NaN
                   17442         11.379070
                   17727         12.767550
                   18012         13.225547
                                 ...      
       Kazakhstan  248858    126822.878788
                   249125    214624.820513
                   249429    116255.944444
                   249714     82063.823529
                   249979     73426.491228
Name: confirmed, Length: 1638, dtype: float64

In [32]:
# Rolling regression on the full data set, per (state, country) group;
# reset_index turns the resulting index levels into ordinary columns.
pd_DR_result=(
    pd_JH_data[['state','country','confirmed']]
    .groupby(['state','country'])
    .apply(rolling_reg,'confirmed')
    .reset_index()
)

In [33]:
# Give the regression result and the former row-index level their final column names.
pd_DR_result=pd_DR_result.rename(
    {'confirmed':'confirmed_DR', 'level_2':'index'},
    axis='columns',
)
pd_DR_result.head()

Unnamed: 0,state,country,index,confirmed_DR
0,Alberta,Canada,0,
1,Alberta,Canada,499,
2,Alberta,Canada,762,
3,Alberta,Canada,1069,
4,Alberta,Canada,1333,


In [34]:
# Expose the row position as an 'index' column so results can be merged back on it.
# Guarded: an unconditional reset_index() would add a 'level_0' column when
# the cell is executed a second time and fail on the third run.
if 'index' not in pd_JH_data.columns:
    pd_JH_data=pd_JH_data.reset_index()
pd_JH_data.head()

Unnamed: 0,index,date,state,country,confirmed
0,0,2020-01-22,Alberta,Canada,0.0
1,1,2020-01-22,no,Kosovo,0.0
2,2,2020-01-22,no,Kuwait,0.0
3,3,2020-01-22,no,Kyrgyzstan,0.0
4,4,2020-01-22,no,Laos,0.0


In [35]:
# Attach confirmed_DR to the base table; the left join keeps every base row.
pd_result_larg=pd_JH_data.merge(
    pd_DR_result[['index','confirmed_DR']],
    how='left',
    on=['index'],
)
pd_result_larg.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR
0,0,2020-01-22,Alberta,Canada,0.0,
1,1,2020-01-22,no,Kosovo,0.0,
2,2,2020-01-22,no,Kuwait,0.0,
3,3,2020-01-22,no,Kyrgyzstan,0.0,
4,4,2020-01-22,no,Laos,0.0,


In [36]:
from scipy import signal

def savgol_filter(df_input,column='confirmed',window=5,degree=1):
    ''' Savgol Filter which can be used in groupby apply function
        it ensures that the data structure is kept

        Parameters:
        ----------
        df_input : pandas.DataFrame
        column : str
            name of the column to be filtered
        window : int
            window size used for filtering (passed to scipy as window_length)
        degree : int
            order of the fitted polynomial (passed to scipy as polyorder)

        Returns:
        ----------
        pandas.DataFrame
            copy of df_input with the additional column column+'_filtered';
            df_input itself is left untouched
    '''
    # work on a copy so the caller's frame (or groupby slice) is not mutated
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree) # polynomial order
    df_result[column+'_filtered']=result
    return df_result
        

In [37]:
# Filter every (state, country) group separately.
# group_keys=False keeps the original row index on the result, so that
# reset_index() yields the 'index' column used as merge key. Without it,
# recent pandas prepends the group keys to the index of the applied result.
pd_filtered_result=(
    pd_JH_data[['state','country','confirmed']]
    .groupby(['state','country'],group_keys=False)
    .apply(savgol_filter)
    .reset_index()
)

In [38]:
# Attach the filtered values to the result table.
# A 'confirmed_filtered' column left over from an earlier run of this cell is
# dropped first; otherwise re-running would create '_x'/'_y' duplicate columns.
pd_result_larg=pd.merge(pd_result_larg.drop(columns=['confirmed_filtered'],errors='ignore'),
                        pd_filtered_result[['index','confirmed_filtered']],
                        on=['index'],how='left')
pd_result_larg.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered
0,0,2020-01-22,Alberta,Canada,0.0,,0.0
1,1,2020-01-22,no,Kosovo,0.0,,0.0
2,2,2020-01-22,no,Kuwait,0.0,,0.0
3,3,2020-01-22,no,Kyrgyzstan,0.0,,0.0
4,4,2020-01-22,no,Laos,0.0,,0.0


In [39]:
# Rolling regression on the filtered series, per (state, country) group,
# with the result columns renamed in the same chain.
pd_filtered_doubling=(
    pd_result_larg[['state','country','confirmed_filtered']]
    .groupby(['state','country'])
    .apply(rolling_reg,'confirmed_filtered')
    .reset_index()
    .rename(columns={'confirmed_filtered':'confirmed_filtered_DR',
                     'level_2':'index'})
)

pd_filtered_doubling.tail()

Unnamed: 0,state,country,index,confirmed_filtered_DR
250225,no,Zimbabwe,248946,3471.211293
250226,no,Zimbabwe,249159,3073.432487
250227,no,Zimbabwe,249466,2258.185317
250228,no,Zimbabwe,249708,1783.373058
250229,no,Zimbabwe,250229,1649.674012


In [40]:
# Attach the regression result of the filtered series to the result table.
# A 'confirmed_filtered_DR' column left over from an earlier run of this cell is
# dropped first; otherwise re-running would create '_x'/'_y' duplicate columns.
pd_result_larg=pd.merge(pd_result_larg.drop(columns=['confirmed_filtered_DR'],errors='ignore'),
                        pd_filtered_doubling[['index','confirmed_filtered_DR']],
                        on=['index'],how='left')
pd_result_larg.tail()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
250225,250225,2022-06-17,no,Andorra,43449.0,inf,43494.0,643.355556
250226,250226,2022-06-17,no,Algeria,265968.0,33245.17,265969.4,32043.506024
250227,250227,2022-06-17,no,Albania,277141.0,1730.798,277031.0,3377.426829
250228,250228,2022-06-17,no,Argentina,9313453.0,inf,9320820.0,1264.212434
250229,250229,2022-06-17,no,Zimbabwe,254753.0,1390.969,254699.0,1649.674012


In [41]:
# Keep confirmed_filtered_DR only for rows with more than 100 confirmed cases;
# all other rows are set to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
mask=pd_result_larg['confirmed']>100
pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)

In [43]:
# Inspect the last rows for Italy.
pd_result_larg.loc[pd_result_larg['country'].eq('Italy')].tail()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
248862,248862,2022-06-13,no,Italy,17664043.0,1177.563169,17678413.2,737.490244
249147,249147,2022-06-14,no,Italy,17703887.0,699.784936,17706353.0,667.680831
249433,249433,2022-06-15,no,Italy,17736696.0,487.29005,17737664.8,597.704602
249718,249718,2022-06-16,no,Italy,17773764.0,507.695398,17773830.7,525.782083
250001,250001,2022-06-17,no,Italy,17809934.0,485.361825,17809996.6,491.452741


In [44]:
# Write the final table as ';'-separated CSV without the pandas row index.
pd_result_larg.to_csv(
    path_or_buf='../data/processed/COVID_final_set.csv',
    index=False,
    sep=';',
)