In [1]:
import pandas as pd
import numpy as np

1. Import confirmed, recovered and deaths data set for a given country

In [176]:
confirmed = pd.read_csv('Corona-Virus-Dataset/time_series_covid_19_confirmed.csv')
recovered = pd.read_csv('Corona-Virus-Dataset/time_series_covid_19_recovered.csv')
deaths = pd.read_csv('Corona-Virus-Dataset/time_series_covid_19_deaths.csv')

2. Pre-process the datasets

In [177]:
def reformat(df, country, col_name):
    
    df = df[df['Country/Region'] == country]
    index = df.iloc[0].name

    df = df.T
    df = df.rename({index: col_name}, axis = 1)
    # get rid of columns that do not represent the number of recovered/confirmed/deaths 
    df = df.iloc[4:,:]
    
    return df

    #df = df[df[col_name] > 0]


In [178]:
recovered = reformat(recovered, 'Italy', 'Recovered cases')
confirmed = reformat(confirmed, 'Italy', 'Confirmed cases')
deaths = reformat(deaths, 'Italy', 'Deaths cases')

recovered.head()

Unnamed: 0,Recovered cases
1/22/20,0
1/23/20,0
1/24/20,0
1/25/20,0
1/26/20,0


3. Create new data frame recovered per day so we know how many people recovered from the epidemic on a given day


**problem** : Here, I see that recovered_daily contains a negative value, therefore recovered is not fully incremental. ??? Change dataset?

In [179]:
recovered_daily = recovered.diff()
recovered_daily.iloc[0,:] = 0

In [180]:
deaths_daily = deaths.diff()
deaths_daily.iloc[0] = 0

4. Generate dataset infected representing the number of infected individuals at a given time t (not the new number of infected, number of actively infected individuals)

$$\text{confirmed}(t)=\text{confirmed}(t)-\sum_{i=0}^{t}\text{recovered_daily(i)}-\sum_{i=0}^{t}\text{deaths_daily}(i)$$

In [181]:
infected_ = []
days_ = []
# i: date (time t)
# row: row
acc_deaths = 0
acc_recovered = 0

for i, row in confirmed.iterrows():
    acc_recovered += recovered_daily.loc[i]['Recovered cases']
    acc_deaths += deaths_daily.loc[i]['Deaths cases']
    
    diff = row['Confirmed cases'] - acc_recovered - acc_deaths
    
    infected_.append(diff)
    days_.append(i)

Careful, infected represents active number of infected, not new number of infected

In [182]:
infected = pd.DataFrame({'Date': days_,'Infected Cases': infected_})

infected.tail()

Unnamed: 0,Date,Infected Cases
54,3/16/20,23073
55,3/17/20,26062
56,3/18/20,28710
57,3/19/20,33190
58,3/20/20,38549


In [183]:
recovered = recovered.reset_index().rename({'index':'Date'}, axis = 1)

Now we can start working on predicting our gammas. 
Recovered is cumulative, as expected (absorbing state in our SIR model) and infected represents the number of infected individual at a certain date

In [185]:
data = recovered.merge(infected, on='Date')

In [192]:
dataframe = concat([data['Date'],data['Recovered cases'], data['Infected Cases'].shift(7),
                   data['Infected Cases'].shift(8),data['Infected Cases'].shift(9),
                   data['Infected Cases'].shift(10),data['Infected Cases'].shift(11),
                   data['Infected Cases'].shift(12),data['Infected Cases'].shift(13),
                   data['Infected Cases'].shift(14),data['Infected Cases'].shift(15),
                   data['Infected Cases'].shift(16),data['Infected Cases'].shift(17),
                   data['Infected Cases'].shift(18),data['Infected Cases'].shift(19),
                   data['Infected Cases'].shift(20),data['Infected Cases'].shift(21)], axis=1)

In [196]:
dataframe

Unnamed: 0,Date,Recovered cases,Infected Cases,Infected Cases.1,Infected Cases.2,Infected Cases.3,Infected Cases.4,Infected Cases.5,Infected Cases.6,Infected Cases.7,Infected Cases.8,Infected Cases.9,Infected Cases.10,Infected Cases.11,Infected Cases.12,Infected Cases.13,Infected Cases.14
0,1/22/20,0,,,,,,,,,,,,,,,
1,1/23/20,0,,,,,,,,,,,,,,,
2,1/24/20,0,,,,,,,,,,,,,,,
3,1/25/20,0,,,,,,,,,,,,,,,
4,1/26/20,0,,,,,,,,,,,,,,,
5,1/27/20,0,,,,,,,,,,,,,,,
6,1/28/20,0,,,,,,,,,,,,,,,
7,1/29/20,0,0.0,,,,,,,,,,,,,,
8,1/30/20,0,0.0,0.0,,,,,,,,,,,,,
9,1/31/20,0,0.0,0.0,0.0,,,,,,,,,,,,


To go on I think it would be better to have a larger data set

In [199]:
# create lagged dataset
dataframe.columns = ['Date', 'Recovered cases', 'Infected cases t-7','Infected cases t-8','Infected cases t-9',
                    'Infected cases t-10','Infected cases t-11','Infected cases t-12','Infected cases t-13',
                    'Infected cases t-14','Infected cases t-15','Infected cases t-16','Infected cases t-17',
                    'Infected cases t-18','Infected cases t-19','Infected cases t-20','Infected cases t-21']

dataframe = dataframe[~np.isnan(dataframe['Infected cases t-21'])]

# split into train and test sets
X = dataframe.values
train, test = X[1:len(X)-7], X[len(X)-7:]
train_X, train_y = train[:,0], train[:,1]
test_X, test_y = test[:,0], test[:,1]
 
# persistence model
def model_persistence(x):
	return x
 
# walk-forward validation
predictions = list()
for x in test_X:
	yhat = model_persistence(x)
	predictions.append(yhat)
test_score = mean_squared_error(test_y, predictions)
print('Test MSE: %.3f' % test_score)
# plot predictions vs expected
pyplot.plot(test_y)
pyplot.plot(predictions, color='red')
pyplot.show()

"# split into train and test sets\nX = dataframe.values\ntrain, test = X[1:len(X)-7], X[len(X)-7:]\ntrain_X, train_y = train[:,0], train[:,1]\ntest_X, test_y = test[:,0], test[:,1]\n \n# persistence model\ndef model_persistence(x):\n\treturn x\n \n# walk-forward validation\npredictions = list()\nfor x in test_X:\n\tyhat = model_persistence(x)\n\tpredictions.append(yhat)\ntest_score = mean_squared_error(test_y, predictions)\nprint('Test MSE: %.3f' % test_score)\n# plot predictions vs expected\npyplot.plot(test_y)\npyplot.plot(predictions, color='red')\npyplot.show()"

In [210]:
dataframe[~np.isnan(dataframe['Infected cases t-21'])]

Unnamed: 0,Date,Recovered cases,Infected cases t-7,Infected cases t-8,Infected cases t-9,Infected cases t-10,Infected cases t-11,Infected cases t-12,Infected cases t-13,Infected cases t-14,Infected cases t-15,Infected cases t-16,Infected cases t-17,Infected cases t-18,Infected cases t-19,Infected cases t-20,Infected cases t-21
21,2/12/20,0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22,2/13/20,0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,2/14/20,0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,2/15/20,0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25,2/16/20,0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0
26,2/17/20,0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0
27,2/18/20,0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0
28,2/19/20,0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0
29,2/20/20,0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
30,2/21/20,0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [208]:
np.isnan(1)

False