<a href="https://colab.research.google.com/github/herysedra/covid19-mankaiza-clone/blob/andrana/scripts/paper/simple_blocks/data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib
import pickle

"""
Mahasedra comments:
Here we trace most of the processing steps applied to the 'confirmed_cases' data.
"""

# Global time-series tables (confirmed cases and deaths) published as CSV
# files in the CSSEGISandData/COVID-19 GitHub repository.
confirmed_cases_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
deaths_url =  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

# Both tables are downloaded each time this cell runs (network access needed).
confirmed_cases = pd.read_csv(confirmed_cases_url, sep=',')
deaths = pd.read_csv(deaths_url, sep=',')

# Relative directories, presumably for saved figures and data files; they are
# only defined here and not used in this part of the notebook.
path_to_save = '../../figures/'
path_data = '../../data/'


In [8]:
import pymc3 as pm
import theano.tensor as tt
import theano
import datetime
import time

# First and last day of the observed data used for fitting, and the number
# of days to forecast.
date_data_begin = datetime.date(2020, 3, 1)
date_data_end = datetime.date(2020, 3, 15)
num_days_to_predict = 28

# Number of days between the start of the simulation and the first fitted
# data point. It should be significantly larger than the expected delay, in
# order to always fit the same number of data points.
diff_data_sim = 16

# Mahasedra: the simulation starts diff_data_sim days BEFORE date_data_begin
# (1 March 2020, where the fitting starts), i.e. on 14 February 2020.
# 15 March 2020 is date_data_end, not the start of the simulation.
date_begin_sim = date_data_begin - datetime.timedelta(days=diff_data_sim)


def format_date(date_py):
    """
    Format a date as 'M/D/YY' without zero padding, e.g. 2020-03-01 -> '3/1/20'.

    Mahasedra & Joely: this matches the date column labels of the
    time-series tables (such as '4/14/20'), so the result can be used to
    select columns by date.
    """
    return '{}/{}/{}'.format(date_py.month, date_py.day,
                             str(date_py.year)[2:4])


date_formatted_begin = format_date(date_data_begin)
date_formatted_end = format_date(date_data_end)

# Keep the rows whose country is Germany and the date columns running from
# date_formatted_begin to date_formatted_end (label slices in .loc include
# both end points).
germany_mask = confirmed_cases["Country/Region"] == "Germany"
germany_window = confirmed_cases.loc[germany_mask,
                                     date_formatted_begin:date_formatted_end]
# The trailing [0] keeps only the first selected row, giving a 1-D array.
cases_obs = np.array(germany_window)[0]

# Show the full table and the two representations of the start date.
print(confirmed_cases, date_data_begin, date_formatted_begin, sep="\n")

print('cases_obs from date_data_begin to date_data_end is', cases_obs, sep=" ")

type(cases_obs)
cases_obs.shape
# From the output, cases_obs is made of 15 values (shape (15,)): one per day
# from date_data_begin to date_data_end.

                Province/State         Country/Region  ...  4/14/20  4/15/20
0                          NaN            Afghanistan  ...      714      784
1                          NaN                Albania  ...      475      494
2                          NaN                Algeria  ...     2070     2160
3                          NaN                Andorra  ...      659      673
4                          NaN                 Angola  ...       19       19
..                         ...                    ...  ...      ...      ...
259  Saint Pierre and Miquelon                 France  ...        1        1
260                        NaN            South Sudan  ...        4        4
261                        NaN         Western Sahara  ...        6        6
262                        NaN  Sao Tome and Principe  ...        4        4
263                        NaN                  Yemen  ...        1        1

[264 rows x 89 columns]
2020-03-01
3/1/20
cases_obs from date_data_begin to

(15,)

In [5]:
# Same selection as for cases_obs, but without the trailing [0], so the row
# dimension of the selection is kept.
germany_rows = confirmed_cases["Country/Region"] == "Germany"
cases0_obs = np.array(
    confirmed_cases.loc[germany_rows, date_formatted_begin:date_formatted_end]
)

print(cases0_obs)
type(cases0_obs)
cases0_obs.shape
# From the output, cases0_obs has 1 row and 15 columns (shape (1, 15)).
# Indexing with [0], as done for cases_obs, picks that single row.

[[ 130  159  196  262  482  670  799 1040 1176 1457 1908 2078 3675 4585
  5795]]


(1, 15)

In [0]:
def delay_cases(new_I_t, len_new_I_t, len_new_cases_obs, delay, delay_arr):
    """
    Delay the input new_I_t by `delay` and return an array of length
    len_new_cases_obs. The initial delay of the output is set by delay_arr.

    Take care that `delay` is smaller than or equal to delay_arr, otherwise
    zeros are returned, which could potentially lead to errors.

    Also make sure that len_new_I_t is larger than
    len(new_cases_obs) - delay, otherwise the simulated data is not long
    enough to be fitted to the data.

    Mahasedra comments:
    This function constructs the newly reported cases (C_t) of the article
    (p. 11, Eq. 4). It is used below in equalities of the form
    new_cases_inferred = delay_cases(...), where:
      - new_cases_inferred is the number of new cases given by the SIR model;
      - new_cases_obs is the newly reported (observed) cases.

    It is built from two other functions, make_delay_matrix() and
    interpolate(), whose constructions are just programming details.
    """
    # Build the table of delays first, then let interpolate() pick out the
    # entries of new_I_t that correspond to `delay`.
    return interpolate(
        new_I_t,
        delay,
        make_delay_matrix(
            n_rows=len_new_I_t,
            n_columns=len_new_cases_obs,
            initial_delay=delay_arr,
        ),
    )

def make_delay_matrix(n_rows, n_columns, initial_delay=0):
    """
    Has in each entry the delay between the input with size n_rows and the
    output with size n_columns.

    Entry [r, c] equals ``initial_delay + c - r``: initial_delay on the main
    diagonal, increasing by one per step to the right and decreasing by one
    per step downwards.

    Parameters:
        n_rows: number of rows (length of the input series).
        n_columns: number of columns (length of the output series).
        initial_delay: value placed on the main diagonal (default 0).

    Returns:
        A float numpy array of shape (n_rows, n_columns).

    Mahasedra comments:
    This is the construction of make_delay_matrix().
    """
    # Float index vectors keep the float dtype that the np.zeros-based
    # construction produced; broadcasting a column against a row builds the
    # whole (n_rows, n_columns) table of offsets in one step.
    row_idx = np.arange(n_rows, dtype=float)[:, np.newaxis]
    col_idx = np.arange(n_columns, dtype=float)[np.newaxis, :]
    return initial_delay + (col_idx - row_idx)

def interpolate(array, delay, delay_matrix):
    """
    Combine the entries of `array` using weights derived from delay_matrix.

    Every entry of delay_matrix is turned into the weight
    max(1 - |entry - delay|, 0): it is 1 where the entry equals `delay`,
    falls off linearly with the distance to `delay`, and is 0 at a distance
    of one or more. The result is the dot product of `array` with that
    weight matrix, built from theano tensor operations.

    Mahasedra comments:
    This is the construction of interpolate().
    """
    distance = tt.abs_(delay_matrix - delay)
    weights = tt.maximum(1 - distance, 0)
    return tt.dot(array, weights)



In [0]:
# Construction of new_cases_inferred

"""
new_cases_inferred = delay_cases(tt.concatenate([new_I_past[-diff_data_sim:], new_I_no_change]), 
                                 len_new_I_t=diff_data_sim + num_days_to_predict, 
                                 len_new_cases_obs=num_days_to_predict, 
                                 delay=delay, delay_arr=diff_data_sim)
"""