In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
try:
    import filterpy
    import geopy
except ImportError:
    !pip install filterpy
    !pip install geopy
from filterpy.kalman import KalmanFilter, EnsembleKalmanFilter, UnscentedKalmanFilter, MerweScaledSigmaPoints
from filterpy.common import Q_discrete_white_noise, Saver
import loggings
logging.getLogger('matplotlib').setLevel(logging.WARNING)
plt.rcParams.update({'figure.max_open_warning': 0})

pd.set_option("display.max_columns", 500)

import warnings

warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'loggings'

In [None]:
# data = pd.read_csv('https://coronavirus.data.gov.uk/downloads/csv/coronavirus-cases_latest.csv')

In [None]:
path = "https://raw.githubusercontent.com/rs-delve/covid19_datasets/master/dataset/combined_dataset_latest.csv"

stringency = pd.read_csv(path, parse_dates = ['DATE'])

# stringency['country_name'] = stringency['country_name'].apply(lambda z : z.upper()) 

stringency.head()

In [None]:
stringency[stringency['ISO']=='BEL']

### Test condition

In [None]:
# region name or None for country-level aggregation
region = None # 'Flanders', 'Brussels', 'Wallonia' or None
# date range to be used (must match with '%m-%d' format)
date_range = None
# date_range = ['03-15', '05-09'] # if date_range is None, use all dates
# interpolate cases and tests values during the weekends?
interpolate_weekend = False

### Load data

In [None]:
data_file = 'https://epistat.sciensano.be/Data/COVID19BE.xlsx'
#data_file = 'data/Belgium/COVID19BE.xlsx'
xls = pd.ExcelFile(data_file)
df_hosp = pd.read_excel(xls, 'HOSP')
df_hosp.columns = map(str.lower, df_hosp.columns)
if region:
    df_hosp = df_hosp[df_hosp.region == region]
df_hosp.date = pd.to_datetime(df_hosp.date, format='%Y-%m-%d').dt.strftime('%Y-%m-%d')
df_hosp = df_hosp.sort_values('date').set_index('date')
df_hosp = df_hosp.groupby('date').sum()

In [None]:
df_hosp.head()

### Select dates of interest

In [None]:
# default is without year. if year is required, make sure to modify accordingly.
if date_range:
    if date_range[0] is None:
        date_range[0] = '00-00'
    if date_range[1] is None:
        date_range[1] = '99-99'
    df_hosp = df_hosp[(df_hosp.index >= date_range[0]) & (df_hosp.index <= date_range[1])]

In [None]:
df_hosp.index.max()

### Interpolate weekends if needed

In [None]:
def interpolate_weekend(x, col, first_friday):
    print('Interpolate for %s (Fri, Sat, Sun, Mon)' % col)
    first_friday_index = np.flatnonzero((x.index==first_friday))[0]
    for i in range(first_friday_index, x.shape[0]-3, 7):
        delta = (x.loc[x.index[i+3], col] - x.loc[x.index[i], col]) / 3
        value_sat = x.loc[x.index[i], col] + delta
        value_sun = value_sat + delta
#         print('[%s:%s] %7.1f, %7.1f > %-7.1f, %7.1f > %-7.1f, %7.1f' % (x.index[i], x.index[i+3],\
#             x.loc[x.index[i], col], x.loc[x.index[i+1], col], value_sat, x.loc[x.index[i+2], col], value_sun, x.loc[x.index[i+3], col]))
        x.loc[x.index[i+1], col] = value_sat
        x.loc[x.index[i+2], col] = value_sun
        return x

if interpolate_weekend is True:
    first_friday = '0320'
    interpolate_weekend(df_case, 'cases', first_friday)
    interpolate_weekend(df_case, 'vul_count', first_friday)
    interpolate_weekend(df_test, 'tests', first_friday)

    # recalculate vul_ratio after interpolation
    df_case['vul_ratio'] = df_case.vul_count / df_case.cases

### Kalman filter models

In [None]:
def kalman_predictor(initial_state, kf_p, kf_r, kf_q, kf_a):
    """
    We model Covid development as a dynamical system composed of 3 components:
    - measurement (observable) = case count,
    - speed (latent) = growth rate (cases per day)
    - acceleration (latent) = growth acceleration (cases per day^2)    
    - used params of kf_p=0, kf_r=10, kf_q=20
    """
    # day is our observation interval
    dt = 1
    # transition matrix (x:measurement, v:growth rate, a:growth acceleration)
    F = np.array([[1, dt, 0.5*(dt**2)], # x_new           = x_old + v*dt + 1/2*a*dt^2
                    [0, 1, dt],           # d(v_new) / dt   = v     + a*dt + 0
                    [0, 0, 1]])           # d(x_new) / dt^2 = 0     + 0    + a
    '''
    F = np.array([[1, dt, (dt**2)/2, (dt**3)/6], # x_new = x_old + v*dt + 1/2*a*dt^2 + 1/6*j*dt^3
                  [0, 1, dt, (dt**2)/2],         # d(x_new) / dt = v + a*dt + 1/2*j*dt^2
                  [0, 0, 1, dt],                 # d(v_new) / dt = a + jt
                  [0, 0, 0, 1]])                 # d(a_new) / dt = j
    '''
    # define a linear KF with position, velocity, acceleration parameters
    dim_x = F.shape[0]
    kf = KalmanFilter(dim_x=dim_x, dim_z=1)
    kf.F = F
    # state vector: initial position, velocity, acceleration
    kf.x = np.zeros(dim_x)
    kf.x[0] = initial_state
    # measuremnet matrix: can only directly measure case counts, not velocity & acceleration
    kf.H = np.zeros((1, dim_x))
    kf.H[0][0] = 1
    # covariance matrix
    kf.P *= kf_p
    # measurement noise
    kf.R = kf_r
    # process noise
    kf.Q = Q_discrete_white_noise(dim=dim_x, dt=1, var=kf_q)
    # fading factor
    kf.alpha = kf_a
    return kf

def ensemble_kalman_predictor(initial_state, kf_p, kf_r, kf_q, kf_n):
    """
    We model Covid development as a dynamical system composed of 3 components:
    - measurement (observable) = case count,
    - speed (latent) = growth rate (cases per day)
    - acceleration (latent) = growth acceleration (cases per day^2)
    """
    # day is our observation interval
    dt = 1
    # transition matrix (x:measurement, v:growth rate, a:growth acceleration)
    F = np.array([[1, dt, (dt**2)/2], # x_new           = x_old + v*dt + 1/2*a*dt^2
                  [0, 1, dt],           # d(x_new) / dt   = v     + a*dt + 0
                  [0, 0, 1]])           # d(x_new) / dt^2 = 0     + 0    + a
    ''' also consider jerk
    F = np.array([[1, dt, (dt**2)/2, (dt**3)/6], # x_new = x_old + v*dt + 1/2*a*dt^2 + 1/6*j*dt^3
                  [0, 1, dt, (dt**2)/2],         # d(x_new) / dt = v + a*dt + 1/2*j*dt^2
                  [0, 0, 1, dt],                 # d(v_new) / dt = a + jt
                  [0, 0, 0, 1]])                 # d(a_new) / dt = j
    '''
    # state vector: initial position, velocity, acceleration
    X0 = np.zeros(F.shape[0])
    X0[0] = initial_state
    # transition function
    Fx = lambda x, dt: np.dot(F, x)
    # measuremnet function
    Hx = lambda x: np.array([x[0]])
    # covariance matrix
    P = np.eye(F.shape[0]) * kf_p
    # measurement noise
    R = kf_r
    # process noise
    Q = Q_discrete_white_noise(dim=F.shape[0], dt=1, var=kf_q)
    # let's make it
    kf = EnsembleKalmanFilter(x=X0, P=P, dim_z=1, dt=1, N=kf_n, hx=Hx, fx=Fx)
    kf.R = R
    kf.Q = Q
    return kf

### Forecast using Kalman filter

In [None]:
def kalman_forecast(series, days, kf_type, params):
    """
    Forecast based on history data.
    
    Input
    -----
    series: Pandas Series object with dates being index in ascending order.
    days: Prediction window length
    kf_type: linear, unscented, ensemble
    kf_*: Parameters for Kalman filter. Default values work reasonably well on several countries.
    
    Output
    ------
    Pandas DataFrame object with the following columns
    pred_raw: Raw prediction
    pred: Final prediction (with smoothing, etc.)
    ci_*: Lower and upper bounds of CI
    """
    if days <= 0:
        raise ValueError
    dates = series.index
    if kf_type == 'linear':
        if params is None:
            params = {'kf_p':1, 'kf_r':4, 'kf_q':0.1, 'kf_a':1}
        if params['kf_a'] < 1:
            raise ValueError
        kf = kalman_predictor(series[dates[0]], params['kf_p'], params['kf_r'], params['kf_q'], params['kf_a'])
    elif kf_type == 'ensemble':
        if params is None:
            params = {'kf_p':100, 'kf_r':1000, 'kf_q':0.1, 'kf_n':1000}
        kf = ensemble_kalman_predictor(series[dates[0]], params['kf_p'], params['kf_r'], params['kf_q'], params['kf_n'])
    else:
        raise NotImplementedError
    
    # fit model
    for measurement in series:
        kf.predict()
        kf.update([measurement])
    
    # start forecasting, starting from the last observation date
    
    if isinstance(dates[-1], str):
        
         last_date = dt.datetime.strptime(dates[-1], '%Y-%m-%d')
            
    else:
        
        last_date = dates[-1]
        
    predictions = []
    pred_acc = []
    pred_vel = []
    pred_dates = []
    ci_bounds = []
    for day in range(days):
        future_date = (last_date + dt.timedelta(days=day+1))
        pred_dates.append(future_date)
        kf.predict()
        predictions.append(kf.x[0])
        pred_acc.append(kf.x[2])
        pred_vel.append(kf.x[1])
        ci_bounds.append(kf_ci_bound(kf))
    
    # smoothen and add confidence intervals
    predictions = np.array(predictions)
    predictions[np.where(predictions < 0)[0]] = 0
    #smooth_buffer = list(series[dates[len(dates)-days+1:]])
    #predictions_smooth = smoother(smooth_buffer + predictions, days)[days-1:]
    predictions_smooth = smoother(predictions, days)
    ci_bounds = np.array(ci_bounds)
    ci_upper = predictions_smooth + ci_bounds
    ci_lower = predictions_smooth - ci_bounds
    ci_lower[np.where(ci_lower < 0)[0]] = 0
    
    df_pred = pd.DataFrame({'pred_raw':predictions, 'pred':predictions_smooth, 'pred_vel': pred_vel, 
                          'pred_acc': pred_acc, 'ci_lower':ci_lower, 'ci_upper':ci_upper}, index=pred_dates)
    return df_pred

def kf_ci_bound(kf):
    """
    Compute 95% confidence interval from KF's positive semi-definite covariance matrix
    
    returns a positive single-sided boundary (half) of the interval
    -> CI = kf.x[0] +- kf_ci_bound(kf)
    """
    return 1.96 * (np.diag(kf.P)[0])**0.5

def smoother(x, winsize, method = 'slide'):
    if method == 'slide':
        x_smooth = []
        for i in range(len(x)):
            x_smooth.append(np.mean(x[max(0, i-winsize+1):i+1]))
    elif method == 'slide_recurse':
        x_smooth = predictions.copy()
        for i in range(len(x)):
            x_smooth[i] = np.mean(x_smooth[max(0, i-winsize):i+1])
    else:
        raise NotImplementedError
    assert(len(x) == len(x_smooth))
    return np.array(x_smooth)

### Helpers

In [None]:
def get_stats(observations, predictions):
    r2 = r2_score(observations, predictions)
    mae = mean_absolute_error(observations, predictions)
    rmse = mean_squared_error(observations, predictions) ** 0.5
    return r2, mae, rmse

def rescale(df_x):
    x = df_x.copy()
    x -= x.min()
    x /= x.max()
    return x

### Performance testing and debugging

In [None]:
def kalman_test(series, winsize, kf_type, params=None):
    """
    To test the performance compared with true values along all data points
    
    Input
    -----
    series: Pandas Series object with dates being index in ascending order.
    winsize: Prediction window size in number of days.
    kf_type: linear, unscented, ensemble
    kf_*: factors for Kalman filter. Default values work reasonably well.
          For long-term prediction, usually increasing the fading factor (kf_a) helps.
    
    Output
    ------
    Pandas DataFrame object with the following columns
    pred_raw: Raw prediction
    pred: Final prediction (with smoothing, etc.)
    obs: Ground-truth values
    history: recursive prediction history at each time point (for debugging purpose)
    """
    if winsize <= 0:
        raise ValueError
    observations = []
    predictions = []
    predictions_acc = []
    predictions_vel = []
    pred_dates = []
    history = []
    ci_bounds = []
    dates = series.index.to_numpy()
    if kf_type == 'linear':
        if params is None:
            params = {'kf_p':1, 'kf_r':4, 'kf_q':0.1, 'kf_a':1}
        if params['kf_a'] < 1:
            raise ValueError
        kf = kalman_predictor(series[dates[0]], kf_type, params['kf_p'], params['kf_r'], params['kf_q'], params['kf_a'])
    elif kf_type == 'ensemble':
        if params is None:
            params = {'kf_p':100, 'kf_r':1000, 'kf_q':0.1, 'kf_n':1000}
        kf = ensemble_kalman_predictor(series[dates[0]], params['kf_p'], params['kf_r'], params['kf_q'], params['kf_n'])
    else:
        raise NotImplementedError(kf_type)
    
    for i in range(dates.shape[0]-winsize):
        # save the current state of the model
        saver = Saver(kf, skip_callable=True, save_current=True)
        
        # recursive prediction
        history_window = [kf.x[0]]
        date_window = [dates[i]]
        for day in range(winsize):
            kf.predict()
            history_window.append(kf.x[0])
            date_window.append(dates[i+day+1])
        history.append(pd.DataFrame({'pred':history_window}, index=date_window))
        pred_date = dates[i+winsize]
        pred_dates.append(pred_date)
        prediction = kf.x[0]
        predictions.append(prediction)
        predictions_vel.append(kf.x[1])
        predictions_acc.append(kf.x[2])
        observation = series[pred_date]
        observations.append(observation)
        ci_bounds.append(kf_ci_bound(kf))
        
        # restore model states and update to next day
        for attr in saver.keys:
            try:
                setattr(kf, attr, getattr(saver, attr)[-1])
            except AttributeError: # property decoration causes problem
                #print('.%s skip' % attr)
                continue
        kf.predict() ## is this predict call necessary at all?
        kf.update([series[dates[i+1]]])
    
    # smoothen output
    predictions_smooth = smoother(predictions, winsize)
    predictions_smooth[predictions_smooth < 0] = 0
    #acc_smooth = smoother(predictions_acc, winsize)
    #vel_smooth = smoother(predictions_vel, winsize)
    acc_smooth = predictions_acc
    vel_smooth = predictions_vel
    ci_bounds = np.array(ci_bounds)
    ci_lower = predictions_smooth - ci_bounds
    ci_upper = predictions_smooth + ci_bounds
    ci_lower[np.where(ci_lower < 0)[0]] = 0
    df_pred = pd.DataFrame({'pred_raw':predictions[winsize:], 'pred':predictions_smooth[winsize:],
                          'obs':observations[winsize:], 'history':history[winsize:],
                          'ci_lower':ci_lower[winsize:], 'ci_upper':ci_upper[winsize:],
                          'pred_acc':acc_smooth[winsize:], 'pred_vel':vel_smooth[winsize:]},
                           index=pred_dates[winsize:])
    return df_pred

### Predict + plot functions

In [None]:
def test_plot(series, winsize, kf_type, params=None, title='', y_lims=None):
    """
    Make a prediction and plot at once for convenience.
    
    """
    PLOT_RAW = True
    SHOW_TRAJECTORY = True
    DO_FORECAST = True
    
    result = kalman_test(series, winsize, kf_type, params)
    
    if DO_FORECAST:
        forecast = kalman_forecast(series, winsize, kf_type, params)
    
    r2, mae, rmse = get_stats(result.obs, result.pred)
    #r2, mae, rmse = get_stats(result.obs, result.pred_raw)
    
    plt.figure(figsize=[13,4])
    plt.plot(series.index, series, 'o', color='g', linewidth=3)
    plt.plot(result.index, result.pred, '-', color=(1,0,0,0.8), linewidth=3)
    if PLOT_RAW:
        plt.plot(result.index, result.pred_raw, 'o', color=(1,0,0,0.1), linewidth=3)
        if DO_FORECAST:
            plt.plot(forecast.index, forecast.pred_raw, 'o', color=(1,0,0,0.1), linewidth=3)
    plt.fill_between(result.index, result.ci_lower, result.ci_upper, color=(1,0,0,0.1))
    if DO_FORECAST:
        plt.plot(forecast.index, forecast.pred, color='r', linewidth=3)
        plt.fill_between(forecast.index, forecast.ci_lower, forecast.ci_upper, color=(1,0,0,0.1))
    if SHOW_TRAJECTORY:
        for df_h in result.history:
            plt.plot(df_h.index, df_h.pred, '-', color=(0,0,1,0.2))
    if y_lims:
        plt.ylim(y_lims)
    plt.xticks(rotation=45, fontsize='small')
    plt.title(r'%s ($r^2$:%.3f, mae:%d, rmse:%d=%.1f%% of global max %d)' % (title, r2, mae, rmse, rmse * 100 / series.max(), series.max()))
    plt.legend(['True', 'Prediction'])
    plt.grid(True, 'both')
    plt.tight_layout()
    
    plt.figure(figsize=[13,2])
    plt.plot(series.index, [0] * len(series.index), 'k')
    if DO_FORECAST:    
        plt.plot(forecast.index, [0]*len(forecast.index), 'k')
    plt.plot(result.index, result.pred_vel, '-', color=(0,0,1,0.8), linewidth=3)
    plt.gca().axes.xaxis.set_ticklabels([])
    plt.grid(True, 'both')
    plt.title('Estimated growth rate')
    plt.tight_layout()
    
    plt.figure(figsize=[13,2])
    plt.plot(series.index, [0]*len(series.index), 'k')
    if DO_FORECAST:
        plt.plot(forecast.index, [0]*len(forecast.index), 'k')
    plt.plot(result.index, result.pred_acc, '-', color=(0,0,1,0.8), linewidth=3)
    plt.gca().axes.xaxis.set_ticklabels([])
    plt.grid(True, 'both')
    plt.title('Estimated growth acceleration')
    plt.tight_layout()


In [None]:
def forecast_plot(series, winsize, kf_type, params=None, title='', y_lims=None):
    """
    Make a prediction and plot at once for convenience.    
    """
    PLOT_RAW = True
    
    forecast = kalman_forecast(series, winsize, kf_type, params)
    plt.figure(figsize=[13,5])
    plt.plot(series.index, series, 'o', color='g', linewidth=3)
    plt.plot(forecast.index, forecast.pred, '-', color=(1,0,0,0.9), linewidth=3)
    if PLOT_RAW:
        plt.plot(forecast.index, forecast.pred_raw, '.', color='r')
    plt.fill_between(forecast.index, forecast.ci_lower, forecast.ci_upper, color=(1,0,0,0.1))
    if y_lims:
        plt.ylim(y_lims)
    plt.xticks(rotation=45, fontsize='small')
    plt.title(title)
    plt.legend(['True', 'Prediction'])
    plt.grid(True, 'both')
    plt.tight_layout()

### ICU prediction up to several days in the future

In [None]:
target = 'total_in_icu'
#kf_type = 'linear'
#kf_params = {'kf_p':1, 'kf_r':4, 'kf_q':0.10, 'kf_a':1}
kf_type = 'ensemble'
kf_params = {'kf_p':1, 'kf_r':4, 'kf_q':0.1, 'kf_n':1000}
winsizes = [7]
show_trajectory = False
for winsize in winsizes:
    title = 'Belgium: %d-day prediction of required ICU beds' % winsize
    test_plot(df_hosp[target], winsize, kf_type, kf_params, title)

In [None]:
stringency.head()

### Predicted Cases up to several days in the future

In [None]:
## TODO: Write a wrapper function for the final Dataframe generation. Imprtant for WW Dataset Generation ...

In [None]:
target = 'cases_total'

winsize = 6

In [None]:
belgium_cases = stringency[stringency['ISO']=='BEL'][['DATE', 'country_name', 'ISO', 'cases_new']]

belgium_cases.set_index('DATE', inplace = True)

belgium_cases.head()

In [None]:
# plt.rcParams["figure.figsize"] = (10,6)

# belgium_cases = interpolate_weekend(belgium_cases, 'cases_new', '2020-01-03')

# plt.plot(belgium_cases['cases_new']) 

In [None]:
belgium_new_cases_preds = kalman_forecast(belgium_cases[target], winsize, 'ensemble', None)

belgium_new_cases_preds

In [None]:
belgium_new_cases_test = kalman_test(belgium_cases[target], winsize, 'ensemble')

belgium_new_cases_test.tail()

In [None]:
belgium_new_cases_preds = pd.concat([belgium_new_cases_test, belgium_new_cases_preds])

belgium_new_cases_preds.iloc[:,-20:]

In [None]:
belgium_cases.index

In [None]:
# kf_type = 'ensemble'
# kf_params = {'kf_p':1, 'kf_r':4, 'kf_q':0.1, 'kf_n':1000}
# winsizes = [6]
# show_trajectory = True
# for winsize in winsizes:
#     title = 'Belgium: %d-day prediction of Covid-19 Cases' % winsize
#     test_plot(belgium_cases[target], winsize, kf_type, kf_params, title)

In [None]:
belgium_df = belgium_new_cases_preds[['pred', 'ci_lower', 'ci_upper', 'pred_acc', 'pred_vel']]

belgium_df['country'] = 'belgium'

belgium_df.reset_index(inplace = True)

belgium_df.rename(columns = {'index' : 'DATE'}, inplace=True)

belgium_df['DATE'] = pd.to_datetime(belgium_df['DATE'], format ="%Y-%m-%d")

belgium_df.head()

In [None]:
stringency_belgium = stringency[['DATE', 'ISO', 'npi_stringency_index', 'tests_new_per_thousand', 'stats_population_density' , 'stats_population_urban',  'country_name']]

stringency_belgium = stringency_belgium[stringency_belgium['ISO']=='BEL'].drop_duplicates()

stringency_belgium.head()

In [None]:
belgium_risk_df.shape

In [None]:
belgium_risk_df = belgium_df.merge(stringency_belgium, left_on = 'DATE' , right_on = 'DATE', how = 'left')

# assert belgium_risk_df.shape[0] == belgium_df.shape[0]

belgium_risk_df.head()

In [None]:
belgium_risk_df

In [None]:
belgium_risk_df['npi_stringency_index'].fillna(method='ffill', inplace = True)

belgium_risk_df['infections_var'] = belgium_risk_df['pred'].diff()

In [None]:
belgium_risk_df

In [None]:
from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="my_geocoder")
# location = geolocator.geocode("Belgium")
# belgium_risk_df['latitude'] = location.latitude
# belgium_risk_df['longitude'] = location.longitude

# belgium_risk_df

In [None]:
# from math import exp

# def risk_index(cases_acceleration, npi_stringency_index, **kwargs):
    
#     raw_index =  (0.6 * cases_acceleration - 0.4 * (npi_stringency_index/100))
    
#     return 1 / (1 + exp(-raw_index))

# ## The lower the stringency idenx --> Higher risk 
# ## Mobility Data ? 


# def discrete_risk_idx(risk_index):
    
#     if risk_index < 0.3:
        
#         return "low risk"
    
#     elif risk_index < 0.5:
            
#             return 'moderate risk'
    
#     else:
        
#         return "high risk"

# def generate_predictions(iso_country, target, winsize):
    
#     '''
    
#     '''
    
#     print("Processing {} Data".format(iso_country))
    
#     print("="*50)

    
#     ## Extract Target Data from Main Dataframe
    
#     columns = ['DATE', 'country_name', 'ISO'] + [target]
    
#     filtered_data  = stringency[stringency['ISO']==iso_country][columns]
    
#     filtered_data.set_index('DATE', inplace = True)
    
#     ## Train KF and Predict History , Predict Future
    
#     print("Training KF and Predicting {}, {} days ahead".format(target, winsize))
    
#     print("-"*32)
    
#     predict_hist = kalman_test(filtered_data[target], winsize, 'ensemble')
    
#     predict_future = kalman_forecast(filtered_data[target], winsize, 'ensemble', None)

#     kf_predictions_all = pd.concat([predict_hist, predict_future])
    
#     kf_predictions_all = kf_predictions_all[['pred', 'ci_lower', 'ci_upper', 'pred_acc', 'pred_vel']]

#     kf_predictions_all['country'] = iso_country

#     kf_predictions_all.reset_index(inplace = True)

#     kf_predictions_all.rename(columns = {'index' : 'DATE'}, inplace=True)

#     kf_predictions_all['DATE'] = pd.to_datetime(kf_predictions_all['DATE'], format ="%Y-%m-%d")

#     print("Reading and merging with Stringency and Complementary Data")
    
#     print("-"*32)
    
#     stringency_data = stringency[stringency['ISO']==iso_country].drop_duplicates()
    
#     stringency_data = stringency_data[['DATE', 'npi_stringency_index', 'tests_new_per_thousand', 'stats_population_density' , 'stats_population_urban',  'stats_population']]

#     stringency_merged_df  = kf_predictions_all.merge(stringency_data, left_on = 'DATE' , right_on = 'DATE', how = 'left')

#     print("Extrapolating Stringency Indices and Static Data")
    
#     print("-"*32)

#     # assert belgium_risk_df.shape[0] == belgium_df.shape[0]
    
#     stringency_merged_df['npi_stringency_index'].fillna(method='ffill', inplace = True)
    
#     stringency_merged_df['stats_population_density'].fillna(method='ffill', inplace = True)
    
#     stringency_merged_df['stats_population_urban'].fillna(method='ffill', inplace = True)
    
#     stringency_merged_df['stats_population'].fillna(method='ffill', inplace = True)

#     stringency_merged_df['infections_var'] = stringency_merged_df['pred'].pct_change()
    
#     print("Calculating the Risk Index")
    
#     print("-"*32)

#     stringency_merged_df['risk_index'] = stringency_merged_df[['infections_var', 'npi_stringency_index']].apply(lambda x: risk_index(x[0], x[1]), axis =1)
    
#     stringency_merged_df['risk_index_disc'] = stringency_merged_df['risk_index'].apply(lambda x: discrete_risk_idx(x))

#     print("Adding Geolocation Data")
    
#     print("-"*32)

#     geolocator = Nominatim(user_agent = "my_geocoder")
    
#     location = geolocator.geocode(iso_country)
    
#     stringency_merged_df['latitude'] = location.latitude
    
#     stringency_merged_df['longitude'] = location.longitude
    
#     return stringency_merged_df


In [None]:
france_df = generate_predictions('FRA', 'cases_total', 6)

belgium_df = generate_predictions('BEL', 'cases_total', 6)

germany_df = generate_predictions('DEU', 'cases_total', 6)

england_df = generate_predictions('GBR', 'cases_total', 6)

spain_df = generate_predictions('ESP', 'cases_total', 6)

italy_df = generate_predictions('ITA', 'cases_total', 6)

morocco_df = generate_predictions('MAR', 'cases_total', 6)


In [None]:
france_df.tail()

In [None]:
belgium_df.head()

In [None]:
plt.plot(stringency[stringency['ISO']=='FRA']['cases_total'])

In [None]:
df_all = pd.concat([france_df, belgium_df, germany_df, england_df, spain_df, italy_df, morocco_df])

df_all.head()

In [None]:
df_all[(df_all['country'] == 'GBR')]

In [None]:
france_cases = stringency[stringency['ISO']=='FRA'][['DATE', 'country_name', 'ISO', 'cases_new']]

france_cases.set_index('DATE', inplace = True)

france_cases.head()

In [None]:
france_new_cases_preds = kalman_forecast(france_cases[target], winsize, 'ensemble', None)

france_new_cases_preds

In [None]:
france_new_cases_test = kalman_test(france_cases[target], winsize, 'ensemble')

france_new_cases_test.tail()

## Regional and Departmental Risk Index

### Reading UK Data at the Regional and the ULTA level

In [None]:
%%time

import requests

## Reading UK Regional and LTLA Reported Cases, source: https://coronavirus.data.gov.uk/

filename = 'https://coronavirus.data.gov.uk/downloads/csv/coronavirus-cases_latest.csv'

r = requests.get(filename, stream=True)

uk_regional_cov19 = pd.read_csv(r.raw)


## Reading the Health Life Expectancy for 65+ population , 
## https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/healthandlifeexpectancies/datasets/healthstatelifeexpectancyatbirthandatage65bylocalareasuk

xls = pd.ExcelFile('/project_data/data_asset/hsleatbirthandatage65byukla201618.xlsx')

uk_regional_hle_males = pd.read_excel(xls, 'HE - Male at 65', skiprows = 3)

uk_regional_hle_males['sex'] = 'male'

uk_regional_hle_females = pd.read_excel(xls, 'HE - Female at 65', skiprows = 3)

uk_regional_hle_females['sex'] = 'female'

uk_regional_hle = pd.concat([uk_regional_hle_males, uk_regional_hle_females], axis = 0)

## Reading the UK Pop. Stats

xls = pd.ExcelFile('/project_data/data_asset/ukmidyearestimates20192020ladcodes.xls')

uk_pop_stats = pd.read_excel(xls, 'MYE2 - Persons', skiprows = 4)

## Reading UK Health Systems

uk_healthcare_system = pd.read_csv('/project_data/data_asset/20191015_stateofcare1819_ratingsdata.csv', encoding = 'latin')


In [None]:
uk_regional_hle[uk_regional_hle['Area Codes']  == 'E09000027']


In [None]:
uk_pop_stats.rename(columns = {'+90': 90})

age_cols = np.arange(65, 90)

uk_pop_stats['pop_above_65'] = uk_pop_stats[age_cols].sum(axis = 1)

uk_pop_stats.rename(columns = {'Geography1': 'area_type', 'Code': 'area_code', 'Name': 'area_name'}, inplace = True)

uk_pop_stats = uk_pop_stats[['area_code', 'area_name', 'area_type', 'All ages', 'pop_above_65']]

uk_pop_stats['area_name'] = uk_pop_stats['area_name'].apply(lambda z : z[0].upper() + z[1:].lower() if isinstance(z, str) else z)

uk_pop_stats['area_type'] = uk_pop_stats['area_type'].apply(lambda z : z.upper() if isinstance(z, str) else z)

uk_pop_stats.head()


In [None]:
uk_healthcare_system.head()

In [None]:
uk_regional_cov19.head()

In [None]:
uk_regional_cov19['Specimen date'] = pd.to_datetime(uk_regional_cov19['Specimen date'], format = "%Y-%m-%d")

uk_regional_cov19.sort_values('Specimen date', ascending = True, inplace = True)

uk_regional_cov19.set_index('Specimen date', inplace = True)

# uk_regional_cov19['Area name'] = uk_regional_cov19['Area name'].apply(lambda x : x.upper())

uk_regional_cov19['Area type'] = uk_regional_cov19['Area type'].apply(lambda x : x.upper())

uk_regional_cov19['Area type'] = uk_regional_cov19['Area type'].apply(lambda x : 'COUNTRY' if x == 'NATION' else x)

uk_regional_cov19.rename(columns = {'Area type': 'area_type', 'Area name': 'area_name', 'Area code': 'area_code'}, inplace = True)

uk_regional_cov19.head()


In [None]:
uk_regional_cov19['area_type'].unique()

In [None]:
stringency[stringency['ISO']=='GBR']

In [None]:
# series = uk_regional_cov19[uk_regional_cov19['Area type']=='Nation']

# predict_hist = kalman_test(series['Cumulative lab-confirmed cases'], 6, 'ensemble')

# predict_future = kalman_forecast(series['Cumulative lab-confirmed cases'], 6, 'ensemble', None)

# kf_predictions_all = pd.concat([predict_hist, predict_future])

# kf_predictions_all = kf_predictions_all[['pred', 'ci_lower', 'ci_upper', 'pred_acc', 'pred_vel']]

# kf_predictions_all.loc[:,'real_figures'] =  pd.Series(real_hist)

# kf_predictions_all 

In [None]:
from math import exp

def risk_index(growth_rate, npi_stringency_index, population_density, **kwargs):
    
    raw_index =  (0.5 * growth_rate  + 0.2 * (population_density/100) - 0.3 * (npi_stringency_index/100))
    
    raw_index = 100 * 1 / (1 + exp(-0.1 * raw_index))
        
    return raw_index


def discrete_risk_idx(risk_index):
    
    if risk_index < 20:
        
        return "0 - 20"
    
    elif risk_index < 50:
            
        return "20 - 50"
    
    elif risk_index < 70:
            
        return "50 - 70"
        
    elif risk_index < 90:
            
        return "70 - 90"
        
    else:
        
        return "90 - 100"
    
## stringency + mobility --> population's behaviour 
## WS2 --> regional's mood and awarness could be an input 


def generate_predictions_granular(df, target, pop_stats, winsize, country_name, country_iso_code, area_type, area_name, geo_locate):
    
    '''
    
    '''
    
    print("Predicting {} Days ahead".format(winsize))
    
    print("="*50)
    
    columns = ['area_name', 'area_type', 'area_code'] + [target]
    
    series = df[(df['area_name'] == area_name) & (df['area_type'] == area_type)][columns]
    
    print("Series Shape: {}".format(series.shape))
    
    if len(series) > 0 :
        
       ## Train KF and Predict History , Predict Future

        print("Training KF and Predicting {}, {} days ahead".format(target, winsize))

        print("-"*32)

        predict_hist = kalman_test(series[target], winsize, 'linear')

        predict_future = kalman_forecast(series[target], winsize, 'linear', None)

        kf_predictions_all = pd.concat([predict_hist, predict_future])

        kf_predictions_all = kf_predictions_all[['pred', 'ci_lower', 'ci_upper', 'pred_acc', 'pred_vel']]
        
        kf_predictions_all.loc[:,'real_figures'] =  pd.Series(series[target])
        
        kf_predictions_all['country'] = country_name

        kf_predictions_all['area_name'] = series['area_name'][0]

        kf_predictions_all['area_type'] = series['area_type'][0]

        kf_predictions_all.reset_index(inplace = True)

        kf_predictions_all.rename(columns = {'index' : 'DATE'}, inplace=True)

        kf_predictions_all['DATE'] = pd.to_datetime(kf_predictions_all['DATE'], format ="%Y-%m-%d")

        print("Reading and merging with Stringency and Complementary Data")

        print("-"*32)

        stringency_data = stringency[stringency['ISO']==country_iso_code].drop_duplicates()

        stringency_data = stringency_data[['DATE', 'npi_stringency_index', 'tests_new_per_thousand', 'stats_population_density' , 'stats_population_urban',  'stats_population', 

                                           'mobility_retail_recreation', 'mobility_grocery_pharmacy', 'mobility_parks', 'mobility_transit_stations', 'mobility_workplaces']]

        stringency_merged_df  = kf_predictions_all.merge(stringency_data, left_on = 'DATE' , right_on = 'DATE', how = 'left')

        print("Merging with Regional Population Data")

        print("-"*32)
        
        if country_name != 'Italy':

            area_code = series[(series['area_name'] == area_name)]['area_code'][0]  

            if len(pop_stats[pop_stats['area_code'] == str(area_code)]) > 0:

                if country_name == 'France': 

                     stringency_merged_df['area_population_density'] = int(pop_stats[(pop_stats['area_code']==str(area_code)) & (pop_stats['area_type'] == area_type)]['All ages'])*100.00/int(pop_stats[pop_stats['area_name'] == country_name]['All ages'].values[0])


                elif country_name == 'England': 

                     stringency_merged_df['area_population_density'] = int(pop_stats[(pop_stats['area_code']==str(area_code))]['All ages'])*100.00/int(pop_stats[pop_stats['area_name'] == country_name]['All ages'].values[0])      


            else:

                stringency_merged_df['area_population_density'] = 0     
        
        else:
            
            print("Processing Exceptional Case of Italy")
            
            print("-"*32)
            
            if len(pop_stats[(pop_stats['area_name'] == area_name)]) > 0 :
            
                    stringency_merged_df['area_population_density'] = int(pop_stats[(pop_stats['area_name'] == area_name)]['All ages'])*100.00/int(pop_stats[pop_stats['area_name'] == country_name]['All ages'].values[0])
            
            else:
                
                   stringency_merged_df['area_population_density'] = 0 
                    
        print("Extrapolating Stringency Indices and Static Data")

        print("-"*32)
        
        ### --> TBD: should be updated with Predictions instead

        stringency_merged_df['npi_stringency_index'].fillna(method='ffill', inplace = True)

        stringency_merged_df['stats_population_density'].fillna(method='ffill', inplace = True)

        stringency_merged_df['stats_population_urban'].fillna(method='ffill', inplace = True)

        stringency_merged_df['stats_population'].fillna(method='ffill', inplace = True)

        stringency_merged_df['area_population_density'].fillna(method = 'ffill', inplace = True)

    #     stringency_merged_df['infections_var'] = stringency_merged_df['pred'].pct_change()

        print("Calculating the Risk Index")

        print("-"*32)

        stringency_merged_df['risk_index'] = stringency_merged_df[['pred_vel', 'npi_stringency_index', 'area_population_density']].apply(lambda x: risk_index(x[0], x[1], x[2]), axis =1)

        stringency_merged_df['risk_index_disc'] = stringency_merged_df['risk_index'].apply(lambda x: discrete_risk_idx(x))

        if geo_locate:
            
            print("Adding Geolocation Data")

            print("-"*32)

            geolocator = Nominatim(user_agent = "my_geocoder")

            location = geolocator.geocode(area_name)

            stringency_merged_df['latitude'] = location.latitude

            stringency_merged_df['longitude'] = location.longitude
        
        else:
            
            stringency_merged_df['latitude'] = None

            stringency_merged_df['longitude'] = None


        return stringency_merged_df


In [None]:
# columns = ['Area name', 'Area type', 'Area code'] + ['Cumulative lab-confirmed cases']

# series = series[series['Area type']!='Lower tier local authority']

# series = uk_regional_data[uk_regional_data['Area name']=='Kent'][columns]

# dates = series.index

# series['Cumulative lab-confirmed cases'][dates[0]]


# generate_predictions_granular(uk_regional_data, 'Cumulative lab-confirmed cases', 6, 'ENGLAND', 'GBR', 'Buckinghamshire')

In [None]:
# (series.reset_index().groupby(['Specimen date', 'Area name', 'Area type', 'Area code'], as_index=False).apply(lambda x: x if len(x)==1 else x.iloc[[-2]])

#    .reset_index(level=0, drop=True))

## Processing Risk Index for England

In [None]:
uk_regional_cov19.dtypes

In [None]:
# int(uk_pop_stats[uk_pop_stats['area_name'] == 'ENGLAND']['All ages'].values[0])

# series = uk_regional_cov19[uk_regional_cov19['Area type'] == 'NATION']

# area_code = series[(series['Area name'] == 'ENGLAND')]['Area code'][0]  

# uk_pop_stats[(uk_pop_stats['area_code']==str(area_code)) & (uk_pop_stats['area_type'] == 'NATION')]['All ages']

uk_pop_stats[uk_pop_stats['area_code'] == 'E92000001']

In [None]:
uk_regional_cov19.head()

In [None]:
## Sanity Chack for EAST MIDLANDS

In [None]:
%%time

# area_types_uk = uk_regional_data['Area type'].unique()

area_types_uk = ['COUNTRY', 'REGION', 'UPPER TIER LOCAL AUTHORITY']

geo_locate = True

all_areas_uk_risk_index = pd.DataFrame()

for area_type_i in area_types_uk: 
    
    uk_regional_data_i = uk_regional_cov19[uk_regional_cov19['area_type'] == area_type_i]
    
    geo_list = uk_regional_data_i['area_name'].unique()
    
    all_areas_names_uk = pd.DataFrame()
    
    for ii in geo_list:
        
        print("Generating risk index for: {} - {}".format(area_type_i, ii))

        kf_predictions_all = generate_predictions_granular(uk_regional_data_i, 'Cumulative lab-confirmed cases', uk_pop_stats, 6, 'England', 'GBR', area_type_i, ii, geo_locate)

        all_areas_names_uk = all_areas_names_uk.append(kf_predictions_all)
    
    all_areas_uk_risk_index = all_areas_uk_risk_index.append(all_areas_names_uk)
                
all_areas_uk_risk_index.head()


In [None]:
all_areas_uk_risk_index[all_areas_uk_risk_index['area_name'] == 'Leicester']

In [None]:
all_areas_uk_risk_index.head()

In [None]:
# length = np.arange(all_areas_uk_risk_index.shape[0])

plt.hist(all_areas_uk_risk_index[['risk_index']].to_numpy().flatten())

In [None]:
# !pip install folium 

import folium

regions = uk_regional_data['Area name'].unique()

geo_loc = []

for i in regions:
    
    geolocator = Nominatim(user_agent = "my_geocoder")
    
    location = geolocator.geocode(i)
    
    geo_loc.append([location.latitude, location.longitude])
    

map = folium.Map(location=[38.9, -77.05], zoom_start=12)

for point in range(0, len(geo_loc)):
    
    folium.Marker(geo_loc[point], popup = geo_loc[point]).add_to(map)

map

In [None]:
plt.figure(figsize=[13,4])
plt.plot(series.index, series['Cumulative lab-confirmed cases'], 'o', color='g', linewidth=3)
plt.plot(predictions.index, predictions.pred, '-', color=(1,0,0,0.8), linewidth=3)


### Processing France Data




In [None]:
codes_officiel_france = pd.read_csv('/project_data/data_asset/code-officiel-geographique-2019-regions-et-departement (1).csv', sep = ";")

codes_officiel_france.head()


In [None]:
## Reading Infections Data

fr_infections_regions = pd.read_csv('https://www.data.gouv.fr/fr/datasets/r/ad09241e-52fa-4be8-8298-e5760b43cae2', sep = ";")

fr_infections_regions_agg = fr_infections_regions[['reg', 'jour', 'P']].groupby(['reg', 'jour'])['P'].sum().reset_index()

fr_infections_regions_agg = fr_infections_regions_agg.merge(codes_officiel_france[['Code INSEE Région', 'Nom région']], left_on = 'reg', right_on = 'Code INSEE Région')


In [None]:
fr_infections_regions_agg.rename(columns={'Nom Région': 'area_name', 'Code INSEE Région': 'area_code', 'jour': 'Specimen date'}, inplace = True)

fr_infections_regions_agg.drop('reg', inplace = True, axis =1 )

fr_infections_regions_agg['area_type'] = 'Region'

print("The dataset contains {} duplicates".format(fr_infections_regions_agg.duplicated().sum()))

fr_infections_regions_agg.drop_duplicates(inplace = True)

fr_infections_regions_agg.head()


In [None]:
fr_infections_dep = pd.read_csv('/project_data/data_asset/donnees-hospitalieres-covid19-2020-06-30-19h00.csv', sep = ';')

fr_infections_dep_agg = fr_infections_dep[['dep', 'jour', 'hosp']].groupby(['dep', 'jour'])['hosp'].sum().reset_index()

fr_infections_dep_agg = fr_infections_dep_agg.merge(codes_officiel_france[['Code INSEE Département', 'Nom Département majuscule']], left_on = 'dep', right_on = 'Code INSEE Département')

fr_infections_dep_agg['Area type'] = 'DEPARTMENT'

print("The dataset contains {} duplicates".format(fr_infections_dep_agg.duplicated().sum()))

fr_infections_dep_agg.drop_duplicates(inplace = True)

fr_infections_dep_agg.head()

In [None]:
fr_infections_dep_agg.rename(columns={'Nom Département majuscule': 'Area name', 'Code INSEE Département': 'Area code', 'jour': 'Specimen date'}, inplace = True)

fr_infections_dep_agg.drop('dep', inplace = True, axis =1)

fr_infections_dep_agg.head()

In [None]:
fr_infections_agg = pd.concat([fr_infections_regions_agg, fr_infections_dep_agg], axis =0 )

fr_infections_agg.set_index('Specimen date', inplace=True)

fr_infections_agg.index = pd.to_datetime(fr_infections_agg.index)

fr_infections_agg.head()

In [None]:
fr_infections_agg[(fr_infections_agg['Area name']=='ILE DE FRANCE') & (fr_infections_agg['Area type']=='REGION') ]

In [None]:
## Reading Demographic Data

xls = pd.ExcelFile('/project_data/data_asset/TCRD_021.xls')

fr_regional_pop_data = pd.read_excel(xls, 'REG', skiprows = 3)

fr_regional_pop_data['area_type'] = 'Region'

fr_dep_pop_data = pd.read_excel(xls, 'DEP', skiprows = 3)

fr_dep_pop_data['area_type'] = 'Department'

fr_pop_data = pd.concat([fr_regional_pop_data, fr_dep_pop_data], axis = 0)

fr_pop_data.rename(columns={'Unnamed: 0': 'area_code', 'Unnamed: 1': 'area_name', 'Ensemble': 'All ages', 'Part des 60 ans ou plus (en %)': 'pop_above_65'}, inplace = True)

fr_pop_data = fr_pop_data[['area_code', 'area_name', 'area_type', 'All ages', 'pop_above_65']]

fr_pop_data['area_code'] = fr_pop_data['area_code'].apply(str)

fr_pop_data['area_name'] = fr_pop_data['area_name'].apply(lambda z : z.upper() if isinstance(z, str) else z)

fr_pop_data['area_type'] = fr_pop_data['area_type'].apply(lambda z : z.upper() if isinstance(z, str) else z)

fr_pop_data.head()

In [None]:
fr_pop_data['area_type'].unique()

In [None]:
%%time

fr_all_areas_preds = pd.DataFrame()

geo_locate = True

area_types_all = fr_infections_agg['Area type'].unique()

for area_type in area_types_all:
    
    df_area_type = fr_infections_agg[fr_infections_agg['Area type'] == area_type]
    
    fr_area_preds = pd.DataFrame()
        
    area_names_all = df_area_type['Area name'].unique()
    
    for area_name in area_names_all:
        
        print("Generating risk index for: {} - {}".format(area_type, area_name))  
        
        one_preds = generate_predictions_granular(df_area_type, 'hosp', fr_pop_data, 6, 'FRANCE', 'FRA', area_type, area_name, geo_locate)
  
        fr_area_preds = fr_area_preds.append(one_preds)
    
    fr_all_areas_preds = fr_all_areas_preds.append(fr_area_preds)
        
fr_all_areas_preds.head() 


In [None]:
fr_all_areas_preds[fr_all_areas_preds['area_name']=='ILE DE FRANCE']

In [None]:
series = fr_infections_agg[fr_infections_agg['Area type'] == 'Region']

area_code = str(series[(series['Area name'] == 'ILE DE FRANCE') & (series['Area type'] == 'Region')]['Area code'][0])

area_code

# int(fr_pop_data[(fr_pop_data['area_code']==str(area_code)) & (fr_pop_data['area_type'] == area_type)]['All ages'])*100.00


# fr_pop_data[fr_pop_data['area_code']=='11']

generate_predictions_granular(series, 'hosp', fr_pop_data, 6, 'FRANCE', 'FRA', 'REGION', 'ILE DE FRANCE', True)

# fr_pop_data[fr_pop_data['area_code']==area_code]['All ages']


# fr_pop_data[(fr_pop_data['area_code']==str(area_code)) & (fr_pop_data['area_type'] == 'Region')]


In [None]:
 series = df[(df['Area name'] == area_name) & (df['Area type'] == area_type)][columns]

## Data For Italy


In [None]:
italy_data_national = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv')

italy_data_province = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv')

italy_data_regional = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv')

italy_data_national.head()

In [None]:
italy_data_regional[italy_data_regional['denominazione_regione'] == 'Basilicata']


In [None]:
italy_data_province.head()


In [None]:
italy_data_province['denominazione_provincia'].unique()

In [None]:
italy_data_province.head()

In [None]:
italy_data_national_preds = italy_data_national[['data', 'stato', 'totale_casi']]

italy_data_national_preds.rename(columns = {'data': 'Specimen date', 'totale_casi': 'hosp'}, inplace = True)

italy_data_national_preds['Specimen date'] = pd.to_datetime(italy_data_national_preds['Specimen date'], format = "%Y-%m-%d")

italy_data_national_preds['Specimen date'] = italy_data_national_preds['Specimen date'].apply(lambda x : dt.datetime.strftime(x,  "%Y-%m-%d"))

italy_data_national_preds['area_code'] = None

italy_data_national_preds['area_name'] = 'ITALY'

italy_data_national_preds['area_type'] = 'COUNTRY'

italy_data_national_preds.drop('stato', inplace = True, axis = 1)

italy_data_national_preds.set_index('Specimen date', inplace = True)


In [None]:
italy_data_national_preds.head()

In [None]:
plt.plot(italy_data_national_preds['hosp'])

In [None]:
italy_data_regional_preds = italy_data_regional[['data', 'stato', 'denominazione_regione', 'codice_regione', 'totale_casi']]

italy_data_regional_preds.rename(columns = {'data': 'Specimen date', 'totale_casi': 'hosp', 'denominazione_regione': 'area_name', 'codice_regione': 'area_code'}, inplace = True)

italy_data_regional_preds['Specimen date'] = pd.to_datetime(italy_data_regional_preds['Specimen date'], format = "%Y-%m-%d")

italy_data_regional_preds['Specimen date'] = italy_data_regional_preds['Specimen date'].apply(lambda x : dt.datetime.strftime(x,  "%Y-%m-%d"))

italy_data_regional_preds['area_type'] = 'REGION'

italy_data_regional_preds.drop('stato', inplace = True, axis = 1)

italy_data_regional_preds.set_index('Specimen date', inplace = True)

In [None]:
italy_data_regional_preds.head()

In [None]:
plt.plot(italy_data_regional_preds[italy_data_regional_preds['Area name']=='Basilicata']['hosp'])

In [None]:
italy_data_province.head()

In [None]:
italy_data_province_preds = italy_data_province[['data', 'denominazione_provincia', 'totale_casi']].groupby(['data', 'denominazione_provincia'])['totale_casi'].sum().reset_index()

italy_data_province_preds.rename(columns = {'data': 'Specimen date', 'totale_casi': 'hosp', 'denominazione_provincia': 'area_name'}, inplace = True)

italy_data_province_preds['Specimen date'] = pd.to_datetime(italy_data_province_preds['Specimen date'], format = "%Y-%m-%d")

italy_data_province_preds['Specimen date'] = italy_data_province_preds['Specimen date'].apply(lambda x : dt.datetime.strftime(x,  "%Y-%m-%d"))

italy_data_province_preds['area_code'] = None

italy_data_province_preds['area_type'] = 'PROVINCE'

italy_data_province_preds.set_index('Specimen date', inplace = True)


In [None]:
italy_data_province_preds.head()

In [None]:
italy_data_province_preds[italy_data_province_preds['Area name']=='In fase di definizione/aggiornamento']

In [None]:
plt.plot(italy_data_province[italy_data_province['denominazione_provincia'] == 'Siracusa']['totale_casi'])

In [None]:
plt.plot(italy_data_province[italy_data_province['denominazione_provincia'] == 'Vicenza']['totale_casi'])

In [None]:
plt.plot(italy_data_province_preds[italy_data_province_preds['Area name']=='Fuori Regione / Provincia Autonoma']['hosp'])

In [None]:
plt.plot(italy_data_province_preds[italy_data_province_preds['Area name']=='Chieti']['hosp'])

In [None]:
italy_data_all_areas = pd.concat([italy_data_national_preds, italy_data_regional_preds], axis = 0)

print("The dataset contains {} duplicates".format(italy_data_all_areas.duplicated().sum()))

italy_data_all_areas.drop_duplicates(inplace = True)

italy_data_all_areas['area_name'] = italy_data_all_areas['area_name'].apply(lambda z : z.upper())

italy_data_all_areas.head()

In [None]:
italy_data_all_areas['area_name'].unique()

In [None]:
italy_data_all_areas['area_name'] = italy_data_all_areas['area_name'].apply(lambda x : 'BOLZANO' if x == 'P.A. BOLZANO' else 'TRENTO' if x == 'P.A. TRENTO' else x)

In [None]:
italy_data_all_areas['area_name'].unique()

In [None]:
## Reading Demographic Data 

ita_pop_data = pd.read_csv('/project_data/data_asset/DCIS_POPRES1_06072020030435663.csv')

ita_pop_data.head()


In [None]:
ita_pop_data[ita_pop_data['Territory'] == 'FRIULI VENEZIA GIULIA']

In [None]:
ita_pop_data = ita_pop_data[(ita_pop_data['Gender']=='total') & (ita_pop_data['STATCIV2']==99)][['Territory', 'Age', 'Value']].drop_duplicates()

ita_pop_data['All ages'] = ita_pop_data['Value'].groupby(ita_pop_data['Territory']).transform(max)

# sub_data = ita_pop_data[(ita_pop_data['Territory']=='Italy') & (ita_pop_data['SEXISTAT1']==9) & (ita_pop_data['Gender']=='total') & (ita_pop_data['STATCIV2']==99)]

# ita_pop_data['pop_above_65'] = sub_data['Value'].groupby(sub_data['Territory']).tail(10).sum()

ita_pop_data = ita_pop_data[['Territory', 'All ages']].drop_duplicates()

ita_pop_data.rename(columns= {'Territory': 'area_name'}, inplace = True)

ita_pop_data['area_name'] = ita_pop_data['area_name'].apply(lambda z : z.upper())

ita_pop_data.head()

In [None]:
set(italy_data_all_areas['area_name']) & set(ita_pop_data['area_name'])

In [None]:
%%time

geo_locate = True 

ita_all_areas_preds = pd.DataFrame()

area_types_all = italy_data_all_areas['area_type'].unique()

for area_type in area_types_all:
    
    df_area_type = italy_data_all_areas[italy_data_all_areas['area_type'] == area_type]
    
    ita_area_preds = pd.DataFrame()
    
    area_names_all = df_area_type['area_name'].unique()
    
    for area_name in area_names_all:
        
        print("Generating risk index for: {} - {}".format(area_type, area_name))
        
        one_preds = generate_predictions_granular(italy_data_all_areas, 'hosp', ita_pop_data, 6, 'ITALY', 'ITA', area_type, area_name, geo_locate)
        
        ita_area_preds = ita_area_preds.append(one_preds)
    
    ita_all_areas_preds = ita_all_areas_preds.append(ita_area_preds)
        
ita_all_areas_preds.head()  


In [None]:
ita_all_areas_preds.head()

In [None]:
generate_predictions_granular(italy_data_all_areas, 'hosp', ita_pop_data, 6, 'ITALY', 'ITA', area_type, area_name, geo_locate)

In [None]:
italy_data_all_areas[italy_data_all_areas['Area name'] == 'In fase di definizione/aggiornamento']

## Predictions for Germany


In [None]:
data_germany = pd.read_csv("/project_data/data_asset/df_pipeline_germany.csv")

data_germany.head()

In [None]:
data_germany.rename(columns = {'Cognos Area name': 'area_name', 'Area type': 'area_type', 'Cca 2': 'area_code'}, inplace = True)

data_germany = data_germany[[ 'area_name', 'area_type', 'cum cases', 'area_code']]

data_germany['area_code'] = data_germany['area_code'].apply(pd.to_numeric, errors='coerce')

data_germany.head()


In [None]:
germ_pop_stats.columns

In [None]:
germ_pop_stats = pd.read_csv('/project_data/data_asset/regional_population_germany.csv', sep = ";")

germ_pop_stats.rename(columns = {'Schluessel': 'area_code', ' Name': 'area_name', ' Wert,,': 'population_density'}, inplace = True)

germ_pop_stats['population_density'] = germ_pop_stats['population_density'].apply(lambda x: x.replace(',', '')) 

germ_pop_stats['population_density'] = germ_pop_stats['population_density'].apply(pd.to_numeric, errors='coerce')

germ_pop_stats.head()


In [None]:
germ_pop_stats[' Name'].unique()

### Merging All results in final DataFrame

In [None]:
# all_areas_uk_risk_index.reset_index(inplace = True)

# fr_all_areas_preds.reset_index(inplace = True)

risk_index_world = pd.concat([all_areas_uk_risk_index, fr_all_areas_preds, ita_all_areas_preds], axis = 0)

risk_index_world.shape

In [None]:
risk_index_world.head()

In [None]:
all_areas_uk_risk_index.shape

In [None]:
fr_all_areas_preds.shape

In [None]:
ita_all_areas_preds.shape

In [None]:
plt.hist(risk_index_world[['risk_index']].to_numpy().flatten())

In [None]:
## Write Data to CoS

from project_lib import Project

project = Project.access()

project.save_data(file_name = "risk_index_world.csv",data = risk_index_world.to_csv(index = False), overwrite = True)


In [None]:
from project_lib import Project

project = Project.access()

project.save_data(file_name = "risk_index_uk_only.csv",data = all_areas_uk_risk_index.to_csv(index = False), overwrite = True)
