In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import errno
import os
import matplotlib.pyplot as plt
import requests
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
try:
    import filterpy
    import geopy
    import iso3166
except ImportError:
    !pip install filterpy
    !pip install geopy
    !pip install pycountry
    
# import pycountry
import pycountry
from filterpy.kalman import KalmanFilter, EnsembleKalmanFilter, UnscentedKalmanFilter, MerweScaledSigmaPoints
from filterpy.common import Q_discrete_white_noise, Saver
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
plt.rcParams.update({'figure.max_open_warning': 0})

pd.set_option("display.max_columns", 500)

import warnings

warnings.filterwarnings("ignore")




## Model Definition

In [2]:
class kalman_filter():
    """
    Kalman-filter forecaster for a daily time series.

    The tracked state is (value, first derivative, second derivative) of the
    series, propagated with a constant-acceleration transition matrix through
    filterpy's ``KalmanFilter``.

    Attributes set by ``__init__``:
        kf_type: filter flavour; only 'linear' is implemented.
        params: dict with keys 'kf_p', 'kf_r', 'kf_q', 'kf_a', or None, in
            which case defaults are filled in on the first forecast/test call.
    """

    def __init__(self, kf_type, params):
        self.kf_type = kf_type
        self.params = params

    def get_params_(self):
        """Return the parameter dict currently stored on the instance (may be None)."""
        return self.params

    def kalman_model_(self, initial_state, kf_p, kf_r, kf_q, kf_a):
        """
        Build a 3-state linear Kalman filter observing only the first state.

        The Covid dynamics are modelled as a linear dynamic system whose state vector holds:
        - position     = the observed quantity itself (e.g. number of cases)
        - speed        = growth rate (# of cases per day)
        - acceleration = growth acceleration (# of cases per day^2)

        Parameters:
        ----------
        initial_state: value used to initialise the position component.
        kf_p: multiplier applied to the filter's initial covariance matrix P.
        kf_r: measurement noise R.
        kf_q: variance passed to Q_discrete_white_noise for the process noise Q.
        kf_a: fading-memory factor assigned to ``kf.alpha``.

        Outputs:
        -------
        The configured filterpy ``KalmanFilter`` instance.
        """
        # One day is the observation interval. Named `step` so it does not
        # shadow the module-level `dt` (datetime) alias.
        step = 1

        ## Transition Matrix, Kinematic equation
        F = np.array([[1, step, 0.5 * (step ** 2)],
                      [0, 1, step],
                      [0, 0, 1]])

        dim_x = F.shape[0]
        kf = KalmanFilter(dim_x = dim_x, dim_z = 1)
        kf.F = F

        ## Initialize the State Space Vector: only the position is known
        kf.x = np.zeros(dim_x)
        kf.x[0] = initial_state

        ## Measurement Matrix: only the position is observed
        kf.H = np.zeros((1, dim_x))
        kf.H[0][0] = 1

        ## Covariance Matrix
        kf.P *= kf_p

        ## Measurement Noise
        kf.R = kf_r

        ## Process Noise
        kf.Q = Q_discrete_white_noise(dim = dim_x, dt = step, var = kf_q)

        ## Fading Factor
        kf.alpha = kf_a

        return kf

    def _build_filter(self, series):
        """Validate kf_type/params (filling in defaults) and return a filter seeded with the first value of `series`."""
        if self.kf_type != 'linear':
            raise NotImplementedError(self.kf_type)

        if self.params is None:
            self.params = {'kf_p': 1, 'kf_r': 4, 'kf_q': 0.1, 'kf_a': 1}

        # filterpy only accepts a fading factor >= 1 (1 == plain Kalman filter),
        # so reject anything smaller up front, identically for forecast() and kalman_test().
        if self.params['kf_a'] < 1:
            raise ValueError("kf_a (fading factor) must be >= 1")

        return self.kalman_model_(series[series.index[0]], self.params['kf_p'], self.params['kf_r'],
                                  self.params['kf_q'], self.params['kf_a'])

    def smoother(self, series, winsize, method = 'slide'):
        """
        Smooth a sequence with a trailing moving average.

        Parameters:
        ----------
        series: sequence of numbers (list or array).
        winsize: number of points in the trailing window (the current point included).
        method: 'slide' averages the raw values in the window;
                'recursive_slide' averages the already-smoothed previous values
                together with the current raw value.

        Outputs:
        -------
        numpy array of the same length as `series`.

        Raises NotImplementedError for any other `method`.
        """
        print("Series inside smoother func: {}".format(len(series)))

        if method == 'slide':
            x_smooth = [np.mean(series[max(0, i - winsize + 1) : i + 1]) for i in range(len(series))]

        elif method == 'recursive_slide':
            # Work on a float copy so the caller's data is untouched and means can be stored.
            x_smooth = np.array(series, dtype = float)

            for i in range(len(x_smooth)):
                # Same inclusive window as 'slide'. A window ending before i is
                # empty at i == 0 and turns the whole output into NaN.
                x_smooth[i] = np.mean(x_smooth[max(0, i - winsize + 1) : i + 1])

        else:
            raise NotImplementedError

        return np.array(x_smooth)

    def kf_ci_bounds(self, kf):
        '''
        Returns the half-width of the 95% CI of the first state component, calculated from the
        KF covariance matrix (1.96 * sqrt(P[0, 0])). Supposes the state space vectors are Gaussian.
        '''
        return 1.96 * (np.diag(kf.P)[0])**0.5

    def forecast(self, series, days, smooth):
        """
        Forecast based on Historical Data. The Kalman Filter tracks the velocity and acceleration of the Time Series,
        updating the estimated states space vectors and covariances with the measurement of the states, while balancing the uncertainty of each.

        Input:
        -----
        series: Pandas Series object with dates being index in ascending order
                (datetime-like, or 'YYYY-MM-DD' strings).
        days: Prediction window length (must be > 0).
        smooth: if truthy, 'pred' is the smoothed forecast; otherwise 'pred' equals 'pred_raw'.

        Outputs:
        -------
        Pandas DataFrame indexed by the forecast dates with the following columns
        pred_raw: Raw prediction (negative values clipped to 0)
        pred: Final prediction (with smoothing when requested)
        pred_vel, pred_acc: predicted velocity and acceleration states
        ci_*: Lower and upper bounds of CI around pred_raw (lower bound clipped to 0)

        Raises ValueError if days <= 0 or the fading factor is < 1, and
        NotImplementedError for a kf_type other than 'linear'.
        """
        if days <= 0:
            raise ValueError("days must be a positive number of days")

        dates = series.index
        kf = self._build_filter(series)

        # Run the filter over the whole history
        for measurement in series:
            kf.predict()
            kf.update([measurement])

        ## Predicting starting from the last available observation
        last_date = dates[-1]
        if isinstance(last_date, str):
            last_date = dt.datetime.strptime(last_date, "%Y-%m-%d")

        predictions = []
        pred_acc = []
        pred_vel = []
        pred_dates = []
        ci_bounds = []

        for day in range(days):
            pred_dates.append(last_date + dt.timedelta(days = day + 1))

            # No update step: pure propagation of the last estimated state
            kf.predict()

            predictions.append(kf.x[0])
            pred_vel.append(kf.x[1])
            pred_acc.append(kf.x[2])
            ci_bounds.append(self.kf_ci_bounds(kf))

        predictions = np.array(predictions)
        predictions[predictions < 0] = 0

        if smooth:
            predictions_smoother = self.smoother(predictions, days)
        else:
            # Without smoothing the final prediction is the raw one; previously
            # this path failed because the variable was never assigned.
            predictions_smoother = predictions.copy()

        ci_bounds = np.array(ci_bounds)
        ci_lower = predictions - ci_bounds
        ci_upper = predictions + ci_bounds
        ci_lower[ci_lower < 0] = 0

        df_preds = pd.DataFrame({'pred_raw': predictions, 'pred': predictions_smoother, 'pred_vel': pred_vel,
                                 'pred_acc': pred_acc, 'ci_lower': ci_lower, 'ci_upper': ci_upper},
                                index = pred_dates)

        return df_preds

    def kalman_test(self, series, winsize):
        """
        To test the performance compared with true values along all data points.

        For every date the filter state is saved, propagated `winsize` days ahead
        without measurements, compared with the observation at that date, then
        restored and updated with the next measurement.

        Input
        -----
        series: Pandas Series object with dates being index in ascending order.
        winsize: Prediction window size in number of days (must be > 0).

        Output
        ------
        Pandas DataFrame object (the first `winsize` predictions are dropped) with the following columns
        pred_raw: Raw prediction
        pred: Final prediction (smoothed, negative values clipped to 0)
        obs: Ground-truth values
        history: recursive prediction history at each time point (for debugging purpose)
        ci_lower, ci_upper: CI bounds around pred (lower bound clipped to 0)
        pred_acc, pred_vel: predicted acceleration and velocity states

        Raises ValueError if winsize <= 0 or the fading factor is < 1, and
        NotImplementedError for a kf_type other than 'linear'.
        For long-term prediction, usually increasing the fading factor (kf_a) helps.
        """
        if winsize <= 0:
            raise ValueError("winsize must be a positive number of days")

        observations = []
        predictions = []
        predictions_acc = []
        predictions_vel = []
        pred_dates = []
        history = []
        ci_bounds = []

        dates = series.index.to_numpy()
        kf = self._build_filter(series)

        for i in range(dates.shape[0] - winsize):
            # save the current state of the model
            saver = Saver(kf, skip_callable = True, save_current = True)

            # recursive prediction
            history_window = [kf.x[0]]
            date_window = [dates[i]]

            for day in range(winsize):
                kf.predict()
                history_window.append(kf.x[0])
                date_window.append(dates[i + day + 1])

            history.append(pd.DataFrame({'pred': history_window}, index = date_window))

            pred_date = dates[i + winsize]
            pred_dates.append(pred_date)

            predictions.append(kf.x[0])
            predictions_vel.append(kf.x[1])
            predictions_acc.append(kf.x[2])

            observations.append(series[pred_date])
            ci_bounds.append(self.kf_ci_bounds(kf))

            # restore model states and update to next day
            for attr in saver.keys:
                try:
                    setattr(kf, attr, getattr(saver, attr)[-1])
                except AttributeError:  # read-only properties cannot be restored
                    continue

            kf.predict()
            kf.update([series[dates[i + 1]]])

        # smoothen output
        predictions_smooth = self.smoother(predictions, winsize)
        predictions_smooth[predictions_smooth < 0] = 0

        # velocity / acceleration are reported unsmoothed
        acc_smooth = predictions_acc
        vel_smooth = predictions_vel

        ci_bounds = np.array(ci_bounds)
        ci_lower = predictions_smooth - ci_bounds
        ci_upper = predictions_smooth + ci_bounds
        ci_lower[ci_lower < 0] = 0

        df_pred = pd.DataFrame({'pred_raw': predictions[winsize:], 'pred': predictions_smooth[winsize:],
                                'obs': observations[winsize:], 'history': history[winsize:],
                                'ci_lower': ci_lower[winsize:], 'ci_upper': ci_upper[winsize:],
                                'pred_acc': acc_smooth[winsize:], 'pred_vel': vel_smooth[winsize:]},
                               index = pred_dates[winsize:])

        return df_pred

    def interpolate_weekend(self, ):
        """Placeholder: not implemented yet, does nothing and returns None."""
        pass
    
    
        

## Reading and Processing Data

In [3]:
def _read_stringency_data(country):
    """Download the combined stringency dataset and return the rows/columns used for `country`."""
    country_iso = pycountry.countries.search_fuzzy(country)[0].alpha_3

    print("Reading Stringency Integrated Dataset for Country: {}".format(country))
    print('-'*32)

    path = "https://raw.githubusercontent.com/rs-delve/covid19_datasets/master/dataset/combined_dataset_latest.csv"
    stringency = pd.read_csv(path, parse_dates = ['DATE'])

    stringency['country_name'] = stringency['country_name'].apply(lambda z : z[0].upper() + z[1:].lower() if isinstance(z, str) else z)

    stringency_data = stringency[stringency['ISO'] == country_iso].drop_duplicates()

    return stringency_data[['DATE', 'npi_stringency_index', 'tests_new_per_thousand', 'stats_population_density' , 'stats_population_urban',  'stats_population',
                            'mobility_retail_recreation', 'mobility_grocery_pharmacy', 'mobility_parks', 'mobility_transit_stations', 'mobility_workplaces']]


def _read_england_data():
    """Read the England case, healthy-life-expectancy, population and healthcare-rating sources.

    Any failure while reading or processing a source is re-raised as
    FileNotFoundError naming that source, with the original error chained as
    the cause.
    """
    filename = 'https://coronavirus.data.gov.uk/downloads/csv/coronavirus-cases_latest.csv'
    try:
        print('Processing UK reported cases')

        r = requests.get(filename, stream=True)
        # Fail on an HTTP error instead of parsing the error page as CSV
        r.raise_for_status()
        uk_regional_cov19 = pd.read_csv(r.raw)

        # The columns of this file are 'Area name' / 'Area type' / 'Area code' (see the rename below)
        num_regions = len(uk_regional_cov19['Area name'].unique())
        print("Found {} area names in Dataset".format(num_regions))
        print('-'*32)

        uk_regional_cov19['Specimen date'] = pd.to_datetime(uk_regional_cov19['Specimen date'], format = "%Y-%m-%d")
        uk_regional_cov19 = uk_regional_cov19.sort_values('Specimen date', ascending = True).set_index('Specimen date')

        # Upper-case the area type and align 'NATION' with the 'COUNTRY' label
        uk_regional_cov19['Area type'] = uk_regional_cov19['Area type'].str.upper().replace('NATION', 'COUNTRY')

        uk_regional_cov19 = uk_regional_cov19.rename(columns = {'Area type': 'area_type', 'Area name': 'area_name', 'Area code': 'area_code'})
    except Exception as err:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename) from err

    print("Reading Healthy Life Expectancy Data in EN")
    print('-'*32)

    filename = '/project_data/data_asset/hsleatbirthandatage65byukla201618.xlsx'
    try:
        xls = pd.ExcelFile(filename)

        uk_regional_hle_males = pd.read_excel(xls, 'HE - Male at 65', skiprows = 3)
        uk_regional_hle_males['sex'] = 'male'

        uk_regional_hle_females = pd.read_excel(xls, 'HE - Female at 65', skiprows = 3)
        uk_regional_hle_females['sex'] = 'female'

        uk_regional_hle = pd.concat([uk_regional_hle_males, uk_regional_hle_females], axis = 0)
    except Exception as err:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename) from err

    print("Reading Regional Population Statistics in EN")
    print('-'*32)

    filename = '/project_data/data_asset/ukmidyearestimates20192020ladcodes.xls'
    try:
        uk_pop_stats = pd.read_excel(filename, 'MYE2 - Persons', skiprows = 4)

        print("Processing Regional Population Data")
        print('-'*25)

        # Give the open-ended last age band an integer label so it can be summed
        # with the single-year columns. The previous rename result was discarded.
        # NOTE(review): '90+' is accepted as an alternative spelling of the
        # header; confirm the exact label against the workbook.
        uk_pop_stats = uk_pop_stats.rename(columns = {'+90': 90, '90+': 90})

        # Ages 65..89 are required; the 90+ band is added when present
        age_cols = list(range(65, 90))
        if 90 in uk_pop_stats.columns:
            age_cols.append(90)

        uk_pop_stats['pop_above_65'] = uk_pop_stats[age_cols].sum(axis = 1)

        uk_pop_stats = uk_pop_stats.rename(columns = {'Geography1': 'area_type', 'Code': 'area_code', 'Name': 'area_name'})

        # .copy() so the column assignments below act on an independent frame
        uk_pop_stats = uk_pop_stats[['area_code', 'area_name', 'area_type', 'All ages', 'pop_above_65']].copy()

        uk_pop_stats['area_name'] = uk_pop_stats['area_name'].apply(lambda z : z[0].upper() + z[1:].lower() if isinstance(z, str) else z)
        uk_pop_stats['area_type'] = uk_pop_stats['area_type'].apply(lambda z : z.upper() if isinstance(z, str) else z)

        # Drop rows without an area name. The column was renamed from 'Name'
        # above, so filtering on 'Name' here always raised KeyError.
        uk_pop_stats = uk_pop_stats[~uk_pop_stats['area_name'].isna()]
    except Exception as err:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename) from err

    filename = '/project_data/data_asset/20191015_stateofcare1819_ratingsdata.csv'
    try:
        uk_healthcare_system = pd.read_csv(filename, encoding = 'latin')
    except Exception as err:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename) from err

    return uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system


def _fr_cumulative_cases(url, key, area_type, codes, code_col, name_col):
    """Read one French positive-tests file and return cumulative cases per area and day.

    `key` is the area-code column of the file ('dep' or 'reg'); `codes` is the
    official-codes table used to look up the area name via `code_col` / `name_col`.
    """
    tests = pd.read_csv(url, sep = ";")

    ## Aggregating The Age Column
    tests = tests[[key, 'jour', 'P']].groupby([key, 'jour'])['P'].sum().reset_index()

    # Order by area then day so the per-area cumulative sum follows time
    tests = tests.sort_values([key, 'jour'], ascending = True).reset_index(drop = True)
    tests['P_cum'] = tests.groupby([key])['P'].cumsum()

    tests = tests.rename(columns = {key: 'area_code', 'P_cum': 'Cumulative_cases'})
    tests['area_type'] = area_type

    # De-duplicate the lookup first: the codes table can repeat a code/name pair,
    # which would only multiply rows that the caller drops again afterwards.
    lookup = codes[[code_col, name_col]].drop_duplicates()
    tests = tests.merge(lookup, left_on = 'area_code', right_on = code_col)
    tests = tests.rename(columns = {name_col: 'area_name'})

    return tests[['jour', 'area_code', 'area_type', 'area_name', 'Cumulative_cases']]


def _read_france_data():
    """Read the French case counts (departement, region, country), population data and ICU data."""
    print("Reading France Official Codes")

    try:
        codes_officiel_france = pd.read_csv('/project_data/data_asset/code-officiel-geographique-2019-regions-et-departement (1).csv', sep = ";")
    except Exception as err:
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), 'code-officiel-geographique-2019') from err

    print("Reading Departmental Covid Cases in France")

    cov_pos_dep_tests = _fr_cumulative_cases('https://www.data.gouv.fr/fr/datasets/r/19a91d64-3cd3-42fc-9943-d635491a4d76',
                                             'dep', 'Departement', codes_officiel_france,
                                             'Code INSEE Département', 'Nom Département')

    print("Reading Regional Covid Cases in France")

    cov_pos_reg_tests = _fr_cumulative_cases('https://www.data.gouv.fr/fr/datasets/r/ad09241e-52fa-4be8-8298-e5760b43cae2',
                                             'reg', 'Region', codes_officiel_france,
                                             'Code INSEE Région', 'Nom région')

    print("Reading France Global Covid Cases")

    filename = 'https://www.data.gouv.fr/fr/datasets/r/57d44bd6-c9fd-424f-9a72-7834454f9e3c'
    cov_pos_all_tests = pd.read_csv(filename, sep = ";")

    cov_pos_all_tests = cov_pos_all_tests[['fra', 'jour', 'P']].groupby(['fra', 'jour'])['P'].sum().reset_index()
    cov_pos_all_tests['P_cum'] = cov_pos_all_tests['P'].cumsum()
    cov_pos_all_tests = cov_pos_all_tests.rename(columns = {'fra': 'area_code', 'P_cum': 'Cumulative_cases'})
    cov_pos_all_tests['area_name'] = 'France'
    cov_pos_all_tests['area_type'] = 'Country'
    cov_pos_all_tests = cov_pos_all_tests[['jour', 'area_code', 'area_type', 'area_name', 'Cumulative_cases']]

    ## Merging the Datasets:
    cov_pos_all = pd.concat([cov_pos_dep_tests, cov_pos_reg_tests, cov_pos_all_tests], axis = 0)
    cov_pos_all = cov_pos_all.drop_duplicates()
    cov_pos_all['jour'] = pd.to_datetime(cov_pos_all['jour'], format = "%Y-%m-%d")
    cov_pos_all = cov_pos_all.set_index('jour')

    ## Reading Demographic Data
    print('Reading Population Data')
    print('-'*32)

    xls = pd.ExcelFile('/project_data/data_asset/TCRD_021.xls')

    fr_regional_pop_data = pd.read_excel(xls, 'REG', skiprows = 3)
    fr_regional_pop_data['area_type'] = 'Region'

    fr_dep_pop_data = pd.read_excel(xls, 'DEP', skiprows = 3)
    fr_dep_pop_data['area_type'] = 'Departement'

    fr_pop_data = pd.concat([fr_regional_pop_data, fr_dep_pop_data], axis = 0)
    fr_pop_data = fr_pop_data.rename(columns = {'Unnamed: 0': 'area_code', 'Unnamed: 1': 'area_name', 'Ensemble': 'All ages', 'Part des 60 ans ou plus (en %)': 'pop_above_60'})

    # .copy() so the .loc assignments below act on an independent frame
    fr_pop_data = fr_pop_data[['area_code', 'area_name', 'area_type', 'All ages', 'pop_above_60']].copy()

    ## Special case: Country France
    fr_pop_data.loc[fr_pop_data.area_code == 'F', 'area_code'] = 'FR'
    fr_pop_data.loc[fr_pop_data.area_code == 'FR', 'area_type'] = 'Country'

    fr_pop_data = fr_pop_data.drop_duplicates()

    # Reading ICU Beds occupation
    icu_beds_capa = pd.read_csv('https://www.data.gouv.fr/fr/datasets/r/4acad602-d8b1-4516-bc71-7d5574d5f33e')

    return cov_pos_all, fr_pop_data, icu_beds_capa


def _ita_date_as_iso_string(frame):
    """Rewrite frame['date'] as 'YYYY-MM-DD' strings (parsed with the same format first)."""
    frame['date'] = pd.to_datetime(frame['date'], format = "%Y-%m-%d").dt.strftime("%Y-%m-%d")
    return frame


def _read_italy_data():
    """Read the Italian national/regional case counts and the population data."""
    print('Reading Covid Cases Data in Italy')
    print('-'*32)

    italy_data_national = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-andamento-nazionale/dpc-covid19-ita-andamento-nazionale.csv')
    italy_data_province = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-province/dpc-covid19-ita-province.csv')
    italy_data_regional = pd.read_csv('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/dati-regioni/dpc-covid19-ita-regioni.csv')

    ## Processing The National Data
    italy_data_national_preds = italy_data_national[['data', 'totale_casi']].rename(columns = {'data': 'date', 'totale_casi': 'Cum_cases'})
    italy_data_national_preds = _ita_date_as_iso_string(italy_data_national_preds)
    italy_data_national_preds['area_code'] = None
    italy_data_national_preds['area_name'] = 'Italy'
    italy_data_national_preds['area_type'] = 'Country'
    italy_data_national_preds = italy_data_national_preds.set_index('date')

    ## Processing The Regional Data
    italy_data_regional_preds = italy_data_regional[['data', 'denominazione_regione', 'codice_regione', 'totale_casi']].rename(
        columns = {'data': 'date', 'totale_casi': 'Cum_cases', 'denominazione_regione': 'area_name', 'codice_regione': 'area_code'})
    italy_data_regional_preds = _ita_date_as_iso_string(italy_data_regional_preds)
    italy_data_regional_preds['area_type'] = 'Region'
    italy_data_regional_preds = italy_data_regional_preds.set_index('date')

    ## Processing the Provinces
    # NOTE(review): the province frame is built but not part of the concatenation
    # below, so it never reaches the caller - confirm whether that is intended.
    italy_data_province_preds = italy_data_province[['data', 'denominazione_provincia', 'totale_casi']].groupby(['data', 'denominazione_provincia'])['totale_casi'].sum().reset_index()
    italy_data_province_preds = italy_data_province_preds.rename(columns = {'data': 'date', 'totale_casi': 'Cum_cases', 'denominazione_provincia': 'area_name'})
    italy_data_province_preds = _ita_date_as_iso_string(italy_data_province_preds)
    italy_data_province_preds['area_code'] = None
    italy_data_province_preds['area_type'] = 'Province'
    italy_data_province_preds = italy_data_province_preds.set_index('date')

    ## Data concatenation
    italy_data_all_areas = pd.concat([italy_data_national_preds, italy_data_regional_preds], axis = 0)

    print("The dataset contains {} duplicates".format(italy_data_all_areas.duplicated().sum()))

    italy_data_all_areas = italy_data_all_areas.drop_duplicates()

    ## Some Data cleaning
    italy_data_all_areas['area_name'] = italy_data_all_areas['area_name'].replace({'P.A. BOLZANO': 'BOLZANO', 'P.A. TRENTO': 'TRENTO'})

    ## Reading Demographic Data
    ita_pop_data = pd.read_csv('/project_data/data_asset/DCIS_POPRES1_06072020030435663.csv')

    ita_pop_data = ita_pop_data[(ita_pop_data['Gender']=='total') & (ita_pop_data['STATCIV2']==99)][['Territory', 'Age', 'Value']].drop_duplicates()

    # The largest 'Value' per territory is used as its total population
    ita_pop_data['All ages'] = ita_pop_data['Value'].groupby(ita_pop_data['Territory']).transform('max')

    ita_pop_data = ita_pop_data[['Territory', 'All ages']].drop_duplicates()
    ita_pop_data = ita_pop_data.rename(columns = {'Territory': 'area_name'})

    return italy_data_all_areas, ita_pop_data


def _read_germany_data():
    """Read the German case counts and the complementary population files."""
    print("Reading Covid 19 and Other complementary Data")
    print('-'*32)

    germ_cov_cases = pd.read_csv("/project_data/data_asset/df_pipeline_germany.csv")
    germ_vulnerable_pop = pd.read_csv("/project_data/data_asset/Germany_population_precondition_regions.csv")
    germ_pop_stats = pd.read_csv("/project_data/data_asset/Germany_demographic_distribution_regions.csv")

    # NOTE(review): life expectancy and ICU capacity are loaded but not returned.
    germ_life_exp = pd.read_csv("/project_data/data_asset/Germany_life_expectation.csv")
    germ_pop_density = pd.read_csv("/project_data/data_asset/Germany_population_density.csv")
    germ_icu_capa = pd.read_csv("/project_data/data_asset/Germany_ICU_capacity_20200715.csv")

    ## Some basic cols processing
    germ_cov_cases = germ_cov_cases.rename(columns = {'Cognos Area name': 'area_name', 'Area type': 'area_type', 'Cca 2': 'area_code'})
    germ_cov_cases = germ_cov_cases[['area_name', 'area_type', 'area_code', 'Country', 'cum cases']]

    germ_vulnerable_pop = germ_vulnerable_pop[['no of patients with at least 1 pre-condition', 'percentage', 'CI', 'Cca 2', 'regiontype', 'cognos name']]
    germ_vulnerable_pop = germ_vulnerable_pop.rename(columns = {'no of patients with at least 1 pre-condition': 'number_patients', 'Cca 2': 'area_code', 'regiontype': 'area_type', 'cognos name': 'area_name'})

    germ_pop_stats = germ_pop_stats[['Cca 2','75 Jahre und mehr', 'Insgesamt', 'cognos name']]
    # The selected column is 'cognos name' (with a space); the former key
    # 'cognos_name' never matched, so 'area_name' was never created.
    germ_pop_stats = germ_pop_stats.rename(columns = {'Cca 2': 'area_code', '75 Jahre und mehr': 'pop_above_75', 'Insgesamt': 'All-in_all', 'cognos name': 'area_name'})

    germ_pop_density = germ_pop_density.rename(columns = {'Cca 2': 'area_code', 'cognos name': 'area_name'})

    return germ_cov_cases, germ_vulnerable_pop, germ_pop_density, germ_pop_stats


def read_merge_data(country):
    '''
    A helper function for reading and merging various Data sources used in the Risk Index calculation.

    Parameters:
    ----------
    country: 'England', 'France', 'Italy' or 'Germany'. The name is also resolved
             to an ISO alpha-3 code (pycountry fuzzy search) to filter the
             stringency dataset.

    Outputs:
    -------
    A tuple whose content depends on the country:
    - England: (stringency_data, uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system)
    - France:  (cov_pos_all, fr_pop_data, stringency_data, icu_beds_capa)
    - Italy:   (italy_data_all_areas, ita_pop_data, stringency_data)
    - Germany: (germ_cov_cases, germ_vulnerable_pop, germ_pop_density, germ_pop_stats, stringency_data)
    For any other country the stringency data is still downloaded and None is returned.

    Raises FileNotFoundError when an England source or the French official-codes
    file cannot be read or processed.
    '''
    stringency_data = _read_stringency_data(country)

    if country == 'England':
        uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system = _read_england_data()
        return stringency_data, uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system

    elif country == 'France':
        cov_pos_all, fr_pop_data, icu_beds_capa = _read_france_data()
        return cov_pos_all, fr_pop_data, stringency_data, icu_beds_capa

    elif country == 'Italy':
        italy_data_all_areas, ita_pop_data = _read_italy_data()
        return italy_data_all_areas, ita_pop_data, stringency_data

    elif country == 'Germany':
        germ_cov_cases, germ_vulnerable_pop, germ_pop_density, germ_pop_stats = _read_germany_data()
        return germ_cov_cases, germ_vulnerable_pop, germ_pop_density, germ_pop_stats, stringency_data

    # Unsupported country: kept as an implicit "no data" result for existing callers
    return None
        
    
    

In [None]:
# germ_hosp_beds = pd.read_csv("/project_data/data_asset/Hospital_beds_germany.csv")

In [None]:
# filename = '/project_data/data_asset/ukmidyearestimates20192020ladcodes.xls'

# uk_pop_stats = pd.read_excel(filename, 'MYE2 - Persons', skiprows = 4)
            
# uk_pop_stats[uk_pop_stats['Name'] == 'Gloucestershire']



In [4]:
from math import exp

def risk_index(growth_rate, npi_stringency_index, population_density, **kwargs):
    """
    Combine growth rate, population density and policy stringency into a 0-100 risk score.

    The raw score is 0.5 * growth_rate + 0.2 * (population_density / 100)
    - 0.3 * (npi_stringency_index / 100); it is squashed with a logistic
    function of slope 0.1 and scaled to 0-100, so a raw score of 0 maps to 50.

    Parameters:
    ----------
    growth_rate: growth rate of the target (weight 0.5).
    npi_stringency_index: stringency index, divided by 100 before weighting (weight -0.3).
    population_density: population density, divided by 100 before weighting (weight 0.2).
    **kwargs: accepted and ignored, so callers can pass extra fields.

    Outputs:
    -------
    float between 0 and 100.
    """
    raw_index = (0.5 * growth_rate + 0.2 * (population_density/100) - 0.3 * (npi_stringency_index/100))

    try:
        return 100 / (1 + exp(-0.1 * raw_index))
    except OverflowError:
        # exp() overflows only for a very negative raw score, where the
        # logistic value is indistinguishable from its lower limit.
        return 0.0


def discrete_risk_idx(risk_index):
    """
    Return the label of the band a risk score falls into.

    Bands are half-open on the right: a score below 20 is "0 - 20", below 50 is
    "20 - 50", below 70 is "50 - 70", below 90 is "70 - 90"; everything else
    (including values that compare below none of the bounds) is "90 - 100".
    """
    # (exclusive upper bound, label), checked in ascending order
    bands = (
        (20, "0 - 20"),
        (50, "20 - 50"),
        (70, "50 - 70"),
        (90, "70 - 90"),
    )

    for upper_bound, label in bands:
        if risk_index < upper_bound:
            return label

    return "90 - 100"
    

def _lookup_area_population_density(pop_stats_df, series, country_name, area_type, area_name):

    '''
    Return the static population figure fed to the risk index for one area, or 0 when unavailable.

    Germany reads the 'population density' column (divided by 100). Italy, France and England
    return the area's 'All ages' population as a percentage of the country's 'All ages' total.
    '''

    if country_name == 'Germany':

        area_code = series['area_code'].iloc[0]

        matches = pop_stats_df[pop_stats_df['area_code'] == area_code]

        if len(matches) > 0:

            return matches['population density'].values[0] / 100.0

        print("No Density Population Data found for: {}".format(area_name))

        return 0

    if country_name == 'Italy':

        matches = pop_stats_df[pop_stats_df['area_name'] == area_name]

    elif country_name == 'France':

        ## In France the area code is not unique, so both area_code and area_type are needed.
        ## The same filter is used for the existence check and for the value lookup.

        area_code = str(series['area_code'].iloc[0])

        matches = pop_stats_df[(pop_stats_df['area_code'] == area_code) & (pop_stats_df['area_type'] == area_type)]

    elif country_name == 'England':

        area_code = str(series['area_code'].iloc[0])

        matches = pop_stats_df[pop_stats_df['area_code'] == area_code]

    else:

        # Without a default the column would be missing and the fill step below would fail.
        print("No Population Data rule defined for country: {}".format(country_name))

        return 0

    if len(matches) == 0:

        return 0

    country_total = int(pop_stats_df[pop_stats_df['area_name'] == country_name]['All ages'].values[0])

    return int(matches['All ages'].iloc[0]) * 100.00 / country_total


def process_risk_index(model, df, target, pop_stats_df, stringency_df, winsize, country_name, area_type, area_name, geo_locate):

    '''
    Function for processing the Risk Index. Each country having its own specificity in terms of available data sources,

    a different formula is used for each country.

    Inputs:
    ------
    model: object exposing kalman_test(series, winsize) and forecast(series, winsize, True); both must return
           DataFrames containing the columns 'pred', 'ci_lower', 'ci_upper', 'pred_acc' and 'pred_vel'
    df: case data with 'area_name', 'area_type' and the target column ('area_code' too, except for Italy);
        its index becomes the DATE column and must parse as %Y-%m-%d (assumes an unnamed index - TODO confirm)
    target: name of the column of df to model
    pop_stats_df: population statistics used for the per-country population lookup
    stringency_df: frame with 'DATE', 'npi_stringency_index', 'stats_population_density',
                   'stats_population_urban' and 'stats_population'
    winsize: number of days to predict ahead
    country_name: 'Italy', 'France', 'England' or 'Germany'; any other value gets a population figure of 0
    area_type, area_name: select the rows of df to process
    geo_locate: currently unused (the geopy/Nominatim geolocation enrichment is disabled)

    Outputs:
    -------
    DataFrame of historical and future predictions merged with the stringency data, plus the
    'area_population_density', 'risk_index' and 'risk_index_disc' columns; None when df has no rows
    for the requested area.
    '''

    print("Predicting {} Days ahead".format(winsize))

    print("="*50)

    series = df[(df['area_name'] == area_name) & (df['area_type'] == area_type)]

    print("Series Shape: {}".format(series.shape))

    if len(series) == 0:

        # Nothing to model for this area; callers must handle the missing result.
        return None

    ## Train KF and Predict History , Predict Future

    print("Training KF and Predicting {}, {} days ahead".format(target, winsize))

    print("-"*32)

    predict_hist = model.kalman_test(series[target], winsize)

    predict_future = model.forecast(series[target], winsize, True)

    kf_predictions_all = pd.concat([predict_hist, predict_future])

    kf_predictions_all = kf_predictions_all[['pred', 'ci_lower', 'ci_upper', 'pred_acc', 'pred_vel']].copy()

    # Aligned on the index, so forecast rows without an observation stay NaN.
    kf_predictions_all['real_figures'] = series[target]

    kf_predictions_all['country'] = country_name

    # .iloc keeps the lookup positional whatever the index labels are.
    kf_predictions_all['area_name'] = series['area_name'].iloc[0]

    kf_predictions_all['area_type'] = series['area_type'].iloc[0]

    kf_predictions_all = kf_predictions_all.reset_index().rename(columns = {'index' : 'DATE'})

    kf_predictions_all['DATE'] = pd.to_datetime(kf_predictions_all['DATE'], format ="%Y-%m-%d")

    print("Merging with Stringency and Complementary Data")

    print("-"*32)

    stringency_merged_df = kf_predictions_all.merge(stringency_df, on = 'DATE', how = 'left')

    print("Merging with Regional Population Data")

    print("-"*32)

    stringency_merged_df['area_population_density'] = _lookup_area_population_density(pop_stats_df, series, country_name, area_type, area_name)

    print("Extrapolating Stringency Indices and Static Data")

    print("-"*32)

    # Assign the filled column back: an in-place fill on a selected column is not
    # guaranteed to reach the parent frame.
    for column in ('npi_stringency_index', 'stats_population_density', 'stats_population_urban', 'stats_population', 'area_population_density'):

        stringency_merged_df[column] = stringency_merged_df[column].ffill()

    print("Calculating the Risk Index")

    print("-"*32)

    stringency_merged_df['risk_index'] = stringency_merged_df[['pred_vel', 'npi_stringency_index', 'area_population_density']].apply(
        lambda row: risk_index(row['pred_vel'], row['npi_stringency_index'], row['area_population_density']), axis = 1)

    stringency_merged_df['risk_index_disc'] = stringency_merged_df['risk_index'].apply(discrete_risk_idx)

    return stringency_merged_df

           

In [None]:
## Unit Test #

# stringency_data, uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system = read_merge_data('England')

# kf = kalman_filter('linear', None)

# process_risk_index(kf, uk_regional_cov19, 'Cumulative lab-confirmed cases', uk_pop_stats, stringency_data, 6, 'England', 'UPPER TIER LOCAL AUTHORITY', 'Gloucestershire', False)


In [None]:
# predictions_all = process_risk_index(kf, uk_regional_cov19, TARGET, uk_pop_stats, stringency_data, PRED_WIN, COUNTRY, 'UPPER TIER LOCAL AUTHORITY', 'Gloucestershire', GEO_LOCATE)

# predictions_all_fin = predictions_all_fin.merge(predictions_all, left_on = 'DATE', right_on = 'DATE', how = 'outer')

# predictions_all_fin.sort_values('DATE', ascending = True, inplace = True)

# predictions_all_fin.fillna(method='ffill', inplace = True)

# predictions_all_fin


In [8]:
# Single 'linear' Kalman-filter wrapper (no extra params), reused by every country prediction loop below.
kf = kalman_filter('linear', None)

In [5]:
## Calendar frame: one row per day from 2020-04-01 up to (but excluding) today + 5 days.
## The prediction loops left-merge each area's results onto it so that all areas share the same dates.
## NOTE(review): the 5-day horizon presumably mirrors PRED_WIN - confirm before changing either.

start_date = pd.to_datetime('2020-04-01', format = "%Y-%m-%d").date()

end_date = (dt.datetime.now() + dt.timedelta(days = 5)).date()

calendar = pd.DataFrame({
    'DATE': [
        (start_date + dt.timedelta(days = day_offset)).strftime("%Y-%m-%d")
        for day_offset in range((end_date - start_date).days)
    ]
})

calendar['DATE'] = pd.to_datetime(calendar['DATE'], format = "%Y-%m-%d")


## England Prediction Loop ### 

In [None]:
%%time

## Run the risk-index pipeline for every English area, grouped by area type,
## and stack the per-area results into all_areas_uk_risk_index.

COUNTRY = 'England'

TARGET = 'Cumulative lab-confirmed cases'

PRED_WIN = 5

GEO_LOCATE = False

## Reading Data

stringency_data, uk_regional_cov19, uk_regional_hle, uk_pop_stats, uk_healthcare_system = read_merge_data(COUNTRY)

area_types_uk = ['COUNTRY', 'REGION', 'UPPER TIER LOCAL AUTHORITY']

# Frames are collected in lists and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
uk_frames_by_type = []

for area_type_i in area_types_uk:

    uk_regional_data_i = uk_regional_cov19[uk_regional_cov19['area_type'] == area_type_i]

    geo_list = uk_regional_data_i['area_name'].unique()

    uk_frames_for_type = []

    for ii in geo_list:

        print("Generating risk index for: {} - {}".format(area_type_i, ii))

        predictions_all = process_risk_index(kf, uk_regional_cov19, TARGET, uk_pop_stats, stringency_data, PRED_WIN, COUNTRY, area_type_i, ii, GEO_LOCATE)

        if predictions_all is None:

            # process_risk_index returns None when it finds no rows for the area.
            continue

        # Align on the shared calendar, then carry the last known values forward over the gaps.
        predictions_all = calendar.merge(predictions_all, on = 'DATE', how = 'left')

        predictions_all = predictions_all.ffill()

        uk_frames_for_type.append(predictions_all)

    all_areas_names_uk = pd.concat(uk_frames_for_type) if uk_frames_for_type else pd.DataFrame()

    uk_frames_by_type.append(all_areas_names_uk)

all_areas_uk_risk_index = pd.concat(uk_frames_by_type) if uk_frames_by_type else pd.DataFrame()
                

In [None]:
# Inspect the stacked England results for one area (Gloucestershire).
all_areas_uk_risk_index.loc[all_areas_uk_risk_index['area_name'] == 'Gloucestershire']

In [None]:
# Risk index over the calendar for the rows named 'England'.
plt.plot(all_areas_uk_risk_index.loc[all_areas_uk_risk_index['area_name'] == 'England', 'risk_index'])

In [None]:
# Observed target values ('real_figures') for the rows named 'England'.
plt.plot(all_areas_uk_risk_index.loc[all_areas_uk_risk_index['area_name'] == 'England', 'real_figures'])

In [None]:
# Same series taken straight from the source data, for comparison with the 'real_figures' plot.
plt.plot(uk_regional_cov19.loc[(uk_regional_cov19['area_name'] == 'England') & (uk_regional_cov19['area_type'] == 'COUNTRY'), 'Cumulative lab-confirmed cases'])

## France Prediction Loop ### 

In [None]:
# Load the France inputs for inspection; the prediction-loop cell below repeats this exact call.
fr_regional_cov19, fr_pop_stats, stringency_data, fr_icu_beds_capa = read_merge_data('France')

In [None]:
# Smallest index label of the France case data (presumably its first date - TODO confirm the index holds dates).
fr_regional_cov19.index.min()

In [None]:
%%time

## Run the risk-index pipeline for every French area, grouped by area type,
## and stack the per-area results into all_areas_fr_risk_index.

COUNTRY = 'France'

TARGET = 'Cumulative_cases'

PRED_WIN = 5

GEO_LOCATE = False

## Reading Data

fr_regional_cov19, fr_pop_stats, stringency_data, fr_icu_beds_capa = read_merge_data(COUNTRY)

area_types_fr = ['Departement', 'Region', 'Country']

# Frames are collected in lists and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
fr_frames_by_type = []

for area_type_i in area_types_fr:

    fr_regional_data_i = fr_regional_cov19[fr_regional_cov19['area_type'] == area_type_i]

    geo_list = fr_regional_data_i['area_name'].unique()

    fr_frames_for_type = []

    for ii in geo_list:

        print("Generating risk index for: {} - {}".format(area_type_i, ii))

        predictions_all = process_risk_index(kf, fr_regional_cov19, TARGET, fr_pop_stats, stringency_data, PRED_WIN, COUNTRY, area_type_i, ii, GEO_LOCATE)

        if predictions_all is None:

            # process_risk_index returns None when it finds no rows for the area.
            continue

        # Align on the shared calendar, then carry the last known values forward over the gaps.
        predictions_all = calendar.merge(predictions_all, on = 'DATE', how = 'left')

        predictions_all = predictions_all.ffill()

        fr_frames_for_type.append(predictions_all)

    all_areas_names_fr = pd.concat(fr_frames_for_type) if fr_frames_for_type else pd.DataFrame()

    fr_frames_by_type.append(all_areas_names_fr)

all_areas_fr_risk_index = pd.concat(fr_frames_by_type) if fr_frames_by_type else pd.DataFrame()
                

In [None]:
# Peek at the last rows of the stacked France results.
all_areas_fr_risk_index.tail()

In [None]:
# Distribution of the risk index over all French areas and calendar days.
plt.hist(all_areas_fr_risk_index['risk_index'])

In [None]:
# Scratch check of the France population-share formula used inside process_risk_index.
# The original cell used that function's parameter names (pop_stats_df, area_type, country_name),
# which do not exist at notebook level; the France-specific equivalents are used instead.
# NOTE(review): this writes a column into the shared stringency_data frame - confirm that is intended.
area_code = fr_regional_cov19[(fr_regional_cov19['area_name'] == 'France')]['area_code'].iloc[0]

fr_country_rows = fr_pop_stats[(fr_pop_stats['area_code'] == str(area_code)) & (fr_pop_stats['area_type'] == 'Country')]

fr_total_population = int(fr_pop_stats[fr_pop_stats['area_name'] == 'France']['All ages'].values[0])

stringency_data['area_population_density'] = int(fr_country_rows['All ages'].iloc[0]) * 100.00 / fr_total_population


## Germany Prediction Loop ###

In [9]:
%%time

## Run the risk-index pipeline for every German area, grouped by area type,
## and stack the per-area results into all_areas_germ_risk_index.

COUNTRY = 'Germany'

TARGET = 'cum cases'

PRED_WIN = 5

GEO_LOCATE = False

## Reading Data

germ_regional_cov19, germ_vulnerable_pop, germ_pop_density, germ_pop_stats, stringency_data = read_merge_data(COUNTRY)

area_types_germ = list(germ_regional_cov19['area_type'].unique())

# Frames are collected in lists and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame on every iteration.
germ_frames_by_type = []

for area_type_i in area_types_germ:

    germ_regional_data_i = germ_regional_cov19[germ_regional_cov19['area_type'] == area_type_i]

    geo_list = germ_regional_data_i['area_name'].unique()

    germ_frames_for_type = []

    for ii in geo_list:

        print("Generating risk index for: {} - {}".format(area_type_i, ii))

        # Germany passes the population-density table as the population statistics.
        predictions_all = process_risk_index(kf, germ_regional_cov19, TARGET, germ_pop_density, stringency_data, PRED_WIN, COUNTRY, area_type_i, ii, GEO_LOCATE)

        if predictions_all is None:

            # process_risk_index returns None when it finds no rows for the area.
            continue

        # Align on the shared calendar, then carry the last known values forward over the gaps.
        predictions_all = calendar.merge(predictions_all, on = 'DATE', how = 'left')

        predictions_all = predictions_all.ffill()

        germ_frames_for_type.append(predictions_all)

    all_areas_names_germ = pd.concat(germ_frames_for_type) if germ_frames_for_type else pd.DataFrame()

    germ_frames_by_type.append(all_areas_names_germ)

all_areas_germ_risk_index = pd.concat(germ_frames_by_type) if germ_frames_by_type else pd.DataFrame()
                

Reading Stringency Integrated Dataset for Country: Germany
--------------------------------
Reading Covid 19 and Other complementary Data
--------------------------------
Generating risk index for: Landkreis - Berlin, Stadt
Predicting 5 Days ahead
Series Shape: (138, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 133
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Ahrweiler
Predicting 5 Days ahead
Series Shape: (73, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 68
Series inside smoother func: 5
Merging with Stringency and Compleme

Series inside smoother func: 72
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Augsburg
Predicting 5 Days ahead
Series Shape: (73, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 68
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Aurich
Predicting 5 Days ahead
Series Shape: (50, 5

Series inside smoother func: 75
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Birkenfeld
Predicting 5 Days ahead
Series Shape: (41, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 36
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Eifelkreis Bitburg-Prüm
Predicting 5 Days ahead
S

Generating risk index for: Landkreis - Landkreis Coesfeld
Predicting 5 Days ahead
Series Shape: (85, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 80
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Cuxhaven
Predicting 5 Days ahead
Series Shape: (73, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 68
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
-----

Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Eichstätt
Predicting 5 Days ahead
Series Shape: (66, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 61
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Elbe-Elster
Predicting 5 Days ahead
Series Shape: (36, 5)
Training KF and Predicti

Series inside smoother func: 59
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Freyung-Grafenau
Predicting 5 Days ahead
Series Shape: (50, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 45
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Friesland
Predicting 5 Days ahead
Series Sh

Series inside smoother func: 91
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Göppingen
Predicting 5 Days ahead
Series Shape: (85, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 80
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Görlitz
Predicting 5 Days ahead
Series Shape: (48,

Series inside smoother func: 96
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Helmstedt
Predicting 5 Days ahead
Series Shape: (33, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 28
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Herford
Predicting 5 Days ahead
Series Shape: (82,

Series inside smoother func: 20
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kaiserslautern, Landkreis
Predicting 5 Days ahead
Series Shape: (45, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 40
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Karlsruhe, Landkreis
Predicting 5 Days ahead
Series Sh

Series inside smoother func: 71
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Leer
Predicting 5 Days ahead
Series Shape: (61, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 56
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Leipzig
Predicting 5 Days ahead
Series Shape: (48, 5)
T

Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Main-Taunus-Kreis
Predicting 5 Days ahead
Series Shape: (82, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 77
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Mainz-Bingen
Predicting 5 Days ahead
Series Shape: (75, 5)
Training KF and

Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Märkischer Kreis
Predicting 5 Days ahead
Series Shape: (99, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 94
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Mühldorf a.Inn, Landkreis
Predicting 5 Days ahead
Series Shape: (63, 5)
Training KF 

Generating risk index for: Landkreis - Landkreis Nordsachsen
Predicting 5 Days ahead
Series Shape: (44, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 39
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Nordwestmecklenburg
Predicting 5 Days ahead
Series Shape: (30, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 25
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Sta

Series inside smoother func: 93
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Ostalbkreis
Predicting 5 Days ahead
Series Shape: (110, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 105
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Ostallgäu
Predicting 5 Days ahead
Series Shape

Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Ravensburg
Predicting 5 Days ahead
Series Shape: (72, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 67
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Recklinghausen
Predicting 5 Days ahead
Series Shape: (122, 5)
Training KF and Pre

Series inside smoother func: 108
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Rheingau-Taunus-Kreis
Predicting 5 Days ahead
Series Shape: (73, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 68
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Rheinisch-Bergischer Kreis
Predicting

Series inside smoother func: 61
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Saarlouis
Predicting 5 Days ahead
Series Shape: (82, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 77
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Salzlandkreis
Predicting 5 Days ahead
Series Shape

Generating risk index for: Landkreis - Landkreis Soest
Predicting 5 Days ahead
Series Shape: (76, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 71
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Sonneberg
Predicting 5 Days ahead
Series Shape: (87, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 82
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
-------

Generating risk index for: Landkreis - Landkreis Südwestpfalz
Predicting 5 Days ahead
Series Shape: (38, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 33
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Teltow-Fläming
Predicting 5 Days ahead
Series Shape: (63, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 58
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static 

Generating risk index for: Landkreis - Landkreis Viersen
Predicting 5 Days ahead
Series Shape: (106, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 101
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Vogelsbergkreis
Predicting 5 Days ahead
Series Shape: (45, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 40
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Da

Generating risk index for: Landkreis - Landkreis Wesel
Predicting 5 Days ahead
Series Shape: (121, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 116
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Landkreis Wesermarsch
Predicting 5 Days ahead
Series Shape: (27, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 22
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
---

Generating risk index for: Landkreis - Kreisfreie Stadt Aschaffenburg
Predicting 5 Days ahead
Series Shape: (55, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 50
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Augsburg
Predicting 5 Days ahead
Series Shape: (93, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 88
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices an

Generating risk index for: Landkreis - Kreisfreie Stadt Coburg
Predicting 5 Days ahead
Series Shape: (61, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 56
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Cottbus
Predicting 5 Days ahead
Series Shape: (16, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 11
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static

Generating risk index for: Landkreis - Kreisfreie Stadt Flensburg
Predicting 5 Days ahead
Series Shape: (29, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 24
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Frankenthal (Pfalz)
Predicting 5 Days ahead
Series Shape: (26, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 21
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Ind

Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Herne
Predicting 5 Days ahead
Series Shape: (81, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 76
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Hof
Predicting 5 Days ahead
Series Shape: (57, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 52
Series inside smo

Generating risk index for: Landkreis - Kreisfreie Stadt Landshut
Predicting 5 Days ahead
Series Shape: (57, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 52
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Leipzig
Predicting 5 Days ahead
Series Shape: (85, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 80
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Stat

Generating risk index for: Landkreis - Kreisfreie Stadt Neustadt an der Weinstraße
Predicting 5 Days ahead
Series Shape: (37, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 32
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Nürnberg
Predicting 5 Days ahead
Series Shape: (109, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 104
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating String

Generating risk index for: Landkreis - Kreisfreie Stadt Salzgitter
Predicting 5 Days ahead
Series Shape: (57, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 52
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Schwabach
Predicting 5 Days ahead
Series Shape: (41, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 36
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and 

Generating risk index for: Landkreis - Kreisfreie Stadt Wilhelmshaven
Predicting 5 Days ahead
Series Shape: (25, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 20
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Landkreis - Kreisfreie Stadt Wolfsburg
Predicting 5 Days ahead
Series Shape: (67, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 62
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
Extrapolating Stringency Indices a

Series inside smoother func: 92
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
No Density Population Data found for: Mecklenburg-Vorpommern
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
--------------------------------
Generating risk index for: Bundesland - Niedersachsen
Predicting 5 Days ahead
Series Shape: (140, 5)
Training KF and Predicting cum cases, 5 days ahead
--------------------------------
Series inside smoother func: 135
Series inside smoother func: 5
Merging with Stringency and Complementary Data
--------------------------------
Merging with Regional Population Data
--------------------------------
No Density Population Data found for: Niedersachsen
Extrapolating Stringency Indices and Static Data
--------------------------------
Calculating the Risk Index
----------------------

In [10]:
# Preview the first five rows of all_areas_germ_risk_index.
all_areas_germ_risk_index.head(n=5)

Unnamed: 0,DATE,pred,ci_lower,ci_upper,pred_acc,pred_vel,real_figures,country,area_name,area_type,npi_stringency_index,tests_new_per_thousand,stats_population_density,stats_population_urban,stats_population,mobility_retail_recreation,mobility_grocery_pharmacy,mobility_parks,mobility_transit_stations,mobility_workplaces,area_population_density,risk_index,risk_index_disc
0,2020-04-01,2929.342517,2902.034596,2956.650438,21.850417,376.020424,3088.0,Germany,"Berlin, Stadt",Landkreis,73.15,,237.016,7977060.0,83783945.0,-52.0,-13.0,20.0,-53.0,-43.0,40.55,99.999999,90 - 100
1,2020-04-02,3300.595107,3273.287194,3327.903019,13.895691,327.958911,3325.0,Germany,"Berlin, Stadt",Landkreis,73.15,,237.016,7977060.0,83783945.0,-54.0,-11.0,6.0,-54.0,-43.0,40.55,99.999992,90 - 100
2,2020-04-03,3538.176092,3510.868188,3565.483996,-5.231364,173.752093,3564.0,Germany,"Berlin, Stadt",Landkreis,73.15,,237.016,7977060.0,83783945.0,-57.0,-11.0,-13.0,-57.0,-44.0,40.55,99.982902,90 - 100
3,2020-04-04,3634.930221,3607.622321,3662.23812,-14.507344,86.996649,3672.0,Germany,"Berlin, Stadt",Landkreis,73.15,,237.016,7977060.0,83783945.0,-63.0,-16.0,21.0,-52.0,-31.0,40.55,98.708029,90 - 100
4,2020-04-05,3631.16474,3603.856842,3658.472637,-12.062141,93.979527,3750.0,Germany,"Berlin, Stadt",Landkreis,73.15,,237.016,7977060.0,83783945.0,-58.0,-13.0,61.0,-47.0,-30.0,40.55,99.085301,90 - 100


In [None]:
# berlin_risk_idx = process_risk_index(kf, germ_cov_cases, 'cum cases', germ_pop_density, stringency_data, 5, 'Germany', 'Landkreis', 'Berlin, Stadt' , False)
        

In [None]:
# Smallest index label of germ_regional_cov19
# (presumably the earliest date covered — TODO confirm the index is date-based).
germ_regional_cov19.index.min()

In [None]:
# Histogram of the 'risk_index' column of all_areas_germ_risk_index.
# Missing values are dropped explicitly so the plot does not depend on how the
# installed matplotlib/numpy versions treat NaN when choosing the bin range.
fig, ax = plt.subplots()
ax.hist(all_areas_germ_risk_index['risk_index'].dropna())
# Label the figure so it can be read on its own; the trailing ';' suppresses
# the textual repr that would otherwise be echoed below the plot.
ax.set(
    title='Distribution of risk_index',
    xlabel='risk_index',
    ylabel='Number of rows',
);

In [12]:
# Show the version of pandas the kernel is running.
pd.__version__

'1.0.5'

In [None]:
# Rows of germ_pop_density whose 'area_name' contains the text 'Württemberg'.
# na=False treats missing names as "no match" (a NaN in the boolean mask would
# otherwise make the indexing raise); regex=False matches the text literally.
germ_pop_density.loc[
    germ_pop_density['area_name'].str.contains('Württemberg', na=False, regex=False)
]

In [None]:
# NOTE(review): this cell relies on `series`, `area_name`, `pop_stats_df` and
# `stringency_merged_df` already existing in the kernel — confirm they are
# defined at notebook level before running it on a fresh kernel.

# Area code of the rows whose 'area_name' equals `area_name` (element at `[0]`).
area_code = series.loc[series['area_name'] == area_name, 'area_code'][0]

# Copy the 'population density' entries for that area code into a new column.
# NOTE(review): assigning a Series aligns on index labels, so rows of
# stringency_merged_df whose labels are absent from the selection become NaN —
# confirm this is intended.
stringency_merged_df['area_population_density'] = pop_stats_df.loc[
    pop_stats_df['area_code'] == area_code, 'population density'
]


In [None]:
# Preview the first five rows of germ_cov_cases.
germ_cov_cases.head(n=5)

In [None]:
# Area code of the first germ_cov_cases row named 'Berlin, Stadt'.
# .iloc[0] is explicitly positional; a bare [0] is a label lookup on an
# integer index and only falls back to position on other index types.
area_code = germ_cov_cases.loc[
    germ_cov_cases['area_name'] == 'Berlin, Stadt', 'area_code'
].iloc[0]

# 'population density' entries recorded for that area code (filtered once).
area_density = germ_pop_density.loc[
    germ_pop_density['area_code'] == area_code, 'population density'
]

# Density divided by 100 when a row exists. Defaults to None so that the final
# display cannot raise NameError, or show a stale value from an earlier run,
# when no density row matches.
toto = area_density.iloc[0] / 100.0 if len(area_density) > 0 else None

toto

In [None]:
# Area code of the first germ_cov_cases row named 'Berlin, Stadt'.
# .iloc[0] is explicitly positional; a bare [0] is a label lookup on an
# integer index and only falls back to position on other index types.
area_code = germ_cov_cases.loc[
    germ_cov_cases['area_name'] == 'Berlin, Stadt', 'area_code'
].iloc[0]

area_code

In [None]:
# First 'population density' value among the germ_pop_density rows whose
# 'area_code' equals the current `area_code`.
germ_pop_density.loc[
    germ_pop_density['area_code'] == area_code, 'population density'
].values[0]

# Neighbours Dataset

In [None]:
# Read the neighbours CSV from the project data-asset directory.
uk_neighbours = pd.read_csv(
    filepath_or_buffer='/project_data/data_asset/df_uk_neighbours_processed.csv'
)

# Preview the first five rows.
uk_neighbours.head(n=5)

In [None]:
# Rows of uk_neighbours whose 'me_name' is exactly 'Dorset'.
uk_neighbours.loc[uk_neighbours['me_name'] == 'Dorset']

## Saving Data to COS

In [15]:
from project_lib import Project

# Obtain a handle on the current project through project_lib.
project = Project.access()

# Serialise all_areas_germ_risk_index to CSV text (row index omitted) and hand
# it to the project's save_data under a fixed file name; overwrite=True is
# passed, so an existing file of that name is presumably replaced.
project.save_data(
    file_name="all_areas_germ_risk_index.csv",
    data=all_areas_germ_risk_index.to_csv(index=False),
    overwrite=True,
)


{'file_name': 'all_areas_germ_risk_index.csv',
 'message': 'File saved to project storage.',
 'asset_id': '376fb530-d4af-4033-a54c-b059e1c6175f'}