### Import the necessary variables

In [595]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import math
import seaborn as sns
import impyute as impy

### Jupyter configuration

In [596]:
pd.set_option('display.max_rows', 500)
%matplotlib inline

### Load the dataset for all the countries

In [597]:
def parser(s):
    return datetime.strptime(s, '%Y-%m-%d')

In [598]:
all_countries_dataset = pd.read_csv('all_countries_dataset.csv')

### List all available variables

In [599]:
all_countries_dataset.columns.tolist()

['Unnamed: 0',
 'iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'total_boosters',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_

### Fix variables datatypes

In [600]:
# drop cloumn Unnamed: 0

if 'Unnamed: 0' in all_countries_dataset.columns:
    all_countries_dataset = all_countries_dataset.drop(columns=['Unnamed: 0'])

# Fixing date column datatype
all_countries_dataset['date'] = pd.to_datetime(all_countries_dataset['date']);
# all_countries_dataset['location'] = all_countries_dataset['location'].astype('category');
# all_countries_dataset['continent'] = all_countries_dataset['continent'].astype('category');

# all_countries_dataset.set_index('date', inplace=True)

### Exclude unnecessary continents

In [601]:
rule = np.logical_or(all_countries_dataset['continent'] == 'Europe', all_countries_dataset['location'] == 'United States')
dataset = all_countries_dataset[rule]

### Select only the relevant variables that can be used

In report will be needed to be explained why we excluded those variables

In [602]:
variables = [
    'continent',
    'location',
    'date',
    'new_cases',
    'new_deaths',
    'icu_patients',
    'new_tests',
    'positive_rate',
    'people_vaccinated',
    'new_vaccinations',
    'total_boosters',
    'stringency_index',
    'population',
    'population_density',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'human_development_index'
 ]

Exclude the unnecessary variables

In [603]:
dataset = dataset[variables]

### Detect the microcountrie (countries that have a population of less than 500 000)

In [604]:
countries = dataset['location'].unique()

population_threshold = 500000
micro_countries = []

for country in countries:
    country_population = dataset[dataset['location'] == country]['population'].max()
    if country_population < population_threshold:
        micro_countries.append(country)

print(len(micro_countries))
micro_countries

11


['Andorra',
 'Faeroe Islands',
 'Gibraltar',
 'Guernsey',
 'Iceland',
 'Isle of Man',
 'Jersey',
 'Liechtenstein',
 'Monaco',
 'San Marino',
 'Vatican']

### Exclude the microcountries

In [605]:
# exclude the micro countries
dataset = dataset[~dataset['location'].isin(micro_countries)]

### Feature scaling

In [606]:
# Perform feature scaling

# take only numerical variables
# numerical_variables = [
#     'new_cases',
#     'new_deaths',
#     'reproduction_rate',
#     'icu_patients',
#     'hosp_patients',
#     'new_tests',
#     'positive_rate',
#     'people_vaccinated',
#     'new_vaccinations',
#     'total_boosters',
#     'stringency_index',
#     'population',
#     'population_density',
#     'cardiovasc_death_rate',
#     'diabetes_prevalence',
#     'human_development_index'
# ]

# # take non-numerical variables
# non_numerical_variables = [
#     'continent',
#     'location',
#     'date'
# ]

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()

# obj_dataset = dataset[non_numerical_variables].copy()
# num_dataset = dataset[numerical_variables].copy()
# num_dataset_scaled = sc.fit_transform(num_dataset)
# dataset = pd.concat([obj_dataset, pd.DataFrame(data=num_dataset_scaled, columns=numerical_variables)], axis=1)

# dataset.dtypes


### Replace negative values

In [607]:
# removed records where new_cases is negative
# for each numerical variable 
# if new_cases is negative, set it to 0
def remove_negative_values(dataset):
    new_dataset = dataset.copy()
    for variable in variables:
        if variable in ['location', 'continent', 'date']:
            continue
        for index, row in new_dataset.iterrows():
            if index == 0:
                continue
            if row[variable] < 0 and index in new_dataset.index:
                new_dataset.at[index, variable] = new_dataset.at[index - 1, variable]
    return new_dataset

In [608]:
cleaned_dataset = remove_negative_values(dataset)

### Replace first non missing values

In [609]:
def replace_backwards(new_dataset, index, variable, value_to_replace_with):
    while index in new_dataset.index:
        new_dataset.at[index, variable] = value_to_replace_with
        index -= 1

# replace first missing values with first non missing value
def replace_first_missing_values(dataset):
    new_dataset = dataset.copy()
    
    for variable in ['population', 'population_density', 'cardiovasc_death_rate', 'diabetes_prevalence', 'human_development_index']:
        for index, row in dataset.iterrows():
            if not pd.isna(row[variable]):
                replace_backwards(new_dataset, index, variable, row[variable])
                break
    
    for variable in ['new_vaccinations', 'people_vaccinated', 'total_boosters', 'icu_patients', 'new_tests', 'new_cases', 'new_deaths', 'positive_rate', 'stringency_index']:
        for index, row in dataset.iterrows():
            if not pd.isna(row[variable]):
                replace_backwards(new_dataset, index, variable, 0)
                break

    return new_dataset

In [610]:
cleaned_dataset = replace_first_missing_values(cleaned_dataset)

### Replace missing values


In [611]:
def next_non_missing_value(dataset, index, variable):
    next_index = index
    while next_index in dataset.index and pd.isna(dataset.loc[next_index, variable]):
        next_index += 1
    if next_index not in dataset.index:
        return (next_index - 1, -1)
    return (index, dataset.loc[next_index, variable])

def last_non_missing_value(dataset, index, variable):
    last_index = index
    while last_index in dataset.index and pd.isna(dataset.loc[last_index, variable]):
        last_index -= 1
    if last_index not in dataset.index:
        return (last_index + 1, -1)
    return (index, dataset.loc[last_index, variable])

def replace_missing_values(dataset):
    df = dataset.copy()
    new_dataset = pd.DataFrame()
    for country in df['location'].unique():
        country_dataset = df[df['location'] == country]
        for variable in variables:
            if variable in ['location', 'continent', 'date']:
                continue
            found_non_missing = False
            for index, row in country_dataset.iterrows():
                if pd.isna(row[variable]):
                    if found_non_missing and (index - 1) in country_dataset.index and (index + 1) in country_dataset.index:
                        next = next_non_missing_value(country_dataset, index, variable)
                        last = last_non_missing_value(country_dataset, index, variable)
                        if next[1] == -1 or last[1] == -1:
                            continue
                        new_val = (next[1] - last[1]) / 2
                        country_dataset.loc[index, variable] = last[1] + new_val
                else:
                    found_non_missing = True
        new_dataset = pd.concat([new_dataset, country_dataset])
    return new_dataset

In [612]:
cleaned_dataset = replace_missing_values(cleaned_dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


### Remove anomalies

In [613]:
def split_dataframe(a, n): 
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

In [614]:
def get_mean(dataset, variable):
    sum = 0
    max = dataset[variable].max()
    if pd.isna(max):
        return 0
    count = 0
    for index, row in dataset.iterrows():
        if (~pd.isna(row[variable]) and row[variable] < max) == True:
            sum = sum + row[variable]
            count = count + 1
    if sum == 0:
        return 0
    return sum / count

In [615]:
def remove_anomalies(dataset):
    new_dataset = pd.DataFrame()
    for country in dataset['location'].unique():
        new_country_data = pd.DataFrame()
        country_data = dataset[dataset['location'] == country]
        dataset_chunks = split_dataframe(country_data, 25)

        for chunk in dataset_chunks:
            for variable in variables:
                if variable in ['location', 'continent', 'date']:
                    continue
                anomaly_indexes = chunk[chunk[variable] > chunk[variable].quantile(0.99)].index
                for index in anomaly_indexes:
                    if index - 1 in chunk.index:
                        chunk.at[index, variable] = chunk.at[index - 1, variable]
            new_country_data = new_country_data.append(chunk)
        
        new_dataset = new_dataset.append(new_country_data)

    return new_dataset

In [616]:
cleaned_dataset = remove_anomalies(cleaned_dataset)

### Show the coverage percentage for hosp_patiens and icu_patients

In [617]:
def show_coverage(dataset, variables):
    for country in dataset['location'].unique():
        for variable in variables:
            count = 0
            country_data = dataset[dataset['location'] == country]
            for index, row in country_data.iterrows():
                if not pd.isna(row[variable]):
                    count = count + 1
            covered = count / country_data.shape[0] * 100
            print("{}, {} = {} %".format(country, variable, covered))
        print("\n")

In [618]:
show_coverage(cleaned_dataset, ['icu_patients'])

Albania, icu_patients = 0.0 %


Austria, icu_patients = 98.7012987012987 %


Belarus, icu_patients = 0.0 %


Belgium, icu_patients = 92.4646781789639 %


Bosnia and Herzegovina, icu_patients = 0.0 %


Bulgaria, icu_patients = 93.87417218543047 %


Croatia, icu_patients = 0.0 %


Cyprus, icu_patients = 98.50993377483444 %


Czechia, icu_patients = 97.57281553398059 %


Denmark, icu_patients = 89.2018779342723 %


Estonia, icu_patients = 95.141065830721 %


Finland, icu_patients = 90.6687402799378 %


France, icu_patients = 98.30246913580247 %


Germany, icu_patients = 90.54263565891473 %


Greece, icu_patients = 0.0 %


Hungary, icu_patients = 0.0 %


Ireland, icu_patients = 94.28104575163398 %


Italy, icu_patients = 95.00780031201248 %


Kosovo, icu_patients = 0.0 %


Latvia, icu_patients = 0.0 %


Lithuania, icu_patients = 0.0 %


Luxembourg, icu_patients = 98.70340356564019 %


Malta, icu_patients = 98.35255354200989 %


Moldova, icu_patients = 0.0 %


Montenegro, icu_patients = 0.0

Remove countries that don't have data for icu_patients variable

In [619]:
def remove_countries_without_data(dataset, variables):
    new_dataset = pd.DataFrame()

    for country in dataset['location'].unique():
        for variable in variables:
            count = 0
            country_data = dataset[dataset['location'] == country]
            for index, row in country_data.iterrows():
                if not pd.isna(row[variable]):
                    count = count + 1
            covered = count / country_data.shape[0] * 100
            if covered != 0:
                new_dataset = new_dataset.append(country_data)
    return new_dataset

In [620]:
cleaned_dataset = remove_countries_without_data(cleaned_dataset, ['icu_patients'])

In [621]:
cleaned_dataset['location'].unique()

array(['Austria', 'Belgium', 'Bulgaria', 'Cyprus', 'Czechia', 'Denmark',
       'Estonia', 'Finland', 'France', 'Germany', 'Ireland', 'Italy',
       'Luxembourg', 'Malta', 'Netherlands', 'Portugal', 'Romania',
       'Serbia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland',
       'United Kingdom', 'United States'], dtype=object)

In [622]:
cleaned_dataset.to_csv('cleaned_dataset.csv')

In [623]:
cleaned_dataset.corr()

Unnamed: 0,new_cases,new_deaths,icu_patients,new_tests,positive_rate,people_vaccinated,new_vaccinations,total_boosters,stringency_index,population,population_density,cardiovasc_death_rate,diabetes_prevalence,human_development_index
new_cases,1.0,0.820677,0.90023,0.815766,0.129381,0.499717,0.490728,0.441331,0.138078,0.681693,-0.09077,-0.077306,0.24494,0.083081
new_deaths,0.820677,1.0,0.872676,0.731426,0.162336,0.40251,0.540087,0.512688,0.232848,0.724571,-0.099187,-0.074018,0.258373,0.071535
icu_patients,0.90023,0.872676,1.0,0.806052,0.122777,0.600376,0.532213,0.488099,0.184524,0.848064,-0.109645,-0.087014,0.2957,0.084737
new_tests,0.815766,0.731426,0.806052,1.0,-0.039072,0.690078,0.705803,0.55671,0.145869,0.78456,-0.086071,-0.165494,0.195498,0.16155
positive_rate,0.129381,0.162336,0.122777,-0.039072,1.0,-0.067981,-0.058312,0.08962,0.240261,0.019895,-0.088177,0.283651,0.108422,-0.217305
people_vaccinated,0.499717,0.40251,0.600376,0.690078,-0.067981,1.0,0.590791,0.673678,-0.011384,0.84556,-0.105187,-0.133422,0.285961,0.139127
new_vaccinations,0.490728,0.540087,0.532213,0.705803,-0.058312,0.590791,1.0,0.319658,0.130647,0.753459,-0.106249,-0.154487,0.255182,0.142347
total_boosters,0.441331,0.512688,0.488099,0.55671,0.08962,0.673678,0.319658,1.0,-0.048601,0.618741,-0.092905,-1.2e-05,0.282262,0.040666
stringency_index,0.138078,0.232848,0.184524,0.145869,0.240261,-0.011384,0.130647,-0.048601,1.0,0.109724,0.019635,-0.138047,0.061515,0.032648
population,0.681693,0.724571,0.848064,0.78456,0.019895,0.84556,0.753459,0.618741,0.109724,1.0,-0.13302,-0.137692,0.35296,0.143607


### Plots

In [624]:
def plot_vars_per_country(dataset, cleaned_dataset):
    for continent in dataset['continent'].unique():
        print(continent)

        continent_data = dataset[dataset['continent'] == continent]
        continent_data_cleaned = cleaned_dataset[cleaned_dataset['continent'] == continent]

        for variable in variables:
            if variable == 'location' or variable == 'continent' or variable == 'date':
                continue
            continent_countries = continent_data['location'].unique()

            # set fig size
            plt.figure(figsize=(50, 20))

            for country in continent_countries:
                country_data = continent_data[continent_data['location'] == country]
                plt.plot(country_data['date'], country_data[variable], label=country)

            plt.legend()
            plt.title("{} - {}".format(continent, variable))
            plt.show()

            # set fig size
            plt.figure(figsize=(50, 20))

            for country in continent_countries:
                country_data_cleaned = continent_data_cleaned[continent_data_cleaned['location'] == country]
                plt.plot(country_data_cleaned['date'], country_data_cleaned[variable], label=country)

            plt.legend()
            plt.title("{} - {}".format(continent, variable))
            plt.show()


In [625]:
# plot_vars_per_country(dataset, cleaned_dataset)

VARMAX

In [626]:
# dataset - diff_dataset
def inverse_differenciate(dataset):
    # for variable in dataset.columns:
    #     if variable == 'location' or variable == 'continent' or variable == 'date':
    #         continue
    #     dataset[variable] = cleaned_dataset[variable].shift(1) + dataset[variable]

    for variable in dataset.columns:
        if variable == 'location' or variable == 'continent' or variable == 'date':
            continue
        dataset[variable] = pd.Series(np.r_[cleaned_dataset[variable], dataset[variable]].cumsum())

    return dataset

In [627]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


In [628]:
# take only numerical variables
numerical_variables = [
    'new_cases',
    'new_deaths',
    'icu_patients',
    'new_tests',
    'positive_rate',
    'people_vaccinated',
    'new_vaccinations',
    'total_boosters',
    'stringency_index',
    'population',
    'population_density',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'human_development_index'
]

# take non-numerical variables
non_numerical_variables = [
    'continent',
    'location',
    'date'
]

varmax_dataset = cleaned_dataset.copy()

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

obj_dataset = varmax_dataset[non_numerical_variables].copy()
num_dataset = varmax_dataset[numerical_variables].copy()
# num_dataset = sc.fit_transform(num_dataset)

num_dataset = pd.DataFrame(num_dataset, columns=numerical_variables)


num_dataset = np.log(num_dataset)

# Take First Difference to Remove Trend
num_dataset = num_dataset.diff()

num_dataset = num_dataset.diff()

# Remove Increasing Volatility
# num_dataset = num_dataset.groupby(num_dataset.index.year).std()

varmax_dataset = pd.concat([obj_dataset, pd.DataFrame(data=num_dataset, columns=numerical_variables)], axis=1)

varmax_dataset.dtypes

continent                          object
location                           object
date                       datetime64[ns]
new_cases                         float64
new_deaths                        float64
icu_patients                      float64
new_tests                         float64
positive_rate                     float64
people_vaccinated                 float64
new_vaccinations                  float64
total_boosters                    float64
stringency_index                  float64
population                        float64
population_density                float64
cardiovasc_death_rate             float64
diabetes_prevalence               float64
human_development_index           float64
dtype: object

### Split into traing and test sets

In [629]:
varmax_dataset.index = pd.to_datetime(varmax_dataset.date)
varmax_dataset.index.freq = varmax_dataset.index.inferred_freq

In [630]:
def replace_nans(dataset):
    # for each country
    new_dataset = pd.DataFrame()

    for country in dataset['location'].unique():
        country_dataset = dataset[dataset['location'] == country]

        for variable in numerical_variables:
            for index, row in country_dataset.iterrows():
                if pd.isna(row[variable]) or row[variable] == float('inf') or row[variable] == float('-inf'):
                    previous_timestamp = index - pd.Timedelta(days=1)
                    if previous_timestamp in country_dataset.index and pd.isna(country_dataset.loc[previous_timestamp, variable]) == False:
                        country_dataset.at[index, variable] = country_dataset.at[previous_timestamp, variable]
                    else:
                        country_dataset.at[index, variable] = 0
                        
        new_dataset = pd.concat([new_dataset, country_dataset], axis=0)

    return new_dataset

In [631]:
varmax_dataset = replace_nans(varmax_dataset)

In [632]:
# split into train and test
# training_date_limit = date(2021, 8, 1)

# varmax_dataset.index = pd.to_datetime(varmax_dataset.date)
# varmax_dataset.index.freq = varmax_dataset.index.inferred_freq

# varmax_train_dataset = varmax_dataset[varmax_dataset['date'].dt.date < training_date_limit]
# varmax_test_dataset = varmax_dataset[varmax_dataset['date'].dt.date >= training_date_limit]

varmax_train_dataset = varmax_dataset

In [633]:
import statsmodels.api as sm

exogeneous_variables = [
    'population',
    'population_density',
    'diabetes_prevalence',
    'human_development_index'
]

endogeneous_variables = [
    'icu_patients',
    'cardiovasc_death_rate',
    'new_cases',
    'new_deaths',
    'new_tests',
    'positive_rate',
    'people_vaccinated',
    'new_vaccinations',
    'total_boosters',
    'stringency_index'
]

In [634]:
print(varmax_train_dataset.location.unique())

['Austria' 'Belgium' 'Bulgaria' 'Cyprus' 'Czechia' 'Denmark' 'Estonia'
 'Finland' 'France' 'Germany' 'Ireland' 'Italy' 'Luxembourg' 'Malta'
 'Netherlands' 'Portugal' 'Romania' 'Serbia' 'Slovenia' 'Spain' 'Sweden'
 'Switzerland' 'United Kingdom' 'United States']


### ACF and PACF

https://www.youtube.com/watch?v=CAT0Y66nPhs&ab_channel=DataScienceShow

In [635]:
print(len(numerical_variables))

14


In [636]:
import matplotlib.pyplot as plt

def plot_acf_per_country(dataset):
    for country in dataset['location'].unique():
        country_dataset = dataset[dataset['location'] == country]

        print('\n')
        fig = plt.figure(figsize=(30, 15))
        for index, variable in enumerate(numerical_variables):
            # add plot_acf to subplot for each variable
            ax = fig.add_subplot(4, 4, index + 1)
            plot_acf(country_dataset[variable], ax=ax, lags=300)
            ax.set_title(variable)

        fig.suptitle(country)
        fig.show()

def plot_pacf_per_country(dataset):
    for country in dataset['location'].unique():
        country_dataset = dataset[dataset['location'] == country]

        print('\n')
        fig = plt.figure(figsize=(30, 15))
        for index, variable in enumerate(numerical_variables):
            # add plot_acf to subplot for each variable
            ax = fig.add_subplot(4, 4, index + 1)
            plot_pacf(country_dataset[variable], ax=ax)
            ax.set_title(variable)

        fig.suptitle(country)
        fig.show()

In [637]:
# plot_acf_per_country(varmax_train_dataset)

In [638]:
us_dataset = varmax_train_dataset[varmax_train_dataset['location'] == 'United States']
us_dataset.corr()

Unnamed: 0,new_cases,new_deaths,icu_patients,new_tests,positive_rate,people_vaccinated,new_vaccinations,total_boosters,stringency_index,population,population_density,cardiovasc_death_rate,diabetes_prevalence,human_development_index
new_cases,1.0,0.650277,0.192434,0.468756,0.293593,0.021477,0.461784,0.083258,0.113182,0.0,0.0,0.0,0.0,0.0
new_deaths,0.650277,1.0,0.153058,0.535546,0.177722,0.050461,0.39375,0.096533,-0.064895,0.0,0.0,0.0,0.0,0.0
icu_patients,0.192434,0.153058,1.0,0.217086,0.104043,-0.002714,0.132009,0.08839,0.011421,0.0,0.0,0.0,0.0,0.0
new_tests,0.468756,0.535546,0.217086,1.0,0.150764,0.066305,0.636182,0.112549,0.119339,0.0,0.0,0.0,0.0,0.0
positive_rate,0.293593,0.177722,0.104043,0.150764,1.0,-0.018443,0.184778,0.004384,0.060601,0.0,0.0,0.0,0.0,0.0
people_vaccinated,0.021477,0.050461,-0.002714,0.066305,-0.018443,1.0,0.132687,0.003556,0.000233,0.0,0.0,0.0,0.0,0.0
new_vaccinations,0.461784,0.39375,0.132009,0.636182,0.184778,0.132687,1.0,0.099483,0.052408,0.0,0.0,0.0,0.0,0.0
total_boosters,0.083258,0.096533,0.08839,0.112549,0.004384,0.003556,0.099483,1.0,0.001039,0.0,0.0,0.0,0.0,0.0
stringency_index,0.113182,-0.064895,0.011421,0.119339,0.060601,0.000233,0.052408,0.001039,1.0,0.0,0.0,0.0,0.0,0.0
population,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,1.0,-1.0


### Dickey-fuller test

In [639]:
# augmented dickey-fuller test
from statsmodels.tsa.stattools import adfuller

def dickey_fuller_test(dataset):
    for country in dataset.location.unique():
        country_dataset = varmax_train_dataset[varmax_train_dataset['location'] == country]
        print("{}".format(country))
        for variable in numerical_variables:
            print("{}".format(variable))

            X = np.asarray(country_dataset[variable])
            result = adfuller(X)

            print('ADF Statistic: %f' % result[0])
            print('p-value: %f' % result[1])
            print('Critical Values:')
            for key, value in result[4].items():
                print('\t%s: %.3f' % (key, value))

            print('\n')

        print('\n')
        print("=============================")


In [640]:
# dickey_fuller_test(varmax_train_dataset)

In [641]:
mod = sm.tsa.VARMAX(np.asarray(varmax_train_dataset[endogeneous_variables]), np.asarray(varmax_train_dataset[exogeneous_variables]), order=(1, 0))

In [642]:
res = mod.fit(disp=True)
res.summary()

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          205     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.36532D+00    |proj g|=  4.95567D+01

At iterate    1    f= -1.39598D+00    |proj g|=  1.28284D+01

At iterate    2    f= -1.45310D+00    |proj g|=  1.13836D+01

At iterate    3    f= -1.51399D+00    |proj g|=  7.72106D+00

At iterate    4    f= -1.51743D+00    |proj g|=  4.59032D+00

At iterate    5    f= -1.52307D+00    |proj g|=  1.14529D+00

At iterate    6    f= -1.52382D+00    |proj g|=  7.12609D-01

At iterate    7    f= -1.52597D+00    |proj g|=  1.84839D+00

At iterate    8    f= -1.53474D+00    |proj g|=  4.64125D+00

At iterate    9    f= -1.55104D+00    |proj g|=  6.85359D+00

At iterate   10    f= -1.57133D+00    |proj g|=  6.35501D+00

At iterate   11    f= -1.57855D+00    |proj g|=  1.00117D+01

At iterate   12    f= -1.59654D+00    |proj g|=  3.65196D+00

At iterate   13    f= -1.5




At iterate   50    f= -1.68189D+00    |proj g|=  2.17314D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  205     50     59      1     0     0   2.173D-01  -1.682D+00
  F =  -1.6818859771134078     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


0,1,2,3
Dep. Variable:,"['y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7', 'y8', 'y9', 'y10']",No. Observations:,15061.0
Model:,VARX(1),Log Likelihood,25330.885
,+ intercept,AIC,-50251.769
Date:,"Sun, 26 Dec 2021",BIC,-48689.697
Time:,22:07:22,HQIC,-49733.599
Sample:,0,,
,- 15061,,
Covariance Type:,opg,,

0,1,2,3
Ljung-Box (L1) (Q):,"3.23, 3054.41, 102.85, 20.43, 747.69, 507.98, 37.80, 153.37, 279.50, 411.69",Jarque-Bera (JB):,"504419.44, 711287274.38, 301417.04, 5293.04, 143494.57, 44873905.93, 3266869627.36, 3866017.35, 903816034.29, 10589228.52"
Prob(Q):,"0.07, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00",Prob(JB):,"0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00"
Heteroskedasticity (H):,"0.44, 0.44, 0.37, 1.07, 0.65, 1.00, 0.63, 1.74, 0.11, 0.84",Skew:,"0.24, 2.36, -0.51, 0.37, 2.11, 5.33, 2.47, 4.58, 17.21, -0.91"
Prob(H) (two-sided):,"0.00, 0.00, 0.00, 0.01, 0.00, 0.92, 0.00, 0.00, 0.00, 0.00",Kurtosis:,"31.35, 1067.63, 24.89, 5.81, 17.52, 270.20, 2284.62, 80.95, 1202.61, 132.89"

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0066,0.009,-0.731,0.465,-0.024,0.011
L1.y1,-0.0595,0.003,-23.518,0.000,-0.065,-0.055
L1.y2,0.0008,11.884,6.8e-05,1.000,-23.292,23.293
L1.y3,0.0008,0.002,0.380,0.704,-0.003,0.005
L1.y4,-0.0025,0.003,-0.795,0.427,-0.009,0.004
L1.y5,-0.0110,0.012,-0.940,0.347,-0.034,0.012
L1.y6,0.0148,0.007,2.136,0.033,0.001,0.028
L1.y7,-0.0036,0.093,-0.039,0.969,-0.186,0.179
L1.y8,-0.0002,0.004,-0.051,0.960,-0.008,0.008

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0001,0.011,0.011,0.991,-0.022,0.022
L1.y1,2.468e-05,0.123,0.000,1.000,-0.241,0.241
L1.y2,-0.1634,0.012,-13.964,0.000,-0.186,-0.140
L1.y3,-2.246e-05,0.008,-0.003,0.998,-0.016,0.016
L1.y4,-3.315e-05,0.021,-0.002,0.999,-0.042,0.042
L1.y5,-0.0002,0.015,-0.011,0.991,-0.030,0.029
L1.y6,4.843e-05,0.153,0.000,1.000,-0.301,0.301
L1.y7,0.0001,0.340,0.000,1.000,-0.666,0.666
L1.y8,-4.391e-05,0.012,-0.004,0.997,-0.023,0.023

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0350,0.016,-2.185,0.029,-0.066,-0.004
L1.y1,-0.0092,0.166,-0.056,0.956,-0.335,0.316
L1.y2,0.1642,14.244,0.012,0.991,-27.754,28.082
L1.y3,0.1881,0.010,18.605,0.000,0.168,0.208
L1.y4,-0.0016,0.028,-0.056,0.956,-0.057,0.054
L1.y5,0.2064,0.020,10.389,0.000,0.168,0.245
L1.y6,-0.7430,0.177,-4.199,0.000,-1.090,-0.396
L1.y7,-0.0218,0.487,-0.045,0.964,-0.977,0.933
L1.y8,0.0611,0.018,3.363,0.001,0.026,0.097

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0511,0.057,0.898,0.369,-0.060,0.163
L1.y1,-0.0020,0.083,-0.024,0.981,-0.165,0.161
L1.y2,0.0318,24.201,0.001,0.999,-47.401,47.465
L1.y3,-0.0419,0.016,-2.586,0.010,-0.074,-0.010
L1.y4,0.1084,0.015,7.096,0.000,0.078,0.138
L1.y5,-0.0400,0.077,-0.520,0.603,-0.191,0.111
L1.y6,-0.0667,0.077,-0.865,0.387,-0.218,0.084
L1.y7,0.0663,0.303,0.218,0.827,-0.528,0.661
L1.y8,-0.0065,0.022,-0.297,0.766,-0.050,0.036

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0019,0.011,-0.168,0.867,-0.024,0.020
L1.y1,0.0285,0.022,1.274,0.203,-0.015,0.072
L1.y2,-0.0139,17.855,-0.001,0.999,-35.009,34.981
L1.y3,-0.0276,0.005,-5.729,0.000,-0.037,-0.018
L1.y4,0.0194,0.006,3.491,0.000,0.008,0.030
L1.y5,-0.5347,0.014,-38.748,0.000,-0.562,-0.508
L1.y6,0.0676,0.042,1.605,0.109,-0.015,0.150
L1.y7,0.0676,0.129,0.522,0.602,-0.186,0.321
L1.y8,0.0308,0.006,4.965,0.000,0.019,0.043

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0009,0.002,-0.397,0.691,-0.005,0.004
L1.y1,0.0022,0.004,0.508,0.612,-0.006,0.011
L1.y2,-0.0064,12.752,-0.001,1.000,-25.000,24.987
L1.y3,0.0014,0.001,1.414,0.157,-0.001,0.003
L1.y4,-9.169e-05,0.001,-0.077,0.939,-0.002,0.002
L1.y5,-0.0022,0.003,-0.728,0.466,-0.008,0.004
L1.y6,-0.5618,0.002,-321.272,0.000,-0.565,-0.558
L1.y7,-0.0022,0.035,-0.064,0.949,-0.071,0.066
L1.y8,0.0011,0.002,0.533,0.594,-0.003,0.005

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0017,0.001,-1.115,0.265,-0.005,0.001
L1.y1,-0.0016,0.031,-0.053,0.958,-0.063,0.060
L1.y2,-0.0073,2.038,-0.004,0.997,-4.002,3.987
L1.y3,-0.0002,0.002,-0.070,0.944,-0.005,0.004
L1.y4,0.0005,0.002,0.295,0.768,-0.003,0.003
L1.y5,0.0013,0.002,0.627,0.531,-0.003,0.005
L1.y6,-0.0027,0.019,-0.146,0.884,-0.040,0.034
L1.y7,-0.3430,0.014,-25.127,0.000,-0.370,-0.316
L1.y8,-8.499e-05,0.001,-0.097,0.922,-0.002,0.002

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0070,0.012,-0.570,0.569,-0.031,0.017
L1.y1,0.0707,0.077,0.915,0.360,-0.081,0.222
L1.y2,-0.0075,12.194,-0.001,1.000,-23.908,23.893
L1.y3,-0.0283,0.012,-2.413,0.016,-0.051,-0.005
L1.y4,0.0208,0.008,2.592,0.010,0.005,0.037
L1.y5,0.0071,0.016,0.457,0.648,-0.023,0.038
L1.y6,0.0310,0.084,0.368,0.713,-0.134,0.196
L1.y7,-0.0921,0.065,-1.417,0.156,-0.220,0.035
L1.y8,-0.4488,0.004,-102.575,0.000,-0.457,-0.440

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0006,0.001,-0.677,0.499,-0.002,0.001
L1.y1,-0.0007,0.007,-0.096,0.924,-0.014,0.013
L1.y2,-0.0007,15.790,-4.61e-05,1.000,-30.949,30.948
L1.y3,-0.0002,0.001,-0.128,0.898,-0.003,0.002
L1.y4,0.0004,0.001,0.536,0.592,-0.001,0.002
L1.y5,-0.0002,0.001,-0.206,0.837,-0.002,0.002
L1.y6,-0.0013,0.008,-0.163,0.870,-0.017,0.015
L1.y7,-0.0065,0.030,-0.220,0.826,-0.064,0.051
L1.y8,0.0013,0.001,1.923,0.055,-2.58e-05,0.003

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.0001,0.001,-0.195,0.846,-0.002,0.001
L1.y1,0.0060,0.004,1.481,0.139,-0.002,0.014
L1.y2,0.0549,0.022,2.477,0.013,0.011,0.098
L1.y3,-0.0007,0.000,-1.492,0.136,-0.002,0.000
L1.y4,0.0012,0.001,1.450,0.147,-0.000,0.003
L1.y5,-0.0006,0.001,-0.523,0.601,-0.003,0.002
L1.y6,0.0031,0.004,0.702,0.483,-0.006,0.012
L1.y7,-0.0015,0.039,-0.037,0.971,-0.079,0.076
L1.y8,0.0011,0.001,0.896,0.370,-0.001,0.004

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sqrt.var.y1,0.1729,0.000,635.459,0.000,0.172,0.173
sqrt.cov.y1.y2,5.115e-07,0.245,2.09e-06,1.000,-0.479,0.479
sqrt.var.y2,0.0248,0.000,116.379,0.000,0.024,0.025
sqrt.cov.y1.y3,0.0241,0.301,0.080,0.936,-0.566,0.614
sqrt.cov.y2.y3,-0.0305,0.595,-0.051,0.959,-1.197,1.136
sqrt.var.y3,1.1566,0.002,490.650,0.000,1.152,1.161
sqrt.cov.y1.y4,-0.0107,0.016,-0.656,0.512,-0.042,0.021
sqrt.cov.y2.y4,-0.0005,10.854,-4.71e-05,1.000,-21.275,21.274
sqrt.cov.y3.y4,0.0641,0.013,4.831,0.000,0.038,0.090


In [673]:
# set index to be date
varmax_train_dataset.index = pd.to_datetime(varmax_train_dataset.date)
varmax_train_dataset.index.freq = varmax_train_dataset.index.inferred_freq

varmax_train_dataset

Unnamed: 0_level_0,continent,location,date,new_cases,new_deaths,icu_patients,new_tests,positive_rate,people_vaccinated,new_vaccinations,total_boosters,stringency_index,population,population_density,cardiovasc_death_rate,diabetes_prevalence,human_development_index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-02-25,Europe,Austria,2020-02-25,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-26,Europe,Austria,2020-02-26,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-27,Europe,Austria,2020-02-27,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-28,Europe,Austria,2020-02-28,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-29,Europe,Austria,2020-02-29,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-28,North America,United States,2021-10-28,-0.464198,-0.820675,-0.009224,-0.457954,-0.117783,-0.000108,-0.080922,-0.006731,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-29,North America,United States,2021-10-29,0.463834,0.268763,-0.126087,-0.457954,-0.117783,0.000016,0.078808,-0.002860,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-30,North America,United States,2021-10-30,-1.420847,-1.668878,0.128016,-0.457954,-0.117783,-0.000828,-1.215143,-0.032110,0.0,0.0,0.0,0.0,0.0,0.0
2021-10-31,North America,United States,2021-10-31,0.690956,1.165380,0.009860,-0.457954,-0.117783,-0.000365,-1.290287,-0.011137,0.0,0.0,0.0,0.0,0.0,0.0


In [687]:
res.predict(
    endog=np.asarray(varmax_train_dataset[varmax_train_dataset['location'] == 'United States'][endogeneous_variables]),
    exog=np.asarray(varmax_train_dataset[varmax_train_dataset['location'] == 'United States'][exogeneous_variables])
)



array([[-6.43757308e-03,  1.10345078e-04, -4.28357446e-02, ...,
        -3.35503723e-03, -3.81286801e-04, -5.58184542e-05],
       [-6.63919411e-03,  1.28793769e-04, -3.49951359e-02, ...,
        -6.99518702e-03, -5.99199363e-04, -1.43271713e-04],
       [-6.63919411e-03,  1.28793769e-04, -3.49951359e-02, ...,
        -6.99518702e-03, -5.99199363e-04, -1.43271713e-04],
       ...,
       [ 3.84783573e-03,  1.81331223e-04,  5.03624768e-02, ...,
        -6.53665586e-02,  1.19672501e-03, -8.79799878e-04],
       [-7.66452347e-03,  4.07820831e-04, -3.87071651e-01, ...,
         5.50017424e-01,  1.22577952e-02, -1.91997651e-03],
       [-6.01957304e-03,  2.26047718e-04,  5.46212746e-03, ...,
         5.72027414e-01,  3.46790593e-03, -7.26984914e-04]])

In [686]:
np.asarray(varmax_train_dataset[varmax_train_dataset['location'] == 'United States'][endogeneous_variables])

array([[ 0.00000000e+00,  2.12725702e-01,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00, -2.12725702e-01,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.28016142e-01,  0.00000000e+00, -1.42084716e+00, ...,
        -1.21514251e+00, -3.21102621e-02,  0.00000000e+00],
       [ 9.85962396e-03,  0.00000000e+00,  6.90956256e-01, ...,
        -1.29028662e+00, -1.11367196e-02,  0.00000000e+00],
       [-1.05700117e-02,  0.00000000e+00,  2.43891026e+00, ...,
        -1.29028662e+00, -6.96352710e-04,  0.00000000e+00]])

In [656]:
# prediction = res.get_prediction(start=pd.to_datetime(varmax_train_dataset['date'].max()), dynamic=False)
# prediction_mean = prediction.predicted_mean
# prediction_mean = pd.DataFrame(prediction_mean, columns=endogeneous_variables)

KeyError: 'The `start` argument could not be matched to a location related to the index of the data.'

### AutoARIMA

In [None]:
# get the data only for United States
us_dataset = varmax_train_dataset[varmax_train_dataset['location'] == 'United States']

# import autoarima 
from pmdarima.arima import auto_arima
from pmdarima.arima import ADFTest

# adf_test = ADFTest(alpha=0.05)
# adf_test = adf_test.should_diff(us_dataset['new_deaths'])

constant_variables = [
    'population',
    'population_density',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'human_development_index'
]

# pq = {}

# all_country_p_sum = 0
# all_country_q_sum = 0
# for country in varmax_train_dataset['location'].unique():
#     country_train_dataset = varmax_train_dataset[varmax_train_dataset['location'] == country]
#     print("Using auto_arima for {}".format(country))

#     p_sum = 0
#     q_sum = 0

#     country_p_q = {}

#     for variable in numerical_variables:
#         if variable in constant_variables:
#             continue
#         print("Using auto_arima for {}".format(variable))
#         arima_model = auto_arima(country_train_dataset[variable], start_p=0, start_q=0,
#                                     test='adf',
#                                     max_p=5,
#                                     max_q=5,
#                                     m=4,
#                                     trace=True,
#                                     error_action='warn')
#         arima_model.summary()

#         country_p_q[variable] = {
#             'p': arima_model.order[0],
#             'q': arima_model.order[1]
#         }

#         # get p and q from the summary
#         p_sum += arima_model.order[0]
#         q_sum += arima_model.order[1]
    
#         print('\n')

#     pq[country] = country_p_q
    

#     print('\n')

#     p_mean = p_sum / len(numerical_variables)
#     q_mean = q_sum / len(numerical_variables)
#     all_country_p_sum = all_country_p_sum + p_mean
#     all_country_q_sum = all_country_q_sum + q_mean

#     print("p_mean: {}".format(p_mean))
#     print("q_mean: {}".format(q_mean))
#     print('\n')
#     print("=============================")

# print("all_country_p_mean: {}".format(all_country_p_sum / len(varmax_train_dataset['location'].unique())))
# print("all_country_q_mean: {}".format(all_country_q_sum/ len(varmax_train_dataset['location'].unique())))

# print(pq)

In [None]:
# arima_model.scoring(us_dataset_test['new_deaths'])

In [None]:
# plot_vars_per_country(dataset, varmax_dataset)