In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [2]:
import PullData as pdta
import Methods as mt

### Upload time series

In [3]:
# Pull the raw time series, then filter them into a 2-D array.
# NOTE(review): the meaning of the 0.5 threshold and of the 'sb_ged_best' key
# is defined inside PullData.FilterDict — confirm there before changing them.
fatality_dict = pdta.GetTimeSeries()
fatality_arr = pdta.FilterDict(
    fatality_dict,
    0.5,
    'sb_ged_best',
)

32


### Methods

In [4]:
# Display the array dimensions; presumably (countries, time steps), since later
# cells index rows as countries and slice columns by time — TODO confirm.
fatality_arr.shape

(32, 359)

### Validation

In [5]:
# Sizes used to split every series into truncated / train / validation / test parts.

# Leading indexes clipped from all arrays (original note: this allows for
# lag times of up to 36 months).
num_trunc = 167 + 72

# Number of training samples.
num_train = 215 - 167

# Validation window, and the test window made of the last 36 indexes.
num_val = num_test = 36

### Naive

In [6]:
# Naive baseline: for every lag, use the last training target as the
# prediction, then score the num_test predictions against the held-out tail.
country_dim_mse_naive = np.zeros((fatality_arr.shape[0])) # one test MSE per country (1-D)

for arr_idx in range(fatality_arr.shape[0]):

    # represents a country
    # Held-out targets are the last num_test indexes; they do not depend on
    # the lag, so they are taken once per country instead of once per lag.
    actual_arr = fatality_arr[arr_idx,-num_test:]
    pred_arr = np.zeros(num_test)

    for lag in range(1,num_test+1):

        # Embedding dimension is fixed at 1; the regressor is never fitted
        # here, it is only used to build the offset train/test arrays.
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, 1, num_test)
        x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])

        # Last element of the training targets produced for this lag
        # (presumably the most recent observed value — TODO confirm in Methods).
        pred = y_train[-1]

        # The prediction for lag k is compared with the k-th held-out value.
        pred_arr[lag-1] = pred

    country_dim_mse_naive[arr_idx] = mean_squared_error(actual_arr,pred_arr)

### No Validation (just train and test)

In [7]:
# Test MSE without a validation step: for every country and every embedding
# dimension, fit one regressor per lag and score the per-lag predictions
# against the held-out tail of the series.
country_dim_mse_test_no_val = np.zeros((fatality_arr.shape[0],36)) # (num countries, num embedding dimensions)

for arr_idx in range(fatality_arr.shape[0]):

    # progress indicator
    if arr_idx%5==0:
        print(arr_idx)

    # represents a country
    # Held-out targets do not depend on the embedding dimension, so they are
    # taken once per country.
    actual_arr = fatality_arr[arr_idx,-num_test:]

    # Embedding dimensions 1..N, where N is the second axis of the result array.
    for embed_dim in range(1,country_dim_mse_test_no_val.shape[1]+1):

        pred_arr = np.zeros(num_test)

        # One prediction per lag; the range follows num_test so that it always
        # matches the length of pred_arr (same bound as the naive baseline).
        for lag in range(1,num_test+1):

            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
            x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train,y_train)
            pred = emb_reg.reg.predict(x_test.reshape(1,-1))

            # predict() is given a single row, so it presumably returns a
            # one-element array (TODO confirm). Extract the scalar explicitly:
            # assigning a size-1 array to a scalar slot is deprecated in NumPy >= 1.25.
            pred_arr[lag-1] = np.asarray(pred).item()

        country_dim_mse_test_no_val[arr_idx,embed_dim-1] = mean_squared_error(actual_arr,pred_arr)

0
5
10
15
20
25
30


### Use Validation Set to Choose Best Embedding Dimension for Each Lag for Each Country

In [9]:
# Validation error for every (country, lag, embedding dimension) combination.
# The last num_test indexes are withheld from the series passed to the
# regressor, so the test window is not used when choosing a dimension.
# NOTE: despite "mse" in the name, each entry is a single absolute error
# |pred - y_test|; the name is kept because a later cell reads this array.
country_dim_mse_val = np.zeros((fatality_arr.shape[0],36,36)) # (num countries, num lags, num embedding dimensions)

for arr_idx in range(fatality_arr.shape[0]):

    # progress indicator
    if arr_idx%5==0:
        print(arr_idx)

    # Embedding dimensions 1..N, where N is the third axis of the result array.
    for embed_dim in range(1,country_dim_mse_val.shape[2]+1):

        # One fitted model per lag; lags 1..M, where M is the second axis.
        for lag in range(1,country_dim_mse_val.shape[1]+1):

            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_val)
            x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx,:-num_test])
            emb_reg.reg.fit(x_train,y_train)
            pred = emb_reg.reg.predict(x_test.reshape(1,-1))

            # abs(pred - y_test) is presumably a one-element array (TODO confirm
            # the shape of y_test). Extract the scalar explicitly: assigning a
            # size-1 array to a scalar slot is deprecated in NumPy >= 1.25.
            country_dim_mse_val[arr_idx,lag-1,embed_dim-1] = np.asarray(abs(pred - y_test)).item()

0
5
10
15
20
25
30


In [10]:
# Test MSE when the embedding dimension is chosen per (country, lag) as the
# one with the smallest validation error stored in country_dim_mse_val.
country_dim_mse_test_wval = np.zeros(fatality_arr.shape[0]) # one test MSE per country (1-D)

for arr_idx in range(fatality_arr.shape[0]):

    # progress indicator
    if arr_idx%5==0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx,-num_test:]

    # One prediction per lag; the range follows num_test so that it always
    # matches the length of pred_arr (same bound as the naive baseline).
    for lag in range(1,num_test+1):
        # argmin is a 0-based position along the embedding-dimension axis,
        # while embedding dimensions start at 1.
        embed_dim = country_dim_mse_val[arr_idx,lag-1].argmin()+1
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
        x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
        emb_reg.reg.fit(x_train,y_train)
        pred = emb_reg.reg.predict(x_test.reshape(1,-1))

        # predict() is given a single row, so it presumably returns a
        # one-element array (TODO confirm). Extract the scalar explicitly:
        # assigning a size-1 array to a scalar slot is deprecated in NumPy >= 1.25.
        pred_arr[lag-1] = np.asarray(pred).item()

    country_dim_mse_test_wval[arr_idx] = mean_squared_error(actual_arr,pred_arr)

0
5
10
15
20
25
30


In [19]:
# Report the mean of each method's MSE array, rounded to the nearest integer.
# The no-validation array is 2-D, so its mean covers every country and every
# embedding dimension.
print('Average MSE by Method\n')
for label, mse_values in (
    ('Use Validation:\t', country_dim_mse_test_wval),
    ('No Validation:\t', country_dim_mse_test_no_val),
    ('Naive:\t\t', country_dim_mse_naive),
):
    print(label, round(mse_values.mean()))

Average MSE by Method

Use Validation:	 38591
No Validation:	 41517
Naive:		 49851
