In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [2]:
import PullData as pdta
import Methods as mt

### Upload time series

In [3]:
fatality_dict = pdta.GetTimeSeries()
fatality_arr = pdta.FilterDict(fatality_dict, 0.5, 'sb_ged_best')

32


### Methods

X_train = np.arange(36)
y_train = np.arange(6,36)
X_test = np.arange(36-6,39)

reg = mt.EmbedDimRegressor(6)

reg.fit(X_train,y_train)
reg.predict(X_test)

In [4]:
fatality_arr.shape

(32, 359)

### Validation

In [5]:
# truncated indexes - will be clipped from all arrays to allow for up to 36 months lag time 
num_trunc = 72+167

# training samples
num_train = 215-167

# validation samples
num_val = 36

# last 36 indexes
num_test = 36

lag = 1

### Naive

In [6]:
country_dim_mse_naive = np.zeros((fatality_arr.shape[0])) # num countries,num dimensions

for arr_idx in range(fatality_arr.shape[0]):
    
    pred_arr = np.zeros(num_test)
    
    for lag in range(1,num_test+1):
    
        # represents a country
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, 1, num_test)
        actual_arr = fatality_arr[arr_idx,-num_test:]
        x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
        pred = y_train[-1]

        pred_arr[lag-1] = pred

    country_dim_mse_naive[arr_idx] = mean_squared_error(actual_arr,pred_arr)  
    
country_dim_mse_naive.mean()

49851.215277777774

### No Validation (just train and test)

In [None]:
country_dim_mse_test_no_val = np.zeros((fatality_arr.shape[0],36)) # num countries,num dimensions

for arr_idx in range(fatality_arr.shape[0]):
    
    for embed_dim in range(1,37):
    
        # represents a country
        pred_arr = np.zeros(num_test)
        actual_arr = fatality_arr[arr_idx,-num_test:]
        
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
        x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])

        # make one-step-ahead prediction
        for step in range(1,num_test+1):
            
            emb_reg.reg.fit(x_train,y_train) 
            pred = emb_reg.reg.predict(x_test.reshape(1,-1))

            pred_arr[lag-1] = pred #(pred+pred_arr.sum())/lag
            
            x_train,x_test,y_train,y_test = emb_reg.CreatePrequentialData(x_train, y_train, )

        
        country_dim_mse_test_no_val[arr_idx,embed_dim-1] = mean_squared_error(actual_arr,pred_arr) 
        
    print(arr_idx,country_dim_mse_test_no_val.sum()/((arr_idx+1)*36))

0 554122.2185608796
1 277274.21451744105
2 184854.1140756683
3 138751.83598516908
4 111009.25111729337
5 92577.32516800649
6 79382.61896172188
7 78409.8498781642
8 70265.43552323912
9 63249.699194153836
10 57554.79993775957
11 52759.22411913552
12 48706.84538509712
13 58519.421665333786
14 57458.36770769852
15 53909.45145605852
16 50826.697566633244
17 48295.750126349005
18 45753.86912253203
19 58919.774653276574
20 57841.83571634071
21 55212.9640423954
22 53089.7180076219
23 50975.221082417724


### Use Validaton Set to Choose best OVERALL embedding dimension for each country

country_dim_mse_val = np.zeros((fatality_arr.shape[0],36)) # num countries,num dimensions

for arr_idx in range(fatality_arr.shape[0]):
    
    print(arr_idx)
    
    lag = 1
    
    for embed_dim in range(1,37):
        
        # represents a country
        pred_arr = np.zeros(num_val)
        actual_arr = fatality_arr[arr_idx,-num_test-num_val:-num_test]

        # make one-step-ahead prediction
        for val_idx in range(1,37):

            emb_reg = mt.EmbedDimRegressor(num_trunc+val_idx-1, lag, embed_dim, 1)
            x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx,:-val_idx-num_test+1])
            emb_reg.reg.fit(x_train,y_train) 
            pred = emb_reg.reg.predict(x_test.reshape(1,-1))
            pred_arr[num_val-1] = pred #(pred+pred_arr.sum())/lag

            country_dim_mse_val[arr_idx,embed_dim-1] = abs(pred - y_test) #mean_squared_error(actual_arr,pred_arr)  
            

In [None]:
country_dim_mse_val = np.zeros((fatality_arr.shape[0],36,36)) # num countries,num dimensions

for arr_idx in range(fatality_arr.shape[0]):
    
    print(arr_idx)
    
    for embed_dim in range(1,37):
        
        # represents a country
        pred_arr = np.zeros(num_val)
        actual_arr = fatality_arr[arr_idx,-num_test-num_val:-num_test]

        # make one-step-ahead prediction
        for lag in range(1,37):

            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_val)
            x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx,:-num_test])
            emb_reg.reg.fit(x_train,y_train) 
            pred = emb_reg.reg.predict(x_test.reshape(1,-1))
            pred_arr[lag-1] = pred #(pred+pred_arr.sum())/lag

            country_dim_mse_val[arr_idx,lag-1,embed_dim-1] = abs(pred - y_test) #mean_squared_error(actual_arr,pred_arr)  

In [None]:
country_dim_mse_test_wval = np.zeros(fatality_arr.shape[0]) # num countries,num dimensions

for arr_idx in range(fatality_arr.shape[0]):
    
    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx,-num_test:]

    # make one-step-ahead prediction
    for lag in range(1,37):
        embed_dim = country_dim_mse_val[arr_idx,lag-1].argmin()+1
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
        x_train,x_test,y_train,y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
        emb_reg.reg.fit(x_train,y_train) 
        pred = emb_reg.reg.predict(x_test.reshape(1,-1))
        pred_arr[lag-1] = pred #(pred+pred_arr.sum())/lag
        
    #plt.plot(actual_arr)
    #plt.plot(pred_arr)
    #plt.show()
        
    country_dim_mse_test_wval[arr_idx] = mean_squared_error(actual_arr,pred_arr)  
    
    print(arr_idx,country_dim_mse_test_wval.sum()/((arr_idx+1)))

In [None]:
plt.plot(country_dim_mse_test_no_val.mean(1))
plt.plot(country_dim_mse_test_wval)
plt.plot(country_dim_mse_naive)
plt.show()

In [None]:
country_dim_mse_test_wval.mean()

In [None]:
country_dim_mse_test_no_val.mean()