In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [2]:
import PullData as pdta
import Methods as mt
import EnsembleUtil as eu

### Data Prep, Variable Assignments

In [3]:
# Load the raw time series and filter them into the array used by every cell below.
# NOTE(review): the meaning of the 0.5 argument and of the 'sb_ged_best' key is defined
# in PullData.FilterDict, which is not visible in this notebook - confirm before changing.
fatality_dict = pdta.GetTimeSeries()
# The cells below index fatality_arr as [country, time index]
# (e.g. fatality_arr[arr_idx, -num_test:]).
fatality_arr = pdta.FilterDict(fatality_dict, 0.5, 'sb_ged_best')

32


In [4]:
# Split sizes, expressed as counts of time indexes along each series.

# truncated indexes - will be clipped from all arrays to allow for up to 36 months lag time
# Evaluates to 239; passed as the first argument to mt.EmbedDimRegressor in every cell below.
# NOTE(review): the origin of the 72 and 167 terms is not documented here - confirm.
num_trunc = 72+167

# training samples
# Evaluates to 48; the validation cell allocates num_train+1 training predictions per model.
# NOTE(review): the origin of 215 is not documented here - confirm.
num_train = 215-167

# validation samples (the window immediately before the test window)
num_val = 36

# test samples: the last 36 indexes of each series
num_test = 36

## Methods

### Naive: Use the last training label (y) as the prediction

In [5]:
# Naive baseline: for each country and each lag, the forecast is simply the last
# training label produced by CreateOffsetData for that lag.
n_countries = fatality_arr.shape[0]
country_dim_mse_naive = np.zeros(n_countries)  # one MSE per country

for arr_idx in range(n_countries):

    # held-out test values for this country (the last num_test entries)
    actual_arr = fatality_arr[arr_idx, -num_test:]
    pred_arr = np.zeros(num_test)

    for lag in range(1, num_test + 1):
        # embedding dimension is fixed to 1; only the training labels are needed here
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, 1, num_test)
        x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
        pred = y_train[-1]
        pred_arr[lag - 1] = pred

    country_dim_mse_naive[arr_idx] = mean_squared_error(actual_arr, pred_arr)

### No Validation (just train and test)

In [6]:
# No-validation baseline: for every country and every embedding dimension, fit one
# model per lag and score the resulting num_test forecasts against the held-out
# test values. The train/test split for each lag is built by
# EmbedDimRegressor.CreateOffsetData (defined in Methods.py).
max_embed_dim = 36  # embedding dimensions 1..max_embed_dim are evaluated

country_dim_mse_test_no_val = np.zeros((fatality_arr.shape[0], max_embed_dim))  # num countries, num dimensions

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # held-out test values for this country; identical for every embedding dimension
    actual_arr = fatality_arr[arr_idx, -num_test:]

    for embed_dim in range(1, max_embed_dim + 1):

        pred_arr = np.zeros(num_test)

        # one forecast per lag; the lag range is tied to num_test because
        # pred_arr has exactly num_test slots (same convention as the naive cell)
        for lag in range(1, num_test + 1):

            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train, y_train)
            pred = emb_reg.reg.predict(x_test.reshape(1, -1))

            # Assumes predict() follows the scikit-learn convention of returning an
            # array. Extract the single value explicitly: implicit conversion of a
            # size-1 array to a scalar is deprecated since NumPy 1.25.
            pred_arr[lag - 1] = np.asarray(pred).item()

        country_dim_mse_test_no_val[arr_idx, embed_dim - 1] = mean_squared_error(actual_arr, pred_arr)

0
5
10
15
20
25
30


### Use Validation Set to Choose the Best Embedding Dimension for Each Lag for Each Country

#### Validation Run (some objects will not be used until ensemble section)

In [7]:
# Validation run: with the test window removed from every series, fit one model
# per (country, lag, embedding dimension) and record its validation error and
# predictions. The results drive model selection and the ensemble cells below.
n_countries = fatality_arr.shape[0]
num_lags = num_test   # one model per forecast lag, matching the test horizon
max_embed_dim = 36    # embedding dimensions 1..max_embed_dim are evaluated

# used to find most accurate learners
# NOTE: despite the name, each entry is the absolute error |prediction - target|
# of a single validation prediction, not a mean squared error.
country_dim_mse_val = np.zeros((n_countries, num_lags, max_embed_dim))  # num countries, num lags, num dimensions
# prediction on the held-out validation point; used for the diversity calculation
country_dim_lag_test_preds = np.zeros((n_countries, num_lags, max_embed_dim))  # num countries, num lags, num dimensions
# predictions on the training set; used for the diversity calculation
# (assumes CreateOffsetData yields num_train+1 training rows for every lag/dimension - TODO confirm)
country_dim_lag_train_preds = np.zeros((n_countries, num_lags, max_embed_dim, num_train + 1))  # num countries, num lags, num dimensions, num training rows


for arr_idx in range(n_countries):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    for embed_dim in range(1, max_embed_dim + 1):

        # one validation forecast per lag
        for lag in range(1, num_lags + 1):

            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_val)
            # the last num_test entries are dropped so the test window is never seen here
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx, :-num_test])
            emb_reg.reg.fit(x_train, y_train)
            pred = emb_reg.reg.predict(x_test.reshape(1, -1))
            train_preds = emb_reg.reg.predict(x_train)

            # Extract single values explicitly: implicit conversion of a size-1
            # array to a scalar is deprecated since NumPy 1.25.
            country_dim_mse_val[arr_idx, lag - 1, embed_dim - 1] = np.asarray(abs(pred - y_test)).item()
            country_dim_lag_test_preds[arr_idx, lag - 1, embed_dim - 1] = np.asarray(pred).item()
            country_dim_lag_train_preds[arr_idx, lag - 1, embed_dim - 1] = train_preds

0
5
10
15
20
25
30


#### Test Based on Validation Performance

In [8]:
# Test run with validation-based selection: for each country and lag, pick the
# embedding dimension with the smallest validation error and use that single
# model to forecast the corresponding test point.
country_dim_mse_test_wval = np.zeros(fatality_arr.shape[0])  # num countries

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx, -num_test:]

    # one forecast per lag; the lag range is tied to num_test because pred_arr
    # has exactly num_test slots (same convention as the naive cell)
    for lag in range(1, num_test + 1):
        # +1 converts the 0-based argmin position into a 1-based embedding dimension
        embed_dim = country_dim_mse_val[arr_idx, lag - 1].argmin() + 1
        emb_reg = mt.EmbedDimRegressor(num_trunc, lag, embed_dim, num_test)
        x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
        emb_reg.reg.fit(x_train, y_train)
        pred = emb_reg.reg.predict(x_test.reshape(1, -1))
        # Assumes predict() returns an array (scikit-learn convention). Extract the
        # single value explicitly: implicit size-1-array-to-scalar conversion is
        # deprecated since NumPy 1.25.
        pred_arr[lag - 1] = np.asarray(pred).item()

    country_dim_mse_test_wval[arr_idx] = mean_squared_error(actual_arr, pred_arr)

0
5
10
15
20
25
30


### Performance Summary for Single Learners

In [9]:
# Report the mean test MSE of each single-learner method.
# The "No Validation" figure averages over all countries and all embedding dimensions.
print('Average MSE by Method\n')
for method_label, method_scores in (
    ('Use Validation:\t', country_dim_mse_test_wval),
    ('No Validation:\t', country_dim_mse_test_no_val),
    ('Naive:\t\t', country_dim_mse_naive),
):
    print(method_label, round(method_scores.mean()))

Average MSE by Method

Use Validation:	 39788
No Validation:	 41545
Naive:		 49851


## Ensembles
#### All methods select 3 learners for each country and each lag.
#### We will average the ensemble predictions.

### Most Accurate Learners -- choose 3 most accurate learners

In [11]:
# Most-accurate ensemble: for each country and lag, average the test forecasts of
# the ensemble_size embedding dimensions with the smallest validation error.
country_dim_mse_test_wval_ens = np.zeros(fatality_arr.shape[0])  # num countries
ensemble_size = 3

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx, -num_test:]

    # one ensemble forecast per lag; the lag range is tied to num_test because
    # pred_arr has exactly num_test slots (same convention as the naive cell)
    for lag in range(1, num_test + 1):
        # argsort is ascending, so the first entries are the lowest validation errors;
        # +1 converts 0-based positions into 1-based embedding dimensions
        embed_dim_l = (country_dim_mse_val[arr_idx, lag - 1].argsort()[:ensemble_size] + 1).tolist()
        ensemble_preds = np.zeros(ensemble_size)

        for t_idx, t_embed_dim in enumerate(embed_dim_l):
            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, t_embed_dim, num_test)
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train, y_train)
            t_pred = emb_reg.reg.predict(x_test.reshape(1, -1))
            # Assumes predict() returns an array (scikit-learn convention). Extract the
            # single value explicitly: implicit size-1-array-to-scalar conversion is
            # deprecated since NumPy 1.25.
            ensemble_preds[t_idx] = np.asarray(t_pred).item()

        pred_arr[lag - 1] = ensemble_preds.mean()

    country_dim_mse_test_wval_ens[arr_idx] = mean_squared_error(actual_arr, pred_arr)

0
5
10
15
20
25
30


### Random Ensemble -- choose three randomly (w/o replacement)

In [12]:
# Random ensemble: for each country and lag, average the test forecasts of
# ensemble_size embedding dimensions drawn uniformly at random without replacement.
country_dim_mse_test_no_val_ens = np.zeros((fatality_arr.shape[0]))  # num countries
ensemble_size = 3
max_embed_dim = 36  # embedding dimensions 1..max_embed_dim are candidates
mse_l = []

# Seeded generator so that Restart & Run All reproduces the same ensemble draws.
random_seed = 0
rng = np.random.default_rng(random_seed)

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx, -num_test:]

    # one ensemble forecast per lag; the lag range is tied to num_test because
    # pred_arr has exactly num_test slots (same convention as the naive cell)
    for lag in range(1, num_test + 1):
        # randomly choose the learners; +1 maps 0-based draws to 1-based embedding dimensions
        embed_dim_l = (rng.choice(max_embed_dim, ensemble_size, replace=False) + 1).tolist()
        ensemble_preds = np.zeros(ensemble_size)

        for t_idx, t_embed_dim in enumerate(embed_dim_l):
            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, t_embed_dim, num_test)
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train, y_train)
            t_pred = emb_reg.reg.predict(x_test.reshape(1, -1))
            # Assumes predict() returns an array (scikit-learn convention). Extract the
            # single value explicitly: implicit size-1-array-to-scalar conversion is
            # deprecated since NumPy 1.25.
            ensemble_preds[t_idx] = np.asarray(t_pred).item()

        pred_arr[lag - 1] = ensemble_preds.mean()

    # compute the country's MSE once and reuse it for both the result array and the log
    country_mse = mean_squared_error(actual_arr, pred_arr)
    country_dim_mse_test_no_val_ens[arr_idx] = country_mse
    mse_l.append(country_mse)
    # running mean over the countries processed so far
    print(np.array(mse_l).mean())

0
524133.8493966049
262288.7360382223
174863.75273260413
131258.66872359868
105014.72259681145
5
87579.02927638452
75097.35762044377
74477.4473629012
66750.04402480698
60085.17833693251
10
54670.40449584325
50115.22983940702
46266.19948515839
54388.55637731198
52013.42361740414
15
48802.23220383993
46000.891805904525
43743.122677737476
41440.85352047833
54576.90781070441
20
53582.49975931284
51147.21354132493
49176.50133424429
47221.169072228084
45332.32230933896
25
44146.522727804375
42511.46633047829
41171.06545073783
39754.02783923621
38569.125697403644
30
37324.9603523261
38181.842544797786


### Accuracy-Diversity Measure for Selecting Ensemble Members

### Acc @ Val, Div @ Train: Among the top K learners (ranked by validation accuracy), choose the learners whose training-set predictions are most mutually diverse

In [14]:
# Accuracy/diversity ensemble (diversity measured on training-set predictions):
# for each country and lag, eu.AccuracyDiversityEnsemble receives the validation
# errors and the training-set predictions of all embedding dimensions and returns
# the embedding dimensions to ensemble; their test forecasts are averaged.
country_dim_mse_test_wval_ens_train = np.zeros(fatality_arr.shape[0])  # num countries
ensemble_size = 3
K = 8  # size of the accuracy-ranked candidate pool passed to the selector
mse_l = []

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx, -num_test:]

    # one ensemble forecast per lag; the lag range is tied to num_test because
    # pred_arr has exactly num_test slots (same convention as the naive cell)
    for lag in range(1, num_test + 1):
        # NOTE(review): the returned values are used directly as embedding dimensions,
        # so the helper is presumed to return 1-based dimensions and exactly
        # ensemble_size of them - confirm against EnsembleUtil.
        embed_dim_l = eu.AccuracyDiversityEnsemble(country_dim_mse_val[arr_idx, lag - 1], country_dim_lag_train_preds[arr_idx, lag - 1], K, ensemble_size)
        ensemble_preds = np.zeros(ensemble_size)

        for t_idx, t_embed_dim in enumerate(embed_dim_l):
            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, t_embed_dim, num_test)
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train, y_train)
            t_pred = emb_reg.reg.predict(x_test.reshape(1, -1))
            # Assumes predict() returns an array (scikit-learn convention). Extract the
            # single value explicitly: implicit size-1-array-to-scalar conversion is
            # deprecated since NumPy 1.25.
            ensemble_preds[t_idx] = np.asarray(t_pred).item()

        pred_arr[lag - 1] = ensemble_preds.mean()

    # compute the country's MSE once and reuse it for both the result array and the log
    country_mse = mean_squared_error(actual_arr, pred_arr)
    country_dim_mse_test_wval_ens_train[arr_idx] = country_mse
    mse_l.append(country_mse)
    # running mean over the countries processed so far
    print(np.array(mse_l).mean())
    

0
399190.01885833335
199752.8909823289
133173.32038115317
99989.04841016699
79999.01989969915
5
66733.86062798744
57225.986609759486
58569.743825764286
52447.20239556942
47209.50122142139
10
42976.94456456352
39396.25330647168
36371.5051135024
43497.475406144644
41966.57701350656
15
39376.22660422174
37123.62385377006
35315.402233816916
33456.69696318131
43051.28389528459
20
42694.46584339273
40754.04157018636
39247.31708224538
37695.26752685579
36187.456825781555
25
35329.515618334226
34021.015039877406
33012.78865839725
31876.997575376303
30969.11316842189
30
29970.109517827634
30661.81063144876


### Acc-Div @ Validation: Among the top K learners (ranked by validation accuracy), choose the learners whose validation-set predictions are most mutually diverse

In [15]:
# Accuracy/diversity ensemble (diversity measured on validation predictions):
# for each country and lag, eu.AccuracyDiversityEnsemble receives the validation
# errors and the validation predictions of all embedding dimensions and returns
# the embedding dimensions to ensemble; their test forecasts are averaged.
#
# EnsembleUtil is imported once at the top of the notebook as `eu`; it is
# deliberately not re-imported or reloaded here, so that this cell and the
# training-diversity cell always run against the same version of the helper.
country_dim_mse_test_wval_ens_test = np.zeros(fatality_arr.shape[0])  # num countries
ensemble_size = 3
K = 8  # size of the accuracy-ranked candidate pool passed to the selector
mse_l = []

for arr_idx in range(fatality_arr.shape[0]):

    # lightweight progress indicator
    if arr_idx % 5 == 0:
        print(arr_idx)

    # represents a country
    pred_arr = np.zeros(num_test)
    actual_arr = fatality_arr[arr_idx, -num_test:]

    # one ensemble forecast per lag; the lag range is tied to num_test because
    # pred_arr has exactly num_test slots (same convention as the naive cell)
    for lag in range(1, num_test + 1):
        # country_dim_lag_test_preds holds the predictions made on the validation
        # point (it is filled by the validation run), despite the "test" in its name.
        # NOTE(review): the returned values are used directly as embedding dimensions,
        # so the helper is presumed to return 1-based dimensions and exactly
        # ensemble_size of them - confirm against EnsembleUtil.
        embed_dim_l = eu.AccuracyDiversityEnsemble(country_dim_mse_val[arr_idx, lag - 1], country_dim_lag_test_preds[arr_idx, lag - 1], K, ensemble_size)
        ensemble_preds = np.zeros(ensemble_size)

        for t_idx, t_embed_dim in enumerate(embed_dim_l):
            emb_reg = mt.EmbedDimRegressor(num_trunc, lag, t_embed_dim, num_test)
            x_train, x_test, y_train, y_test = emb_reg.CreateOffsetData(fatality_arr[arr_idx])
            emb_reg.reg.fit(x_train, y_train)
            t_pred = emb_reg.reg.predict(x_test.reshape(1, -1))
            # Assumes predict() returns an array (scikit-learn convention). Extract the
            # single value explicitly: implicit size-1-array-to-scalar conversion is
            # deprecated since NumPy 1.25.
            ensemble_preds[t_idx] = np.asarray(t_pred).item()

        pred_arr[lag - 1] = ensemble_preds.mean()

    # compute the country's MSE once and reuse it for both the result array and the log
    country_mse = mean_squared_error(actual_arr, pred_arr)
    country_dim_mse_test_wval_ens_test[arr_idx] = country_mse
    mse_l.append(country_mse)
    # running mean over the countries processed so far
    print(np.array(mse_l).mean())
    

0
389200.5686580247
194769.86239138307
129851.28010675254
97498.4725763916
78006.56136064793
5
65093.868568456004
55820.525585514326
57495.873692561945
51529.24736163136
46386.26901684997
10
42221.60355788681
38703.85717992492
35732.5154303261
42795.52610296154
42338.97365266436
15
39722.09980626539
37449.75258702057
35679.119904105406
33801.27156515783
43625.0770979339
20
43176.08769989883
41213.897658135225
39695.452483244924
38128.156781215934
36603.0305099673
25
35723.0982305722
34400.02051832878
33362.53851022174
32214.72338176918
31309.997142006505
30
30299.997234199844
31059.026394039665


### Single Learner Summary

In [26]:
# Repeat of the single-learner summary, aligned with the ensemble table below.
# The "No Validation" figure averages over all countries and all embedding dimensions.
print('Average MSE by Method\n')
for method_label, method_scores in (
    ('Use Validation:\t\t\t', country_dim_mse_test_wval),
    ('No Validation:\t\t\t', country_dim_mse_test_no_val),
    ('Naive:\t\t\t\t', country_dim_mse_naive),
):
    print(method_label, round(method_scores.mean()))

Average MSE by Method

Use Validation:			 39788
No Validation:			 41545
Naive:				 49851


### Ensemble Summary

In [23]:
# Report the mean test MSE (averaged over countries) of each ensemble selection method.
print('Average MSE by Ensemble Method\n')
for method_label, method_scores in (
    ('Acc @ Val, Div @ Train:\t\t', country_dim_mse_test_wval_ens_train),
    ('Acc-Div @ Validation:\t\t', country_dim_mse_test_wval_ens_test),
    ('Most Accurate @ Validation:\t', country_dim_mse_test_wval_ens),
    ('Random Choice:\t\t\t', country_dim_mse_test_no_val_ens),
):
    print(method_label, round(method_scores.mean()))

Average MSE by Ensemble Method

Acc @ Val, Div @ Train:		 30662
Acc-Div @ Validation:		 31059
Most Accurate @ Validation:	 32100
Random Choice:			 38182
