# Main

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import time

In [2]:
# Project root folder. Other cells append 'Temp/' and 'Results/' directly to this
# string, so it has to end with a forward slash.
# Alternative locations that were used previously:
#   'C:/Users/Johan/Desktop/Local/'
#   'C:/Users/Johan/Dropbox (Institut für Statistik)/Structural Breaks + DL/Simulation/Python Code/'
path = (
    'C:/Users/Meier/Dropbox (Institut für Statistik)/'
    'Structural Breaks + DL/Simulation/Python Code/'
)

In [3]:
# Forecasting model to evaluate: 'RNN', 'LSTM', 'GRU', 'AR', 'ARIMA' or 'GARCH'
str_model = "RNN"

# Simulation setting used to generate the data: 'AR', 'ARIMA' or 'GARCH'
setting = "AR"

In [4]:
# number of repetitions of the simulation
reps = 1000

# whether to also return average predictions for the sequential DL models
ave = True

# number of sequential DL models to train (one random seed is drawn per model)
no_sdl = 10

In [5]:
# proportion of test set
test_size = 0.1

# length of simulated sample
sim_length = 5000

# break locations
tau = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7])

# break size factor
fac_beta = np.array([0.5, 1, 2])

In [6]:
# proportion of test set
# NOTE(review): test_size is also assigned in the simulation-settings cell; keep
# both values identical, since the same name is passed to sim_data and split_data.
test_size = 0.1

# proportion of validation set
val_size = 0.2

# number of lags as features
lags = 1

In [7]:
if setting == 'AR':
    params = np.array([0.1,0.5,0.9,0.95,0.99]) # AR (phi)
if setting == 'ARIMA':
    params = [np.array([0.1,0.5,0.9,0.95,0.99]),np.array([0.3,-0.3])] # ARIMA (phi, theta)
    params_list = list(itertools.product(params[0],params[1]))
    print(params_list)
if setting == 'GARCH':
    ! pip install arch
    params = [np.array([0.1,0.8]), np.array([0.45,0.45]), np.array([0.8,0.1])] # GARCH (phi,theta)

Run external notebooks.

In [8]:
# Execute the companion notebooks. Later cells call functions that are not defined
# in this notebook (e.g. get_str_sims, sim_data, split_data, train_loop) and are
# presumably provided by these notebooks — TODO confirm which notebook defines which.
%run "Helper_functions.ipynb" # notebook containing helper functions
%run "Simulate_data.ipynb" # notebook containing simulation function
%run "Split_data.ipynb" # notebook containing split function
%run "DL_models.ipynb" # notebook containing sequential deep learning models
%run "AR_model.ipynb" # notebook containing AR model
%run "ARIMA_model.ipynb" # notebook containing ARIMA model
%run "GARCH_model.ipynb" # notebook containing GARCH model

Get names of simulation settings.

In [9]:
# Names of the simulation settings, built from the break locations (tau) and the
# break size factors (fac_beta). get_str_sims is not defined in this notebook;
# presumably it comes from Helper_functions.ipynb — TODO confirm.
str_sims = get_str_sims(tau, fac_beta) # get names of simulation settings

Run simulation.

In [10]:
# Simulation driver.
#
# For every repetition i the loop
#   1. seeds NumPy with i and simulates fresh data into path+'Temp/',
#   2. splits the data into train / validation / test sets,
#   3. runs the forecasting model selected by `str_model` for every parameter
#      combination and collects the metrics in `results`,
#   4. appends the metrics of this repetition to Temp/interm_results.npy and
#      re-aggregates everything obtained so far into Results/<setting>_<model>_results.npz,
#   5. for DL models with `ave` enabled, does the same for the metrics of the
#      seed-averaged predictions (…_ave_results.npz).
# The aggregated files are rewritten after every repetition.

# sequential deep learning models (handled by train_loop)
dl_models = ('RNN', 'LSTM', 'GRU')

# Reject an unsupported model name before any file is deleted or simulated;
# otherwise the first repetition would only fail after the simulation, with a
# NameError on the never-assigned `results`.
if str_model not in dl_models + ('AR', 'ARIMA', 'GARCH'):
    raise ValueError("Unknown str_model '%s'; expected 'RNN', 'LSTM', 'GRU', 'AR', 'ARIMA' or 'GARCH'" % str_model)

# start timer
timer_start = time.time()
print('Simulation start: %s' %time.ctime(int(timer_start)))

# delete all files in Temp folder
emtpy_temp(path+'Temp/')

# run specified number of repetitions
for i in range(reps):

    # print repetition
    print('Repetition: ',i+1)

    # seed (one seed per repetition, so a given repetition index is reproducible)
    np.random.seed(i)

    # delete all simulation files in Temp folder
    del_sim(path+'Temp/')

    # simulate data for given setting and parameters (save csv-files in Temp);
    # `lags` comes from the configuration cell so that the simulated features and
    # the DL input dimension below cannot drift apart
    sim_data(setting=setting, params=params, tau=tau, fac_beta=fac_beta, test_size=test_size, l=sim_length, path=path+'Temp/', lags=lags, verbose=False)

    # split all data into train, val, and test (save npz-files in Temp)
    split_data(setting=setting, params=params, path=path+'Temp/', test_size=test_size, val_size=val_size)

    # check simulated data
    #sim = pd.read_csv(path+'Temp/sim1.csv' ,sep=',',na_values = 'NA')
    #data1 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[0])
    #data2 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[1])
    #plt.plot(data1['y_test'])
    #plt.plot(data2['y_test'])
    #plt.show()
    #del data1,data2

    # NOTE(review): the DL, AR and GARCH branches loop over len(params). With
    # setting == 'ARIMA', `params` holds two grids (phi, theta), so len(params) is 2
    # while the ARIMA branch loops over all (phi, theta) pairs — confirm that
    # combining setting 'ARIMA' with a non-ARIMA model covers every simulated file.

    # DL forecast
    # if DL: get combination forecasts
    if str_model in dl_models:

        # random seed for each model
        seed = np.random.randint(low=1,high=10000000,size=no_sdl) # number of models to run per dataset(random seeds)

        # set parameters
        input_dim = lags                  # number of lagged features in X
        hidden_dim = 10                   # number of hidden nodes per layer
        layer_dim = 1                     # number of layers
        output_dim = 1                    # output dimension (1 for univariate output)
        dropout = 0                       # dropout proportion (only before the last sequential layer)
        # learning_rate and weight_decay are not passed to train_loop below;
        # presumably they are read as notebook globals — TODO confirm
        learning_rate = 1e-3              # learning rate for Adam optimizer
        weight_decay = 1e-6               # weight decay for Adam optimizer

        # save model parameters in dict
        model_params = {'input_dim': input_dim, 'hidden_dim' : hidden_dim,'layer_dim' : layer_dim, 'output_dim' : output_dim, 'dropout_prob' : dropout}

        # train model
        results = []
        ave_results = []
        for j in range(len(params)):
            print('Parameter combination: ', j+1,'/',len(params))
            results_seeds, sim_preds, best_seeds = train_loop(model_name=str_model,model_params=model_params,num_sim=j+1,str_sims=str_sims,path=path,seed=seed)
            # for every setting pick the metrics of its best seed
            results.append(np.squeeze(results_seeds[[np.arange(0,len(str_sims))],list(map(int, best_seeds)),:])) # shape: 1 x no. of settings x no. of metrics

            if ave:
                # average the predictions over axis 1 once; the result is the same
                # for every setting, so it is computed outside the loop
                mean_preds = np.mean(sim_preds,axis=1)
                ave_results_sets = []
                for s in range(len(str_sims)):
                    ave_preds = mean_preds[s,:,:]
                    # column 0 is used as observed value, column 1 as prediction
                    df_ave = pd.DataFrame(data={"value": ave_preds[:,0], "prediction": ave_preds[:,1]})
                    ave_result_metrics = calculate_metrics(df_ave)
                    df_ave_metrics = pd.DataFrame(np.expand_dims((ave_result_metrics['rmse'],ave_result_metrics['mae'],ave_result_metrics['mape'],ave_result_metrics['r2'],),axis=0),columns=['rmse','mae','mape','r2'])
                    ave_results_sets.append(df_ave_metrics)
                ave_results_sets = np.expand_dims(np.squeeze(np.asarray(ave_results_sets)),axis=0) # shape: 1 x no. of settings x no. of metrics
                ave_results.append(ave_results_sets)

    # forecast AR model
    elif str_model == 'AR':

        results = []
        for j in range(len(params)):
            print('Parameter combination: ', j+1,'/',len(params))
            df_sim = train_loop_ar(path=path,num_sim=j+1,str_sims=str_sims)
            results.append(df_sim)

    # forecast ARIMA model
    elif str_model == 'ARIMA':

        # one simulation per (phi, theta) pair
        params_list = list(itertools.product(params[0],params[1]))

        results = []
        for j in range(len(params_list)):
            print('Parameter combination: ', j+1,'/',len(params_list))
            df_sim = train_loop_arima(path=path,num_sim=j+1,str_sims=str_sims)
            results.append(df_sim)

    # forecast GARCH model
    elif str_model == 'GARCH':

        results = []
        for j in range(len(params)):
            print('Parameter combination: ', j+1,'/',len(params))
            df_sim = train_loop_garch(path=path,num_sim=j+1,str_sims=str_sims)
            results.append(df_sim)

    # save intermediate results (a new axis 1 is inserted as the repetition axis)
    new_results = np.expand_dims(np.asarray(results),axis=1)
    if i==0:
        np.save(path+'Temp/interm_results.npy',new_results)
    else:
        prev_results = np.load(path+'Temp/interm_results.npy')
        all_results = np.concatenate((prev_results,new_results),axis=1) # shape: no. of params x rep x no. of settings x no. of metrics
        np.save(path+'Temp/interm_results.npy',all_results)
        del prev_results

    # calculate metrics
    arr_results = np.load(path+'Temp/interm_results.npy')
    arr_mean, arr_std, arr_min, arr_max, arr_median = get_results(arr_results) # aggregate over reps (axis 1)
    np.savez(path+'Results/'+setting+'_'+str_model+'_results.npz',mean=arr_mean,std=arr_std,minimum=arr_min,maximum=arr_max,median=arr_median)
    del arr_results

    # if predictions averaging is required
    if ave and str_model in dl_models:

        # save intermediate results
        new_ave_results = np.asarray(ave_results)
        if i==0:
            np.save(path+'Temp/interm_ave_results.npy',new_ave_results)
        else:
            prev_ave_results = np.load(path+'Temp/interm_ave_results.npy')
            all_ave_results = np.concatenate((prev_ave_results,new_ave_results),axis=1) # shape: no. of params x rep x no. of settings x no. of metrics
            np.save(path+'Temp/interm_ave_results.npy',all_ave_results)
            del prev_ave_results

        # calculate metrics
        arr_ave_results = np.load(path+'Temp/interm_ave_results.npy')
        arr_mean, arr_std, arr_min, arr_max, arr_median = get_results(arr_ave_results) # aggregate over reps (axis 1)
        np.savez(path+'Results/'+setting+'_'+str_model+'_ave_results.npz',mean=arr_mean,std=arr_std,minimum=arr_min,maximum=arr_max,median=arr_median) # shape: no. of parameters x no. of settings x no. of metrics
        del arr_ave_results

    # report elapsed time after each of the first ten repetitions, then every 50th
    if i < 10 or i % 50 == 0:
        print('Elapsed: %s' %time_format(time.time() - timer_start))

print('Simulation end: %s' %time.ctime(int(time.time())))

Simulation start: Wed Jul 27 08:04:27 2022
Repetition:  1
Parameter combination:  1 / 5
No. of dataset:  1 / 9
No. of dataset:  2 / 9
No. of dataset:  3 / 9
No. of dataset:  4 / 9
No. of dataset:  5 / 9
No. of dataset:  6 / 9
No. of dataset:  7 / 9
No. of dataset:  8 / 9
No. of dataset:  9 / 9
Parameter combination:  2 / 5
No. of dataset:  1 / 9
No. of dataset:  2 / 9
No. of dataset:  3 / 9
No. of dataset:  4 / 9
No. of dataset:  5 / 9
No. of dataset:  6 / 9
No. of dataset:  7 / 9
No. of dataset:  8 / 9
No. of dataset:  9 / 9
(2, 1, 9, 4)
(2, 1, 9, 4)
(1, 9, 4) (1, 9, 4) (1, 9, 4) (1, 9, 4) (1, 9, 4)
Elapsed: 01H 36m 41s
Repetition:  2
Parameter combination:  1 / 5
No. of dataset:  1 / 9
No. of dataset:  2 / 9
No. of dataset:  3 / 9
No. of dataset:  4 / 9
No. of dataset:  5 / 9
No. of dataset:  6 / 9
No. of dataset:  7 / 9
No. of dataset:  8 / 9
No. of dataset:  9 / 9
Parameter combination:  2 / 5
No. of dataset:  1 / 9
No. of dataset:  2 / 9
No. of dataset:  3 / 9
No. of dataset:  4 /

In [11]:
# Sanity check: print the shapes of the aggregated arrays stored in the results
# archive of the current setting / model combination.
with np.load(path + 'Results/' + setting + '_' + str_model + '_results.npz') as data:
    print(*(data[key].shape for key in ('mean', 'std', 'minimum', 'maximum')))

[[[0.99020052 0.79739594 2.21365619 0.13579388]
  [0.99007449 0.79726627 2.22717369 0.13606413]
  [0.98820113 0.79596564 2.21812564 0.13952391]
  [0.9871878  0.79457909 2.31536847 0.14138716]
  [0.98917747 0.79742649 2.18271893 0.13878147]
  [0.98957937 0.79691237 2.23571348 0.13701659]
  [0.98651938 0.79483235 2.17000651 0.14287008]
  [0.98911013 0.79722783 2.29028457 0.1386913 ]
  [0.98944224 0.79665351 2.19767529 0.13787704]]

 [[0.97448882 0.77168274 3.61348051 0.14021159]
  [0.974397   0.77172256 3.63116562 0.14044712]
  [0.97618454 0.77367303 3.62402135 0.13700554]
  [0.97224579 0.77065495 3.93686593 0.14387428]
  [0.97270912 0.77162951 4.05365837 0.14294248]
  [0.97487431 0.77200225 3.54940224 0.13974162]
  [0.97465634 0.77152598 3.52071697 0.13978017]
  [0.97184184 0.77101457 4.24833047 0.14455148]
  [0.97144308 0.77014136 4.14704412 0.14540709]]] [[[6.97308364e-03 8.17865133e-03 7.49534369e-01 1.42930159e-01]
  [7.13916609e-03 8.27178359e-03 7.65524507e-01 1.42602072e-01]
  [7