# Main

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import time

In [2]:
path = 'C:/Users/Meier/Dropbox (Institut für Statistik)/Structural Breaks + DL/Simulation/Python Code/' 
# path = 'C:/Users/Johan/Desktop/Local/'
#path= 'C:/Users/Johan/Dropbox (Institut für Statistik)/Structural Breaks + DL/Simulation/Python Code/'
#path = 'C:/Users/Slave 1/Desktop/Johanna/'

In [3]:
str_model = 'RNN' # 'RNN', 'LSTM', 'GRU'
setting = 'AR' # 'AR', 'ARIMA', 'GARCH'

In [4]:
reps = 2         # number of repetitions
no_sdl = 2       # number of sequential DL models to train

In [5]:
test_size = 0.1                                # proportion of test set
sim_length = 5000                              # length of simulated sample
tau = np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7])  # break locations
#tau = np.array([0.1,0.2])  # break locations
fac_beta = np.array([0.5,1,2])                 # break size factor
#fac_beta = np.array([0.5,1])                 # break size factor

In [6]:
alpha=0.999
batch_size = 64

In [7]:
test_size = 0.1           # proportion of test set
val_size = 0.2            # proportion of validation set
lags = 1                  # number of lags as features

In [8]:
if setting == 'AR':
    params = np.array([0.1,0.5,0.9,0.95,0.99]) # AR (phi)
if setting == 'ARIMA':
    params = [np.array([0.1,0.5,0.9,0.95,0.99]),np.array([0.3,-0.3])] # ARIMA (phi, theta)
    params_list = list(itertools.product(params[0],params[1]))
    print(params_list)
if setting == 'GARCH':
    ! pip install arch
    params = [np.array([0.1,0.8]), np.array([0.45,0.45]), np.array([0.8,0.1])] # GARCH (phi,theta)

Run external notebooks.

In [9]:
%run "Helper_functions.ipynb" # notebook containing helper functions
%run "Simulate_data.ipynb" # notebook containing simulation function
%run "Split_data.ipynb" # notebook containing split function
%run "DL_models_EWSLoss.ipynb" # notebook containing sequential deep learning models

Get names of simulation settings.

In [10]:
str_sims = get_str_sims(tau, fac_beta) # get names of simulation settings

Run simulation.

In [11]:
# start timer
timer_start = time.time()
print('Simulation start: %s' %time.ctime(int(timer_start)))

# delete all files in Temp folder
emtpy_temp(path+'Temp/')

# run specified number of repetitions
for i in range(reps):
    
    # print repetition
    print('Repetition: ',i+1)
    
    # seed
    np.random.seed(i)
    
    # delete all simulation files in Temp folder
    del_sim(path+'Temp/')
    
    # simulate data for given setting and parameters(save csv-files in Temp)
    sim_data(setting=setting, params=params, tau=tau, fac_beta=fac_beta, test_size=test_size, l=sim_length, path=path+'Temp/', lags=1, verbose=False)
    
    # split all data into train, val, and test (save nzp-files in Temp)
    split_data(setting=setting, params=params, path=path+'Temp/', test_size=test_size, val_size=val_size)
    
    # check simulated data
    #sim = pd.read_csv(path+'Temp/sim1.csv' ,sep=',',na_values = 'NA')
    #data1 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[0])
    #data2 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[1])
    #plt.plot(data1['y_test'])
    #plt.plot(data2['y_test'])
    #plt.show()
    #del data1,data2
             
    # DL forecast
    # if DL: get combination forecasts
    if str_model == 'RNN' or str_model =='LSTM' or str_model =='GRU':
        
        #%run "DL_models.ipynb" # notebook containing sequential deep learning models
        
        # random seed for each model
        seed = np.random.randint(low=1,high=10000000,size=no_sdl) # number of models to run per dataset(random seeds)
        
        # set parameters
        input_dim = lags                  # number of lagged features in X
        hidden_dim = 10                   # number of hidden nodes per layer
        layer_dim = 1                     # number of layers
        output_dim = 1                    # output dimension (1 for univariate output)
        dropout = 0                       # dropout proportion (only before the last sequential layer)
        learning_rate = 1e-3              # learning rate for Adam optimizer
        weight_decay = 1e-6               # weight decay for Adam optimizer

        # save model parameters in dict
        model_params = {'input_dim': input_dim, 'hidden_dim' : hidden_dim,'layer_dim' : layer_dim, 'output_dim' : output_dim, 'dropout_prob' : dropout}
        
        # train model
        results = []
        ave_results = []
        for j in range(len(params)):
        #for j in range(2):
            print('Parameter combination: ', j+1,'/',len(params))
            results_seeds, sim_preds, best_seeds = train_loop(model_name=str_model,model_params=model_params,num_sim=j+1,str_sims=str_sims,path=path, lags = lags, seed=seed, alpha=alpha, batch_size = batch_size)
            results.append(np.squeeze(results_seeds[[np.arange(0,len(str_sims))],list(map(int, best_seeds)),:])) # shape: 1 x no. of settings x no. of metrics     
    
    # save intermediate results
    new_results = np.expand_dims(np.asarray(results),axis=1)
    if i==0:
        np.save(path+'Temp/interm_results.npy',new_results)
    else:
        prev_results = np.load(path+'Temp/interm_results.npy')
        all_results = np.concatenate((prev_results,new_results),axis=1) # shape: no. of params x rep x no. of settings x no. of metrics
        np.save(path+'Temp/interm_results.npy',all_results)
        del prev_results
    
    # calculate metrics
    arr_results = np.load(path+'Temp/interm_results.npy')
    arr_mean, arr_std, arr_min, arr_max, arr_median = get_results(arr_results) # aggregate over reps (axis 1)  
    np.savez(path+'Results/'+setting+'_'+str_model+'_EWSLoss_'+str(alpha)+'_'+str(batch_size)+'_results.npz',mean=arr_mean,std=arr_std,minimum=arr_min,maximum=arr_max,median=arr_median)
    del arr_results
    
    if (i < 10) | (i % 50 == 0):
        print('Elapsed: %s' %time_format(time.time() - timer_start))

print('Simulation end: %s' %time.ctime(int(time.time())))

Simulation start: Thu Jul 28 14:35:24 2022
Repetition:  1
Parameter combination:  1 / 5
No. of dataset:  1 / 7
{'rmse': 0.9819123177141645, 'mae': 0.7891803, 'mape': 1.3639995, 'r2': -0.004443916834422579}
{'rmse': 0.9828968454626066, 'mae': 0.7882456, 'mape': 1.5193719, 'r2': -0.006459120312276578}
No. of dataset:  2 / 7
{'rmse': 0.9823380542588184, 'mae': 0.7893557, 'mape': 1.3823245, 'r2': -0.005315026138023038}
{'rmse': 0.9819616980492936, 'mae': 0.78768885, 'mape': 1.4931709, 'r2': -0.004544901633303677}
No. of dataset:  3 / 7
{'rmse': 0.9816481954675065, 'mae': 0.7890432, 'mape': 1.3550012, 'r2': -0.003903627484119987}
{'rmse': 0.982328406678725, 'mae': 0.7882923, 'mape': 1.4709934, 'r2': -0.005295311972384642}
No. of dataset:  4 / 7
{'rmse': 0.9813729786194277, 'mae': 0.7888992, 'mape': 1.3450646, 'r2': -0.0033407183186289213}
{'rmse': 0.9820786894854512, 'mae': 0.78783053, 'mape': 1.4909924, 'r2': -0.004784253339322531}
No. of dataset:  5 / 7
{'rmse': 0.9810737172095662, 'mae':

In [12]:
with np.load(path+'Results/'+setting+'_'+str_model+'_results.npz') as data:
    print(data['mean'].shape, data['std'].shape, data['minimum'].shape, data['maximum'].shape)

(2, 9, 4) (2, 9, 4) (2, 9, 4) (2, 9, 4)
