# Main

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools
import time
from scipy import stats

In [2]:
# Project root directory. The simulation cell writes scratch files to
# path+'Temp/' and final outputs to path+'Results/'; sub-paths are built by
# plain string concatenation, so the value must end with a slash.
path = 'C:/Users/Meier/Institut für Statistik Dropbox/Johanna Meier/Structural Breaks + DL/Simulation/Python Code/' 
# Alternative locations (uncomment exactly one to switch):
# path = 'C:/Users/Johan/Desktop/Local/'
#path= 'C:/Users/Johan/Dropbox (Institut für Statistik)/Structural Breaks + DL/Simulation/Python Code/'
#path = 'C:/Users/Slave 1/Desktop/Johanna/'

In [3]:
# Experiment selection. Both values also become part of the result file names.
str_model = 'RNN' # network type to train: 'RNN', 'LSTM', 'GRU'
setting = 'AR' # process that is simulated: 'AR', 'ARIMA'

In [4]:
# Repetition i of the simulation seeds NumPy and torch with i.
reps = 1000         # number of repetitions

In [5]:
# Design of the simulated series; tau and fac_beta are handed to
# get_str_sims_breaksonly and sim_data_breaksonly.
sim_length = 1000                              # length of simulated sample
tau = np.array([0.2,0.5,0.8])                 # break locations (presumably fractions of the sample - TODO confirm)
fac_beta = np.array([0.5,1,2])                 # break size factor

In [6]:
# Data split and feature construction.
test_size = 0.1           # proportion of test set (passed to sim_data_breaksonly)
train_size = 0.85         # proportion of training set (passed to split_data_val; presumably relative to the non-test part - TODO confirm)
lags = 1                  # number of lags as features (also used as the network input dimension)

In [7]:
# Weighting scheme of the loss and the matching grid of weighting parameters
# (`alphas`); both are passed to train_loop in the simulation cell.
weight_type = 'bartlett' # 'exponential', 'rayleigh', 'bartlett', 'parzen' , 'tukey-hanning'
if weight_type in ('bartlett', 'parzen', 'tukey-hanning'):
    # kernel weights: grid {T, T^0.95, T^0.9} with T = sim_length*(1-test_size)
    fac = np.array([1, 0.95, 0.9]) # {1, 0.95, 0.9}
    alphas = (sim_length*(1-test_size))**fac
elif weight_type == 'rayleigh':
    # Rayleigh weights use their own grid; previously this case fell through
    # to the exponential grid below unless the values were edited by hand.
    alphas = np.array([2e-5, 5e-5, 1e-4])
else:
    # exponential weights (also the fallback for any other value)
    alphas = np.array([0.005, 0.01, 0.02])

In [8]:
# Show the grid of weighting parameters selected above.
alphas

array([900.        , 640.51659161, 455.8461157 ])

In [9]:
# Coefficients of the simulated process for the chosen `setting`.
# NOTE(review): in the ARIMA case len(params) is 2 while params_list holds 4
# combinations; the simulation cell sizes its loops with len(params) - confirm
# which of the two is intended there.
if setting == 'AR':
    # AR coefficients (phi)
    params = np.array([0.1, 0.4, 0.7, -0.4])
elif setting == 'ARIMA':
    # ARIMA coefficients: first array phi, second array theta
    params = [np.array([0.4, -0.4]), np.array([0.3, -0.3])]
    # all (phi, theta) pairings
    params_list = list(itertools.product(*params))
    print(params_list)

Run external notebooks.

In [10]:
# Execute the companion notebooks so that the names they define are available
# in this kernel. The notebooks are looked up relative to the current working
# directory.
# NOTE(review): `torch` is used in the simulation cell but is not imported in
# this notebook's import cell - presumably one of these notebooks provides it;
# confirm.
%run "Helper_functions.ipynb" # notebook containing helper functions
%run "Simulate_data.ipynb" # notebook containing simulation function
%run "DL_models_WeightedLoss.ipynb" # notebook containing sequential deep learning models

Get names of simulation settings.

In [11]:
# One name per (tau, fac_beta) simulation setting; len(str_sims) sizes the last
# axis of results_best_alphas in the simulation cell.
str_sims = get_str_sims_breaksonly(tau, fac_beta) # get names of simulation settings

In [12]:
# Show the generated setting names.
# NOTE(review): the stored output lists Tau0.1/Tau0.45/Tau0.9 although tau is
# set to [0.2, 0.5, 0.8] above - the output may stem from an earlier
# configuration; re-run to confirm.
str_sims

['Tau0.1Beta0.5',
 'Tau0.1Beta1.0',
 'Tau0.1Beta2.0',
 'Tau0.45Beta0.5',
 'Tau0.45Beta1.0',
 'Tau0.45Beta2.0',
 'Tau0.9Beta0.5',
 'Tau0.9Beta1.0',
 'Tau0.9Beta2.0']

Run simulation.

In [30]:
# Simulation driver: for every repetition, simulate fresh data for all
# settings, train the selected sequential model for each parameter value, and
# checkpoint the raw and aggregated metrics to disk after each repetition.

# fail early (before any file is deleted) if the model type is not supported;
# otherwise `results` would never be assigned inside the loop
if str_model not in ('RNN', 'LSTM', 'GRU'):
    raise ValueError("str_model must be 'RNN', 'LSTM' or 'GRU', got %r" % (str_model,))

# start timer
timer_start = time.time()
print('Simulation start: %s' %time.ctime(int(timer_start)))

# delete all files in Temp folder
emtpy_temp(path+'Temp/')

# selected weighting parameter per repetition x parameter value x simulation setting
results_best_alphas = np.empty([reps,len(params),len(str_sims)])

# model and optimizer settings (the same for every repetition, so set once)
batch_size = 256                  # batch size
input_dim = lags                  # number of lagged features in X
hidden_dim = 10                   # number of hidden nodes per layer
layer_dim = 1                     # number of layers
output_dim = 1                    # output dimension (1 for univariate output)
dropout = 0                       # dropout proportion (only before the last sequential layer)
learning_rate = 1e-3              # learning rate for Adam optimizer
weight_decay = 1e-6               # weight decay for Adam optimizer

# save model parameters in dict
model_params = {
    'input_dim': input_dim,
    'hidden_dim': hidden_dim,
    'layer_dim': layer_dim,
    'output_dim': output_dim,
    'dropout_prob': dropout,
}

# run specified number of repetitions
for i in range(reps):

    # print repetition
    print('Repetition: ',i+1)

    # seed both random number generators with the repetition index
    np.random.seed(i)
    torch.manual_seed(i)

    # delete all simulation files in Temp folder
    del_sim(path+'Temp/')

    # simulate data for given setting and parameters (save csv-files in Temp);
    # the configured `lags` is passed so the simulated features stay in line
    # with input_dim and train_loop (this was hard-coded to 1 before)
    sim_data_breaksonly(
        setting=setting,
        params=params,
        tau=tau,
        fac_beta=fac_beta,
        test_size=test_size,
        l=sim_length,
        path=path+'Temp/',
        lags=lags,
        verbose=False,
    )

    # split all data into train, val, and test (save npz-files in Temp)
    split_data_val(setting=setting, params=params, path=path+'Temp/', train_size=train_size)

    # check simulated data
    #sim = pd.read_csv(path+'Temp/sim1.csv' ,sep=',',na_values = 'NA')
    #data1 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[0])
    #data2 = np.load(path+'Temp/sim2_%s.npz' %sim.columns[1])
    #plt.plot(data1['y_test'])
    #plt.plot(data2['y_test'])
    #plt.show()
    #del data1,data2

    # DL forecast: train one model per parameter value
    results = []
    for j in range(len(params)):
        print('Parameter combination: ', j+1,'/',len(params))
        df_sim, best_alphas = train_loop(
            model_name=str_model,
            model_params=model_params,
            num_sim=j+1,
            str_sims=str_sims,
            path=path,
            lags=lags,
            weight_type=weight_type,
            alphas=alphas,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
        )
        results.append(df_sim)
        results_best_alphas[i,j,:] = best_alphas

    # save intermediate results: append this repetition along axis 1 of the checkpoint
    new_results = np.expand_dims(np.asarray(results),axis=1)
    if i == 0:
        all_results = new_results
    else:
        prev_results = np.load(path+'Temp/interm_results.npy')
        all_results = np.concatenate((prev_results,new_results),axis=1) # shape: no. of params x rep x no. of settings x no. of metrics
        del prev_results
    np.save(path+'Temp/interm_results.npy',all_results)

    # calculate metrics on the array that was just written (no need to read it back)
    arr_mean, arr_std, arr_min, arr_max, arr_median = get_results(all_results) # aggregate over reps (axis 1)
    np.savez(path+'Results/'+setting+'_'+str_model+'_WLoss_'+weight_type+'_results.npz',mean=arr_mean,std=arr_std,minimum=arr_min,maximum=arr_max,median=arr_median)
    del all_results

    # report elapsed time for the first ten repetitions and every 50th afterwards
    if i < 10 or i % 50 == 0:
        print('Elapsed: %s' %time_format(time.time() - timer_start))

# most frequent weighting parameter over the repetitions.
# NOTE(review): stats.mode returns a (mode, count) pair, so np.squeeze stacks
# both along a leading axis of the saved array - confirm that downstream code
# expects the counts as well.
np.save(path+'Results/'+setting+'_'+str_model+'_WLoss_'+weight_type+'_bestalphas.npy', np.squeeze(stats.mode(results_best_alphas,axis=0)))
print('Simulation end: %s' %time.ctime(int(time.time())))

Simulation start: Thu Mar  2 10:03:39 2023
Repetition:  1
Parameter combination:  1 / 4
Dataset:  Tau0.1Beta0.5 ( 1 / 9 )
Dataset:  Tau0.1Beta1.0 ( 2 / 9 )
Dataset:  Tau0.1Beta2.0 ( 3 / 9 )
Dataset:  Tau0.45Beta0.5 ( 4 / 9 )
Dataset:  Tau0.45Beta1.0 ( 5 / 9 )
Dataset:  Tau0.45Beta2.0 ( 6 / 9 )
Dataset:  Tau0.9Beta0.5 ( 7 / 9 )
Dataset:  Tau0.9Beta1.0 ( 8 / 9 )
Dataset:  Tau0.9Beta2.0 ( 9 / 9 )
Parameter combination:  2 / 4
Dataset:  Tau0.1Beta0.5 ( 1 / 9 )
Dataset:  Tau0.1Beta1.0 ( 2 / 9 )
Dataset:  Tau0.1Beta2.0 ( 3 / 9 )
Dataset:  Tau0.45Beta0.5 ( 4 / 9 )
Dataset:  Tau0.45Beta1.0 ( 5 / 9 )
Dataset:  Tau0.45Beta2.0 ( 6 / 9 )
Dataset:  Tau0.9Beta0.5 ( 7 / 9 )
Dataset:  Tau0.9Beta1.0 ( 8 / 9 )
Dataset:  Tau0.9Beta2.0 ( 9 / 9 )
Parameter combination:  3 / 4
Dataset:  Tau0.1Beta0.5 ( 1 / 9 )
Dataset:  Tau0.1Beta1.0 ( 2 / 9 )
Dataset:  Tau0.1Beta2.0 ( 3 / 9 )
Dataset:  Tau0.45Beta0.5 ( 4 / 9 )
Dataset:  Tau0.45Beta1.0 ( 5 / 9 )
Dataset:  Tau0.45Beta2.0 ( 6 / 9 )
Dataset:  Tau0.9Beta0.5 ( 7