# Simulate data

In [2]:
import itertools
import numpy as np
import pandas as pd
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.api import SARIMAX
from arch.univariate import arch_model
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
from sklearn.model_selection import train_test_split

Functions to simulate data:

In [56]:
def sim_data(setting, params, tau, fac_beta, test_size, l, path, lags=1, verbose=False):
    """Simulate series with a single mean break and write them to CSV files.

    One series is simulated per parameter combination, scaled with
    ``MaxAbsScaler`` and stored as column ``Ref``. For every break location
    ``t`` in ``tau`` the following columns are appended:

    * ``Tau<t>Ref``: the reference series with every observation before
      ``break_loc - lags`` replaced by NaN;
    * ``Tau<t>Beta<fb>`` for every ``fb`` in ``fac_beta``: the reference
      series with the constant ``fb * std(series[lags:])`` added to the
      observations ``lags:break_loc``.

    The frame of the k-th parameter combination (k counted from 1) is written
    to ``path + 'sim<k>.csv'`` with NaN stored as ``NA``.

    Parameters
    ----------
    setting : str
        ``'AR'``, ``'ARIMA'`` or ``'GARCH'``.
    params : sequence
        ``'AR'``: iterable of coefficients ``phi``; each is passed to
        ``ArmaProcess`` as the AR lag polynomial ``[1, -phi]``.
        ``'ARIMA'``: pair of iterables whose Cartesian product is used; each
        pair ``(a, b)`` is passed to ``SARIMAX.simulate`` as ``[a, 1, b]``.
        ``'GARCH'``: iterable of pairs ``(a, b)`` passed to the arch model's
        ``simulate`` as ``[a, 0, b]``.
    tau : iterable of float
        Break locations; see the formula for ``break_loc`` below.
    fac_beta : iterable of float
        Break sizes as multiples of the standard deviation of the scaled
        series (computed on ``series[lags:]``).
    test_size : float
        Enters the break location as ``int((1 - test_size) * t * l + lags)``.
    l : int
        Number of simulated observations. Note that the ``'AR'`` setting
        simulates ``l + lags`` observations while the other settings simulate
        ``l``.
    path : str
        Prefix (directory including trailing separator) of the output files.
    lags : int, optional
        Number of leading observations reserved for lag features.
    verbose : bool, optional
        If true, print diagnostics and plot the series. Plotting uses the
        name ``plt``, which is not imported in the visible import cell and is
        assumed to be ``matplotlib.pyplot`` made available elsewhere.

    Raises
    ------
    ValueError
        If ``setting`` is not one of the supported names and there is at
        least one parameter combination to simulate.
    """

    # zip parameter combinations if necessary
    if setting == 'ARIMA':
        params_list = list(itertools.product(params[0], params[1]))
    else:
        params_list = params

    # loop over all parameter combinations (num_ref numbers the output files)
    for num_ref, param in enumerate(params_list, start=1):

        # simulate data without break
        if setting == 'AR':
            model = ArmaProcess(np.array([1, -param]), np.array([1]))
            series = model.generate_sample(nsample=l+lags)
        elif setting == 'ARIMA':
            model = SARIMAX(np.zeros(l), order=(1,1,1), initialization='diffuse')
            series = model.simulate([param[0], 1, param[1]], l)
        elif setting == 'GARCH':
            model = arch_model(None, mean='Zero', vol='Garch', p=1, q=1, dist="Normal")
            series = np.array(model.simulate([param[0], 0, param[1]], l).data)
        else:
            raise ValueError(
                "setting must be 'AR', 'ARIMA' or 'GARCH', got %r" % (setting,)
            )

        # scale the data to [-1,1]
        series = MaxAbsScaler().fit_transform(series.reshape(-1, 1))
        if verbose:
            print(np.min(series), np.max(series))
            plt.plot(series, color='green')

        # the break size only depends on the reference series, so the
        # standard deviation is computed once per simulated series
        series_std = np.std(series[lags:])

        # columns are collected and assembled into one frame at the end
        # instead of growing a DataFrame with repeated concatenation
        col_names = ['Ref']
        col_values = [series]

        # loop over break locations
        for t in tau:

            # location at specified proportion of training set
            # (excluding the observations needed for lags)
            break_loc = int((1-test_size)*t*l+lags)

            # save simulated series after breakpoint
            series2 = np.full_like(series, np.nan)
            series2[(break_loc-lags):] = series[(break_loc-lags):]
            col_names.append('Tau'+str(t)+'Ref')
            col_values.append(series2)
            if verbose:
                plt.plot(series2, color='red')

            # loop over break sizes
            series_break = None
            beta = None
            for fb in fac_beta:

                # determine the size of the break
                beta = fb*series_std

                # add single mean break: shift the segment before the break
                series_break = series.copy()
                series_break[lags:break_loc] += beta

                # save simulated series with break
                col_names.append('Tau'+str(t)+'Beta'+str(fb))
                col_values.append(series_break)

            # diagnostics for the last break size; skipped when fac_beta is
            # empty because no break series exists in that case
            if verbose and series_break is not None:
                plt.plot(series_break, color='blue')
                print(np.min(series_break), np.max(series_break))
                print(beta)

        # save all simulated series
        df = pd.DataFrame(np.hstack(col_values), columns=col_names)
        df.to_csv(path+'sim'+str(num_ref)+'.csv', na_rep='NA', index=False)
        if verbose:
            print('sim'+str(num_ref))
            plt.show()
            print(df)

In [None]:
def sim_data_2breaks(setting, params, tau, fac_beta, test_size, l, path, lags=1, verbose=False):
    """Simulate series with two mean breaks and write them to CSV files.

    One series is simulated per parameter combination, scaled with
    ``MaxAbsScaler`` and stored as column ``Ref``. For every pair of break
    locations ``(t1, t2)`` in ``tau`` the following columns are appended
    (``<t1><t2>`` is the plain concatenation ``str(t1) + str(t2)``):

    * ``Tau<t1><t2>Ref``: the reference series with every observation before
      ``break_loc2 - lags`` replaced by NaN;
    * ``Tau<t1><t2>Beta<fb>inc`` for every ``fb`` in ``fac_beta``: ``2*beta``
      added to observations ``lags:break_loc1`` and ``beta`` added to
      observations ``break_loc1:break_loc2``;
    * ``Tau<t1><t2>Beta<fb>rev``: ``beta`` added only to observations
      ``break_loc1:break_loc2``;

    where ``beta = fb * std(series[lags:])``. The frame of the k-th parameter
    combination (k counted from 1) is written to ``path + 'sim<k>.csv'`` with
    NaN stored as ``NA``.

    Parameters
    ----------
    setting : str
        ``'AR'``, ``'ARIMA'`` or ``'GARCH'``.
    params : sequence
        ``'AR'``: iterable of coefficients ``phi``; each is passed to
        ``ArmaProcess`` as the AR lag polynomial ``[1, -phi]``.
        ``'ARIMA'``: pair of iterables whose Cartesian product is used; each
        pair ``(a, b)`` is passed to ``SARIMAX.simulate`` as ``[a, 1, b]``.
        ``'GARCH'``: iterable of pairs ``(a, b)`` passed to the arch model's
        ``simulate`` as ``[a, 0, b]``.
    tau : iterable of pairs of float
        Break locations; element ``[0]`` gives the first and element ``[1]``
        the second break, see the formula for the break locations below.
    fac_beta : iterable of float
        Break sizes as multiples of the standard deviation of the scaled
        series (computed on ``series[lags:]``).
    test_size : float
        Enters each break location as ``int((1 - test_size) * t * l + lags)``.
    l : int
        Number of simulated observations. Note that the ``'AR'`` setting
        simulates ``l + lags`` observations while the other settings simulate
        ``l``.
    path : str
        Prefix (directory including trailing separator) of the output files.
    lags : int, optional
        Number of leading observations reserved for lag features.
    verbose : bool, optional
        If true, print diagnostics and plot the series. Plotting uses the
        name ``plt``, which is not imported in the visible import cell and is
        assumed to be ``matplotlib.pyplot`` made available elsewhere.

    Raises
    ------
    ValueError
        If ``setting`` is not one of the supported names and there is at
        least one parameter combination to simulate.
    """

    # zip parameter combinations if necessary
    if setting == 'ARIMA':
        params_list = list(itertools.product(params[0], params[1]))
    else:
        params_list = params

    # loop over all parameter combinations (num_ref numbers the output files)
    for num_ref, param in enumerate(params_list, start=1):

        # simulate data without break
        if setting == 'AR':
            model = ArmaProcess(np.array([1, -param]), np.array([1]))
            series = model.generate_sample(nsample=l+lags)
        elif setting == 'ARIMA':
            model = SARIMAX(np.zeros(l), order=(1,1,1), initialization='diffuse')
            series = model.simulate([param[0], 1, param[1]], l)
        elif setting == 'GARCH':
            model = arch_model(None, mean='Zero', vol='Garch', p=1, q=1, dist="Normal")
            series = np.array(model.simulate([param[0], 0, param[1]], l).data)
        else:
            raise ValueError(
                "setting must be 'AR', 'ARIMA' or 'GARCH', got %r" % (setting,)
            )

        # scale the data to [-1,1]
        series = MaxAbsScaler().fit_transform(series.reshape(-1, 1))
        if verbose:
            plt.plot(series, color='green')
            plt.show()

        # the break size only depends on the reference series, so the
        # standard deviation is computed once per simulated series
        series_std = np.std(series[lags:])

        # columns are collected and assembled into one frame at the end
        # instead of growing a DataFrame with repeated concatenation
        col_names = ['Ref']
        col_values = [series]

        # loop over break locations
        for t in tau:
            tau_label = 'Tau'+str(t[0])+str(t[1])

            # locations at specified proportions of training set
            # (excluding the observations needed for lags)
            break_loc1 = int((1-test_size)*t[0]*l+lags)
            break_loc2 = int((1-test_size)*t[1]*l+lags)

            # save simulated series after the second breakpoint
            series2 = np.full_like(series, np.nan)
            series2[(break_loc2-lags):] = series[(break_loc2-lags):]
            col_names.append(tau_label+'Ref')
            col_values.append(series2)
            if verbose:
                plt.plot(series2, color='red')

            # loop over break sizes
            series_break = None
            series_break2 = None
            for fb in fac_beta:

                # determine the size of the break
                beta = fb*series_std

                # add increasing double mean break
                series_break = series.copy()
                series_break[lags:break_loc1] += 2*beta # Break 1: add constant x 2
                series_break[break_loc1:break_loc2] += beta # Break 2: add constant

                # add mean reverting double mean break
                series_break2 = series.copy()
                series_break2[break_loc1:break_loc2] += beta # constant between break points

                # save simulated series with breaks
                col_names.append(tau_label+'Beta'+str(fb)+'inc')
                col_values.append(series_break)
                col_names.append(tau_label+'Beta'+str(fb)+'rev')
                col_values.append(series_break2)

            # plots for the last break size; skipped when fac_beta is empty
            # because no break series exists in that case
            if verbose and series_break is not None:
                plt.plot(series_break, color='blue')
                plt.plot(series_break2, color='orange')
                plt.show()

        # save all simulated series
        df = pd.DataFrame(np.hstack(col_values), columns=col_names)
        df.to_csv(path+'sim'+str(num_ref)+'.csv', na_rep='NA', index=False)
        if verbose:
            print('sim'+str(num_ref))
            print(df)

Functions to split sample into training and test data:

In [None]:
def generate_time_lags(df, col_name, n_lags):
    """Return ``col_name`` together with its first ``n_lags`` lags.

    The result contains the column ``col_name`` followed by ``lag1`` ...
    ``lag<n_lags>``, where ``lag<k>`` is ``col_name`` shifted down by ``k``
    rows. The first ``n_lags`` rows are removed and the index is renumbered
    from 0. ``df`` itself is left unchanged.
    """
    # work on a copy so the caller's frame is not modified
    lagged = df[[col_name]].copy()

    for k in range(1, n_lags + 1):
        lag_label = f"lag{k}"
        lagged[lag_label] = lagged[col_name].shift(k)

    # drop the leading rows and renumber the index from 0
    trimmed = lagged.iloc[n_lags:]
    return trimmed.reset_index(drop=True)

In [None]:
def feature_label_split(df, target_col):
    """Split ``df`` into features and target.

    Returns a tuple ``(X, y)`` where ``y`` is a one-column DataFrame holding
    ``target_col`` and ``X`` is ``df`` without that column.
    """
    target = df[[target_col]]
    features = df.drop(target_col, axis=1)
    return features, target

In [None]:
def _count_simulations(setting, params):
    """Return the number of ``sim<k>.csv`` files expected for ``setting``."""
    if setting in ('AR', 'GARCH'):
        return len(params)
    if setting == 'ARIMA':
        # one file per element of the Cartesian product of both grids
        return len(params[0]) * len(params[1])
    raise ValueError(
        "setting must be 'AR', 'ARIMA' or 'GARCH', got %r" % (setting,)
    )


def split_data(setting, params, path, test_size, lags=1):
    """Split every simulated series into train/test sets and save them.

    For each file ``path + 'sim<k>.csv'`` (``k = 1 .. number of parameter
    combinations``) and each column in it, lag features are built with
    ``generate_time_lags``, separated from the target with
    ``feature_label_split`` and split chronologically (``shuffle=False``)
    with ``train_test_split``. The four parts are stored with ``np.savez`` in
    ``path + 'sim<k>_<column>.npz'`` under the keys ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``.

    For columns whose name contains both ``'Tau'`` and ``'Ref'`` only the
    training rows in which the target and all lag features are present (not
    NaN) are kept, so that ``X_train`` and ``y_train`` always have the same
    number of rows, for any ``lags``.

    Parameters
    ----------
    setting : str
        ``'AR'``, ``'ARIMA'`` or ``'GARCH'``; determines how the number of
        files is derived from ``params``.
    params : sequence
        ``'AR'`` / ``'GARCH'``: sized collection, one file per element.
        ``'ARIMA'``: pair of sized collections, one file per combination.
    path : str
        Prefix (directory including trailing separator) of input and output
        files.
    test_size : float or int
        Passed on to ``train_test_split``.
    lags : int, optional
        Number of lag features to create.

    Raises
    ------
    ValueError
        If ``setting`` is not one of the supported names.
    """

    # set number of simulations per setting
    num_sim = _count_simulations(setting, params)

    # loop over all DGP settings (files are numbered from 1)
    for file_no in range(1, num_sim + 1):

        # load simulated data
        sim = pd.read_csv(f'{path}sim{file_no}.csv', sep=',', na_values='NA')

        # loop over all break settings
        for col in sim.columns:

            # get splits
            df_generated = generate_time_lags(sim, col, lags)
            X, y = feature_label_split(df_generated, col)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, shuffle=False
            )

            if 'Tau' in col and 'Ref' in col:
                # keep only the training rows where target and every lag are
                # observed; X_train and y_train share the same index, so one
                # mask keeps them aligned
                complete = X_train.notna().all(axis=1) & y_train.notna().all(axis=1)
                X_train = X_train.loc[complete]
                y_train = y_train.loc[complete]

            # save splits
            np.savez(
                f'{path}sim{file_no}_{col}.npz',
                X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
            )