In [None]:
'''
Project
Forecasting Stock Market Indices Using the Recurrent Neural Network Based Hybrid Models: CNN-LSTM, GRU-CNN, and Ensemble Models
'''

In [None]:
# Run the `nvidia-smi` shell command through IPython's `!` escape to show the GPU status.
!nvidia-smi

In [None]:
### To USE gpu
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# Continue only when TensorFlow reports the first GPU; anything else aborts the notebook.
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
import os
# Expose only GPU 0 to CUDA-based libraries.
# NOTE(review): an earlier cell already asked TensorFlow for its GPU device, so this
# variable may be set too late to have any effect in this kernel — confirm, and consider
# setting it before TensorFlow is first used.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings, os, shutil, random, sys

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam, RMSprop, Adadelta, Adagrad, Ftrl
from tensorflow.keras import initializers, metrics
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.layers import BatchNormalization, Dropout
from tensorflow.keras.layers import MaxPool1D, GlobalMaxPooling1D
from tensorflow.keras import layers


SEED = 777
# Seed every random number generator the notebook relies on so runs are comparable:
# NumPy, Python's `random`, and TensorFlow's global generator (used by any op or
# initializer that is not given an explicit seed).
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

warnings.filterwarnings(action='ignore')
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here does not
# change hashing in the already-running kernel; it is only inherited by child processes.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Record the runtime versions next to the results.
print("python version:",sys.version)
print("numpy version:", np.__version__)
print("tensorflow version:", tf.__version__)

In [None]:
# Seeded Glorot-uniform initializer, passed as `kernel_initializer` by cnn_lstm and gru_cnn.
KERNEL_INIT = initializers.glorot_uniform(seed=SEED)
# Seeded orthogonal initializer, passed as `recurrent_initializer` to the LSTM/GRU layers
# built by cnn_lstm and gru_cnn.
Re_INIT = initializers.Orthogonal(gain=1.0, seed=SEED)
        
def last_time_step_mse(y_true, y_pred):
    """Mean squared error restricted to the last entry along axis 1.

    Both tensors are sliced with ``[:, -1]`` before being handed to
    ``metrics.mean_squared_error``.
    """
    final_true = y_true[:, -1]
    final_pred = y_pred[:, -1]
    return metrics.mean_squared_error(final_true, final_pred)
def last_time_step_mae(y_true, y_pred):
    """Mean absolute error restricted to the last entry along axis 1.

    Both tensors are sliced with ``[:, -1]`` before being handed to
    ``metrics.mean_absolute_error``.
    """
    final_true = y_true[:, -1]
    final_pred = y_pred[:, -1]
    return metrics.mean_absolute_error(final_true, final_pred)

In [None]:
'''
    Load Pickled Data
    :param path: location of the pickle file
    :return: the loaded pickle data
'''
import pickle

def save_data(data, path):
    """Pickle ``data`` into the file at ``path``, overwriting any existing file."""
    with open(path, 'wb') as out_file:
        pickle.dump(data, out_file)
        
def load_data(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while deserialising,
    so this must only be pointed at files from a trusted source.
    """
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

# Load the three pickled objects that get_preprocess_data indexes by feature code and by
# "<data_code>_<duration>..." keys. The paths are relative to the working directory.
# NOTE(review): these files are unpickled without validation — they must be trusted.
PREPROC_ALL_TRAIN_DATA = load_data('./save_files3/PREPROC_ALL_TRAIN_DATA.pickle')
PREPROC_ALL_TEST_DATA = load_data('./save_files3/PREPROC_ALL_TEST_DATA.pickle')
SCALER = load_data('./save_files3/SCALER.pickle')

In [None]:
def make_dataset(x, y, input_wd, pred_wd):
    """Cut ``x`` and ``y`` into aligned sliding windows.

    For every start offset ``s`` the input sample is ``x[s:s + input_wd]`` and the
    target is the ``pred_wd`` entries of ``y`` that immediately follow that window.

    :param x: sliceable sequence of inputs
    :param y: sliceable sequence of targets, indexed like ``x``
    :param input_wd: length of each input window
    :param pred_wd: length of each target window
    :return: ``(inputs, targets)`` as NumPy arrays; both are empty when ``x`` is too
             short to hold a single input-plus-target window
    """
    window_starts = range(len(x) - input_wd - pred_wd + 1)
    inputs = [x[s:s + input_wd] for s in window_starts]
    targets = [y[s + input_wd:s + input_wd + pred_wd] for s in window_starts]
    return np.array(inputs), np.array(targets)

In [None]:
'''
    Load preprocessed data
'''
def get_preprocess_data(feature_code = "OHMLVC", data_code = "DJI", duration = "P0019", dataset_split="T10"):
    """Look up one preprocessed train/test split and its scalers.

    Keys are assembled as ``"<data_code>_<duration>_<dataset_split>_<suffix>"`` with the
    suffixes ``TRX``/``TRY`` (train) and ``TEX``/``TEY`` (test); the scaler entry is keyed
    by ``"<data_code>_<duration>"``.

    :param feature_code: first-level key into the train, test and scaler containers
    :param data_code: first component of the keys
    :param duration: second component of the keys
    :param dataset_split: third component of the data keys
    :return: tuple ``(train_x, train_y, test_x, test_y, scaler_x, scaler_y)``
    :raises KeyError: if any of the requested keys is missing
    """
    scaler_key = f"{data_code}_{duration}"
    split_prefix = f"{scaler_key}_{dataset_split}"

    train_store = PREPROC_ALL_TRAIN_DATA[feature_code]
    train_x = train_store[f"{split_prefix}_TRX"]
    train_y = train_store[f"{split_prefix}_TRY"]

    test_store = PREPROC_ALL_TEST_DATA[feature_code]
    test_x = test_store[f"{split_prefix}_TEX"]
    test_y = test_store[f"{split_prefix}_TEY"]

    scalers = SCALER[feature_code][scaler_key]
    return (train_x, train_y, test_x, test_y, scalers["x"], scalers["y"])

In [None]:
'''
    CNN_LSTM
'''
def cnn_lstm(in_window, n_feature, x, y, x_test, y_test, pred_wd, EPOCHS, OPT, lr, verbose):
    """Train a Conv1D -> LSTM model and predict on the test windows.

    :param in_window: number of time steps in each input window
    :param n_feature: number of features per time step
    :param x: training inputs
    :param y: training targets
    :param x_test: test inputs passed to ``model.predict``
    :param y_test: test targets; only ``y_test[:, -1]`` is returned
    :param pred_wd: number of units in the output Dense layer
    :param EPOCHS: maximum number of training epochs
    :param OPT: ``"ADAM"`` selects Adam, any other value selects RMSprop
    :param lr: learning rate for the optimizer
    :param verbose: verbosity forwarded to ``model.fit``
    :return: ``(y_test[:, -1], y_pred[:, -1])`` where ``y_pred`` comes from the
             checkpointed weights with the lowest validation loss
    """
    model = Sequential()
    model.add(Input(shape=(in_window, n_feature)))

    model.add(Conv1D(
        filters=32,
        kernel_size=3,
        strides=1,
        padding="causal",
        activation="relu"
    ))
    model.add(LSTM(128, return_sequences=False, stateful=False,
                   kernel_initializer=KERNEL_INIT,
                   recurrent_initializer=Re_INIT,
                   bias_initializer='zeros'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2, seed=777))
    model.add(Dense(pred_wd, kernel_initializer=KERNEL_INIT, bias_initializer="zeros", activation='relu'))

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
    # rejected by newer Keras releases.
    if OPT == "ADAM":
        opt = Adam(learning_rate=lr)
    else:
        opt = RMSprop(learning_rate=lr)

    model.compile(loss=tf.keras.losses.Huber(), optimizer=opt,
                  metrics=[last_time_step_mse, last_time_step_mae])

    earlystopping = EarlyStopping(monitor='val_loss', patience=30, mode='min')
    filename = os.path.join('cnn_lstm'+str(in_window), 'ckeckpointer.ckpt')
    checkpoint = ModelCheckpoint(filename,
                                 save_weights_only=True,
                                 save_best_only=True,
                                 monitor='val_loss',
                                 verbose=0)

    # shuffle=False keeps the windows in their original order during training.
    model.fit(x, y, batch_size=32, epochs=EPOCHS, validation_split=0.1,
              shuffle=False, verbose=verbose, callbacks=[earlystopping, checkpoint])

    # Restore the best checkpoint before predicting instead of using the last epoch.
    model.load_weights(filename)
    y_pred = model.predict(x_test)
    return y_test[:,-1], y_pred[:,-1]

In [None]:
'''
    GRU_CNN
'''
def gru_cnn(in_window, n_feature,x,y,x_test, y_test, pred_wd, EPOCHS, OPT, lr, verbose):
    """Train a GRU -> Conv1D model and predict on the test windows.

    :param in_window: number of time steps in each input window
    :param n_feature: number of features per time step
    :param x: training inputs
    :param y: training targets
    :param x_test: test inputs passed to ``model.predict``
    :param y_test: test targets; only ``y_test[:, -1]`` is returned
    :param pred_wd: number of units in the output Dense layer
    :param EPOCHS: maximum number of training epochs
    :param OPT: ``"ADAM"`` selects Adam, any other value selects RMSprop
    :param lr: learning rate for the optimizer
    :param verbose: verbosity forwarded to ``model.fit``
    :return: ``(y_test[:, -1], y_pred[:, -1])`` where ``y_pred`` comes from the
             checkpointed weights with the lowest validation loss
    """
    model = Sequential()
    # Declare the input with an explicit Input layer, the same way cnn_lstm does,
    # rather than through the `input_shape` argument of the first layer.
    model.add(Input(shape=(in_window, n_feature)))

    # return_sequences=True so the convolution sees the GRU output at every time step.
    model.add(GRU(128,
                  kernel_initializer=KERNEL_INIT,
                  recurrent_initializer=Re_INIT,
                  return_sequences=True,
                  bias_initializer='zeros'))

    model.add(Conv1D(
        filters=32,
        kernel_size=3,
        strides=1,
        padding="causal",
        activation="relu"
    ))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(10, kernel_initializer=KERNEL_INIT, bias_initializer="zeros", activation='relu'))
    model.add(Dropout(0.2, seed=777))
    model.add(Dense(pred_wd, kernel_initializer=KERNEL_INIT, bias_initializer="zeros", activation='relu'))

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
    # rejected by newer Keras releases.
    if OPT == "ADAM":
        opt = Adam(learning_rate=lr)
    else:
        opt = RMSprop(learning_rate=lr)

    model.compile(loss=tf.keras.losses.Huber(), optimizer=opt,
                  metrics=[last_time_step_mse, last_time_step_mae])

    earlystopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')
    filename = os.path.join('gru_cnn'+str(in_window), 'ckeckpointer.ckpt')
    checkpoint = ModelCheckpoint(filename,
                                 save_weights_only=True,
                                 save_best_only=True,
                                 monitor='val_loss',
                                 verbose=0)

    # shuffle=False keeps the windows in their original order during training.
    model.fit(x, y, batch_size=32, epochs=EPOCHS, validation_split=0.1,
              shuffle=False, verbose=verbose, callbacks=[earlystopping, checkpoint])

    # Restore the best checkpoint before predicting instead of using the last epoch.
    model.load_weights(filename)
    y_pred = model.predict(x_test)
    return y_test[:,-1], y_pred[:,-1]

In [None]:
'''
    Ensemble
'''
class MakeModel(tf.keras.Model):
    """Ensemble model that averages an LSTM, a GRU and a SimpleRNN branch.

    The three recurrent layers (128 units each) read the same input; their outputs are
    averaged element-wise, passed through dropout and a 32-unit ReLU Dense layer, and
    mapped to ``pw`` outputs by a final ReLU Dense layer.

    Required keyword arguments:
        in_shape: input shape forwarded to the recurrent layers as ``input_shape``
        pw: number of units in the output layer
    """
    # Class-scope aliases, reached as self.tf_layer / self.tf_init in __init__.
    from tensorflow.keras import layers as tf_layer
    from tensorflow.keras import initializers as tf_init

    def __init__(self, **kwargs):
        """Create the layers; raises KeyError if ``in_shape`` or ``pw`` is missing."""
        super(MakeModel, self).__init__()

        # kwargs : dictionary
        SEED = 777
        in_shape = kwargs['in_shape']
        pw = kwargs['pw']
        self.lstm = self.tf_layer.LSTM(
            units = 128,
            input_shape = in_shape,
            activation = "tanh",
            kernel_initializer = self.tf_init.glorot_uniform(seed=SEED),
            recurrent_initializer = self.tf_init.Orthogonal(gain=1.0, seed=SEED),
            stateful = False
            )

        self.gru = self.tf_layer.GRU(
            units = 128,
            input_shape = in_shape,
            kernel_initializer = self.tf_init.glorot_uniform(seed=SEED),
            recurrent_initializer = self.tf_init.Orthogonal(gain=1.0, seed=SEED),
            bias_initializer = 'zeros',
            stateful = False
        )

        self.rnn = self.tf_layer.SimpleRNN(
            units = 128,
            input_shape = in_shape,
            kernel_initializer = self.tf_init.glorot_uniform(seed=SEED),
            recurrent_initializer = self.tf_init.Orthogonal(gain=1.0, seed=SEED),
            bias_initializer='zeros',
        )

        self.drop = self.tf_layer.Dropout(0.2, seed=SEED)

        self.dense = self.tf_layer.Dense(
            units = 32,
            activation='relu',
            kernel_initializer= self.tf_init.glorot_uniform(seed=SEED),
            bias_initializer="zeros"
        )

        # Seeded like every other layer so the output weights start from the same values
        # on every run (previously this was the only layer left unseeded).
        self.out = self.tf_layer.Dense(
            units = pw,
            activation='relu',  # TODO: try switching this to linear
            kernel_initializer= self.tf_init.glorot_uniform(seed=SEED),
            bias_initializer="zeros"
        )

    def call(self, inputs):
        """Forward pass: average the three recurrent outputs, then dropout -> dense -> out."""
        x1 = self.lstm(inputs)
        x2 = self.gru(inputs)
        x3 = self.rnn(inputs)
        x = (x1+x2+x3)/3
        x = self.drop(x)
        x = self.dense(x)
        x = self.out(x)
        return x

    def build_graph(self, input_shape):
        """Build the model for ``input_shape`` (batch dimension first) and trace ``call`` once.

        ``call`` is run on a symbolic ``tf.keras.Input`` created from ``input_shape[1:]``;
        its result is discarded.
        """
        input_shape_wo_batch = input_shape[1:]
        self.build(input_shape)
        inputs = tf.keras.Input(shape=input_shape_wo_batch)
        _ = self.call(inputs)

In [None]:
# Compile Model
def compile_model(model, opt, lr=0.0005):
    """Compile ``model`` with Huber loss and the two last-time-step metrics.

    :param model: Keras model to compile (compiled in place)
    :param opt: either an optimizer name — ``"ADAM"`` selects Adam, any other string
                selects RMSprop — or an already constructed optimizer instance, which
                is used as-is. Previously an instance never compared equal to
                ``"ADAM"`` and was silently replaced by RMSprop at 0.0005.
    :param lr: learning rate used when ``opt`` is given as a name
    """
    if isinstance(opt, str):
        if opt == "ADAM":
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        optimizer = opt

    model.compile(
        optimizer=optimizer,
        loss=tf.losses.Huber(),
        metrics = [last_time_step_mse,last_time_step_mae]
    )
 

def ensemble(iw, n_feature,x,y,x_test, y_test, pw, EPOCHS, OPT, lr, verbose):
    """Train the MakeModel ensemble and predict on the test windows.

    :param iw: number of time steps in each input window
    :param n_feature: number of features per time step
    :param x: training inputs
    :param y: training targets
    :param x_test: test inputs passed to ``model.predict``
    :param y_test: test targets; only ``y_test[:, -1]`` is returned
    :param pw: number of units in the model's output layer
    :param EPOCHS: maximum number of training epochs
    :param OPT: ``"ADAM"`` selects Adam, any other value selects RMSprop
    :param lr: learning rate for the optimizer
    :param verbose: verbosity forwarded to ``model.fit``
    :return: ``(y_test[:, -1], y_pred[:, -1])`` where ``y_pred`` comes from the
             checkpointed weights with the lowest validation loss
    """
    earlystopping = EarlyStopping(monitor='val_loss', patience=10, mode='min')
    filename = os.path.join('ensemble'+str(iw), 'ckeckpointer.ckpt')
    checkpoint = ModelCheckpoint(filename,
                                 save_weights_only=True,
                                 save_best_only=True,
                                 monitor='val_loss',
                                 verbose=0)

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
    # rejected by newer Keras releases.
    if OPT == "ADAM":
        opt = Adam(learning_rate=lr)
    else:
        opt = RMSprop(learning_rate=lr)

    # Size the model from the n_feature argument rather than a notebook-level global,
    # so the function does not depend on cell execution order.
    model = MakeModel(in_shape=[iw, n_feature], pw=pw)

    # Compile here with the optimizer instance built above so that both OPT and lr
    # are actually honoured.
    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.Huber(),
        metrics=[last_time_step_mse, last_time_step_mae]
    )

    # shuffle=False keeps the windows in their original order during training.
    model.fit(x, y, batch_size=32, epochs=EPOCHS, validation_split=0.1,
              shuffle=False, verbose=verbose, callbacks=[earlystopping, checkpoint])

    # Restore the best checkpoint before predicting instead of using the last epoch.
    model.load_weights(filename)
    y_pred = model.predict(x_test)
    return y_test[:,-1], y_pred[:,-1]


In [None]:
# Feature-set code: used as the `feature_code` lookup key for the preprocessed data, and
# its length is passed to the model builders as the number of input features.
feature = 'OHLV'

In [None]:
from timeit import default_timer as timer
from datetime import timedelta
start = timer()

# Experiment configuration.
OPT = 'RMSProp'  # RMSProp, ADAM
lr = 0.0005
verbose=0
EPOCHS = 1  # NOTE(review): a single epoch looks like a smoke-test setting — confirm before a real run
iws = [5,21,42]  # input window lengths to evaluate
pw = 5           # prediction window length

# index code -> list with one [true, cnn_lstm, gru_cnn, ensemble] entry per input window
all_result = {}


idxs = ["DJI",
        "US500",
        "DAX"]
for index in idxs:
    trd_x, trd_y, ted_x, ted_y, scaler_x, scaler_y = get_preprocess_data(
                                                                    feature_code=feature,
                                                                    data_code=index,
                                                                    duration='P0019',
                                                                    dataset_split='T20'
                                                                    )
    temp_result = []
    for iw in iws:
        train_xx, train_yy = make_dataset(trd_x, trd_y, iw, pw)
        test_xx, test_yy = make_dataset(ted_x, ted_y, iw, pw)
        true, cnnLstm = cnn_lstm(iw, len(feature), train_xx, train_yy , test_xx, test_yy, pw, EPOCHS, OPT, lr, verbose)
        true, gruCnn = gru_cnn(iw, len(feature),train_xx, train_yy ,test_xx, test_yy, pw, EPOCHS, OPT, lr, verbose)
        true, esb = ensemble(iw, len(feature), train_xx, train_yy , test_xx, test_yy, pw, EPOCHS, OPT, lr, verbose)
        temp_result.append([true, cnnLstm, gruCnn, esb])
    # Store the per-index results once, after every input window has been evaluated.
    all_result[index] = temp_result

end = timer()
print("elapsed time:", timedelta(seconds=end-start))

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np


fig = plt.figure(figsize=(36,24), dpi=50)
mpl.rc('axes', labelsize=20)
mpl.rc('xtick', labelsize=20)
mpl.rc('ytick', labelsize=20)
mpl.rcParams['axes.linewidth'] = 2

# Display titles for the index codes; a code without an entry is shown unchanged.
display_names = {'DJI': 'DOW30', 'US500': 'S&P500', 'DAX': 'DAX30'}

# Panels are filled row by row in the order "for each index, for each look-back", so the
# grid needs one row per index and one column per look-back window. (With the two
# arguments the other way round the layout is only right when both counts are equal.)
n_rows = len(all_result)
n_cols = len(iws)

panel = 0
for key in all_result:
    title = display_names.get(key, key)
    for num, value in enumerate(all_result[key]):
        panel += 1
        plt.subplot(n_rows, n_cols, panel)

        # value is [true, cnn_lstm, gru_cnn, ensemble] for look-back iws[num].
        plt.plot(value[0], "k-", linewidth=2.7, label=title)
        plt.plot(value[1], "r-", linewidth=2.7, label="look-back"+str(iws[num])+"_CNN_LSTM")
        plt.plot(value[2], "g-", linewidth=2.7, label="look-back"+str(iws[num])+"_GRU_CNN")
        plt.plot(value[3], "b-", linewidth=2.7, label="look-back"+str(iws[num])+"_Ensemble")
        plt.title(title, fontsize=30, pad=20)
        plt.xlabel('Number of Testing Days', fontsize=27, labelpad=20)
        plt.ylabel('Close Price', fontsize=27, labelpad=10)
        plt.legend(fontsize=20)
        plt.margins(x=0.005, y=0)


plt.tight_layout(h_pad = 7, w_pad=7)
plt.show()