In [None]:
#Last edited 5/11/2023

# PURPOSE: 
# Train a model using ARIMA and 1D-CNN

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import tensorflow as tf
from datetime import timedelta
from scipy.special import expit, logit

from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import KFold, TimeSeriesSplit, GridSearchCV
from tensorflow.keras.optimizers.legacy import Adam

import keras.initializers
from keras.layers import Dense, Layer, LSTM
from keras.models import Sequential
from keras.models import load_model
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from keras.utils import to_categorical

%matplotlib inline

In [None]:
# THINGS TO CHANGE FOR DIFFERENT PREDICTION HORIZONS
# Pick exactly one horizon; everything else in this cell is derived from it.

horizon = 'SCALP'
# horizon = 'SWING'
# horizon = 'POSITION'

# horizon -> (n_steps, n_steps_ahead)
#   n_steps       : look-back window length (a chosen window size rather than
#                   the result of a statistical test)
#   n_steps_ahead : forecasting horizon
HORIZON_SETTINGS = {
    'POSITION': (20, 2),
    'SWING': (30, 3),
    'SCALP': (30, 4),
}

# Fail here, with a clear message, instead of with a NameError on `n_steps`
# further down when the horizon is misspelled.
if horizon not in HORIZON_SETTINGS:
    raise ValueError('Unknown horizon {!r}; expected one of {}'.format(horizon, sorted(HORIZON_SETTINGS)))

n_steps, n_steps_ahead = HORIZON_SETTINGS[horizon]
window_size = n_steps


print('Horizon is {} with window size {}, forecasting n_steps ahead {}'.format(horizon, n_steps, n_steps_ahead))

In [None]:
# Load data
# The CSV for the selected horizon is read from the `data/` folder that sits
# next to the current working directory's parent.
BASE_PATH = os.path.dirname(os.getcwd())
DATA_DIR = 'data/'
DATA_PATH = os.path.join(BASE_PATH, DATA_DIR)
FILE_NM = 'data_1d_{}.csv'.format(horizon)

print('LOADING DATA')
# Only these four columns are used by the rest of the notebook.
wanted_columns = ["prediction", "date", "close", "label"]
data = pd.read_csv(os.path.join(DATA_PATH, FILE_NM), sep=",")[wanted_columns]
print("Rows in df :", len(data))

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# One-hot encode `label`; with drop_first=True only one indicator column is kept,
# and `label_SHORT` is exposed again under the name `label`.
# LONG = 0, SHORT = 1
encoded = pd.get_dummies(data, columns=['label'], drop_first=True)
df = encoded.rename(columns={"label_SHORT": "label"})

In [None]:
# Preview the encoded frame (the `label` column now holds the SHORT indicator).
df.head()

In [None]:
# Missing closing prices would cause errors further down, so report how many
# there are, both as a count and as a share of all rows.

close_is_missing = np.isnan(df['close'])
nof_missing_values = sum(close_is_missing)
missing_share = nof_missing_values*100/len(df)

print(nof_missing_values, 'observations are missing.')
print('This is {:.3f}% of the total.'.format(missing_share))


# Optional repair step (disabled): replace gaps with adjacent values.
# df = df.fillna(method="backfill")
# nof_missing_values = sum(np.isnan(df['USD']))
# print('Now', nof_missing_values, 'observations are missing.')

In [None]:
### Splitting the time series into training and testing sets
# The first 80% of the series is used for training and the remainder for testing.
# The test set must lie in the future of the training set to avoid look-ahead
# bias, and random sampling is not an option because it would destroy the
# auto-correlation structure.

use_features = ['close']  # model input column(s)
target = ['label']  # column holding the class label

# Make sure the splits are the same as 2D CNN
split = int(np.floor(0.8*len(df)))

# The test frames start `n_steps` rows before the split so that the first
# look-back window ends right at the first test date.
test_start = split - n_steps

# inputs
df_train = df[use_features].iloc[:split]
df_test = df[use_features].iloc[test_start:]

# labels (targets)
train_label = df['label'].iloc[:split]
test_label = df['label'].iloc[test_start:]  # for CNN; the ARIMA variant would start at `split`

# dates
train_date = df['prediction'].iloc[:split]
test_date = df['prediction'].iloc[split:]

In [None]:
### Scaling
# Scaling avoids numerical difficulties when fitting the model and, with several
# features, prevents one feature from dominating because of its scale.
#
# To avoid look-ahead bias, all statistics are computed on the training set only
# and the very same transformation is then applied to the test set.
#
# Note: for a multivariate series each variable would need its own statistics;
# here `df_train` has a single column, whose statistics are extracted with
# `.iloc[0]` (calling float() directly on a one-element Series is deprecated in
# recent pandas).
mu = float(df_train.mean().iloc[0])
sigma = float(df_train.std().iloc[0])
min_ = float(df_train.min().iloc[0])
max_ = float(df_train.max().iloc[0])

# A constant training series would make min-max scaling divide by zero and
# silently fill both frames with NaN/inf.
if max_ == min_:
    raise ValueError('Training series is constant; min-max scaling is undefined')


def normalize_input(x):
    """Min-max scale `x` with the training-set minimum and maximum."""
    return (x - min_) / (max_-min_)


def stdize_input(x):
    """Standardize `x` with the training-set mean and standard deviation."""
    return (x - mu) / sigma


# df_train = df_train.apply(stdize_input)
df_train = df_train.apply(normalize_input)
df_test = df_test.apply(normalize_input)

In [None]:
# ARIMA

In [None]:
# ARIMA
# https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.auto_arima.html

# Search for the best ARIMA order on the training series.
import pmdarima as pm
from pmdarima import model_selection
from matplotlib import pyplot as plt


# The scaled closes are passed through expit before modelling; the forecasts are
# mapped back with logit when the results are assembled.
train = expit(df_train['close'].to_numpy())
test = expit(df_test['close'].to_numpy())

model = pm.auto_arima(train,
                      max_p = n_steps, max_q = n_steps, #max lags are same as other models
                      trace=True, error_action='ignore',
                      # The keyword is `stepwise`; the previous spelling `step_wise`
                      # is not an auto_arima parameter and ended up in **fit_args.
                      stepwise=True,
                      suppress_warnings=True,
                      stationary=False, # the series is not assumed to be stationary
                      test='adf'
                      )
# `history` is the fitted estimator used by the diagnostics cell.
history = model.fit(train)

In [None]:
# Diagnostic plots for the fitted ARIMA model.
history.plot_diagnostics()

In [None]:
# Rolling (sliding-window) ARIMA prediction over the test series.
# Adapted from the pmdarima cross-validation example.
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>

import pmdarima as pm
from pmdarima import model_selection
from matplotlib import pyplot as plt

print("pmdarima version: %s" % pm.__version__)

# Best (p, d, q) order found by auto_arima for each horizon. Update the entry
# for a horizon if the search above selects a different model.
ARIMA_ORDERS = {
    'POSITION': (1, 1, 0),  # ARIMA(1,1,0)(0,0,0)[0]
    'SWING': (3, 1, 0),     # ARIMA(3,1,0)(0,0,0)[0]
    'SCALP': (2, 1, 1),     # ARIMA(2,1,1)(0,0,0)[0]
}
# `horizon` may already carry the '_1D' suffix if the CNN cells were run first.
arima_order = ARIMA_ORDERS[horizon.split('_')[0]]

y = test
est = pm.ARIMA(order=arima_order,
               seasonal_order=(0, 0, 0, 0),
               suppress_warnings=True)
# Each fold fits on `n_steps` observations and forecasts the next `n_steps_ahead`;
# overlapping forecasts for the same point are averaged.
cv = model_selection.SlidingWindowForecastCV(window_size=n_steps, step=1, h=n_steps_ahead)
predictions = model_selection.cross_val_predict(
    est, y, cv=cv, verbose=2, averaging="mean") #"median"

# plot the predictions over the original series
x_axis = np.arange(y.shape[0])
n_test = predictions.shape[0]

plt.plot(x_axis, y, alpha=0.75, c='b')
plt.plot(x_axis[-n_test:], predictions, alpha=0.75, c='g')  # Forecasts
plt.title("Rolling ARIMA forecasts ({})".format(horizon))
plt.show()

In [None]:
# Number of rolling ARIMA forecasts produced by the cross-validation above.
len(predictions)

In [None]:
# Number of test labels; compare with len(predictions) before pairing the two.
len(test_label)

In [None]:
# The rolling CV only returns forecasts for observations that fall inside some
# test window, so the leading look-back rows of the test series have no
# forecast. Pair each forecast with the label of the same observation by taking
# the trailing labels; stacking the full `test_label` would fail on a length
# mismatch.
n_forecasts = len(predictions)
aligned_labels = np.asarray(test_label)[-n_forecasts:]

# The series was modelled on the expit scale, so logit maps forecasts back.
true_pred = pd.DataFrame(np.column_stack((aligned_labels, logit(predictions))), columns=["true", "predicted"])
true_pred.head()

In [None]:
# Write the ARIMA true/predicted pairs to the data folder (path is relative to the working directory).
true_pred.to_csv('../data/true_pred_ARIMA_{}_1D.csv'.format(horizon), index=False)

In [None]:
# CNN

In [None]:
# Using Keras to implement a 1D convolutional neural network (CNN) for timeseries prediction.

import numpy as np
import tensorflow as tf
from keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from IPython.display import clear_output
import keras_tuner
from keras_tuner import Hyperband, GridSearch
from tensorflow.keras import regularizers

# Summarise arrays with more than 25 elements when printing, to keep cell output short.
np.set_printoptions(threshold=25)

In [None]:
def make_timeseries_instances(timeseries, window_size, labels):
    """Cut a time series into overlapping windows paired with the next label.

    timeseries: 1D sequence (or 2D array with one row per observation).
    window_size (int): number of consecutive observations per window;
        must satisfy 0 < window_size < number of observations.
    labels: one label per observation, aligned with `timeseries`.

    Returns (X, y): `X` has shape (n_obs - window_size, window_size, n_variables);
    row i of `y` is the label of the observation right after window i.

    Raises ValueError when `window_size` is out of range.
    """
    # A flat vector arrives as a single row; turn it into a column.
    series = np.atleast_2d(timeseries)
    if series.shape[0] == 1:
        series = series.T
    n_obs = series.shape[0]

    if window_size <= 0 or window_size >= n_obs:
        raise ValueError('Please set 0 < window size < timeseries length')

    # One window per admissible start position; the last observation is never
    # the start of a window because it has no following label.
    windows = []
    for first in range(n_obs - window_size):
        windows.append(series[first:first + window_size])

    # The CNN input layer expects a 3D array (samples, window, variables).
    X = np.atleast_3d(np.array(windows))

    # Same column treatment for the labels, then drop the first `window_size`
    # entries so that each window lines up with the label that follows it.
    targets = np.atleast_2d(labels)
    if targets.shape[0] == 1:
        targets = targets.T

    return X, targets[window_size:]


# def make_CNN(hp, window_size, filter_length,  nb_filter=4, nb_input_series=1, nb_outputs=1):
def make_CNN(hp, window_size, nb_input_series=1, nb_outputs=1):
    """Build (without compiling) a one-layer 1D CNN whose sizes are tuned via `hp`.

    hp: keras-tuner hyperparameter container; this function registers the
        choices 'regularization', 'learning_rate', 'nb_filter' (number of
        convolution filters) and 'filter_length' (kernel size) on it.
    window_size (int): number of observations in each input sequence
    nb_input_series (int): number of features of the input timeseries (1 for a univariate timeseries)
    nb_outputs (int): number of units in the output layer

    Returns the uncompiled Sequential model: Conv1D -> Flatten -> Dense(sigmoid).
    """
    reg_strength = hp.Choice('regularization', values=[1e-4, 1e-5])
    # Registered here as well as in build_model; the value is not used in this function.
    hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    n_kernels = hp.Choice('nb_filter', values=[2, 3, 4, 5])
    kernel_len = hp.Choice('filter_length', values=[2, 3, 4, 5])

    # The convolution learns `n_kernels` filters, each spanning `kernel_len`
    # time steps across all input series.
    conv = Conv1D(filters=n_kernels,
                  kernel_size=kernel_len,
                  activation='relu',
                  input_shape=(window_size, nb_input_series))
    flatten = Flatten()
    # Sigmoid output for classification; the same strength is used for every
    # regularizer on this layer.
    head = Dense(units=nb_outputs,
                 activation='sigmoid',
                 kernel_regularizer=regularizers.L1L2(l1=reg_strength, l2=reg_strength),
                 bias_regularizer=regularizers.L2(reg_strength),
                 activity_regularizer=regularizers.L2(reg_strength))

    return Sequential([conv, flatten, head])


def build_model(hp):
    """Hypermodel entry point for keras-tuner: build and compile the 1D CNN.

    hp: keras-tuner hyperparameter container; 'learning_rate' is chosen here and
        the architecture choices are made inside `make_CNN`.

    Returns the compiled model. Uses the notebook-level `window_size`.
    """
    model = make_CNN(hp, window_size, nb_input_series=1, nb_outputs=1)

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # The network ends in a sigmoid, i.e. it outputs probabilities, so the loss
    # and the metrics must NOT treat the outputs as logits. With from_logits=True
    # a second sigmoid would be applied on top of the first.
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy',
                           tf.keras.metrics.BinaryCrossentropy(from_logits=False),
                           tf.keras.metrics.AUC(from_logits=False)]
                  )

    return model


def get_model_name(k, horizon, image_type):
    """Return the checkpoint file name for fold `k`: model_<horizon>_<image_type>_<k>.h5"""
    stem = '_'.join(['model', horizon, image_type, str(k)])
    return stem + '.h5'


def get_vars(horizon):
    """Return the CNN training settings for a prediction horizon.

    horizon (str): 'POSITION', 'SWING' or 'SCALP'. The already-suffixed names
        ('POSITION_1D', ...) are accepted too, so a cell that rebinds `horizon`
        to the returned name can be re-run without crashing.

    Returns the tuple
        (horizon_name, splits, BATCH_SIZE, val_size, max_train_size, EPOCHS, image_type)

    Raises ValueError for an unknown horizon.
    """
    suffix = '_1D'
    base = horizon[:-len(suffix)] if horizon.endswith(suffix) else horizon

    # base horizon -> (splits, BATCH_SIZE, val_size, max_train_size)
    per_horizon = {
        'POSITION': (3, 64, 150, None),
        'SWING': (6, 64, 450, None),
        'SCALP': (10, 128, 1000, 5000),
    }
    if base not in per_horizon:
        raise ValueError('Unknown horizon {!r}; expected one of {}'.format(horizon, sorted(per_horizon)))

    splits, BATCH_SIZE, val_size, max_train_size = per_horizon[base]
    horizon_name = base + suffix

    # Shared by all horizons.
    EPOCHS = 50
    image_type = '1D'

    return horizon_name, splits, BATCH_SIZE, val_size, max_train_size, EPOCHS, image_type

In [None]:
# Turn the scaled training closes into windows of `window_size` observations,
# each paired with the label that follows the window.
x_train, y_train = make_timeseries_instances(list(df_train['close']), window_size, list(train_label))
X, y = x_train, y_train

# All horizon-dependent training settings come from get_vars.
# Note: this rebinds `horizon` to the suffixed name returned by get_vars.
horizon, splits, BATCH_SIZE, val_size, max_train_size, EPOCHS, image_type = get_vars(horizon)

print('Horizon: {}, Splits: {}, BATCH_SIZE: {}, val_size: {}, max_train_size:{}, EPOCHS: {}, image_type: {}'.format(horizon, splits, BATCH_SIZE, val_size, max_train_size, EPOCHS, image_type))

print('length of X: {}, length of y: {}'.format(len(X), len(y)))
print('X:', X, 'y:', y, sep='\n')


In [None]:
# For the SCALP horizon only the most recent 15000 training windows are kept.
if horizon == 'SCALP_1D':
    training_size = 15000
    most_recent = slice(-training_size, None)
    x_train, y_train = x_train[most_recent], y_train[most_recent]

In [None]:
### Time-series cross-validation: tune, train and score one CNN per fold
PATH = os.path.dirname(os.getcwd())
save_dir = PATH + '/saved_models/{}/'.format(horizon)
os.makedirs(save_dir, exist_ok=True)  # checkpoints are written here

# First fold to train. Keep at 0 for a full, reproducible run; raise it only to
# resume an interrupted run whose earlier fold results are already on disk.
START_FOLD = 0

# Time-series split: every validation block lies after its training block.
# tscv = TimeSeriesSplit(n_splits=splits, test_size=val_size,  max_train_size= max_train_size)
tscv = TimeSeriesSplit(n_splits=splits, test_size=val_size)

for i, (train_index, val_index) in enumerate(tscv.split(x_train)):
    if i < START_FOLD:
        continue
    print(f"Fold {i}:")

    # Per-fold result holders; each fold writes its own one-row CSV below.
    VALIDATION_ACCURACY = []
    VALIDATION_LOSS = []
    BEST_HYPS = []

    # Drop the oldest i*val_size samples so the training window slides forward
    # with the fold instead of only growing.
    train_index = train_index[i*val_size:]
    df_train_x = x_train[train_index]
    df_train_y = y_train[train_index]

    df_val_x = x_train[val_index]
    df_val_y = y_train[val_index]

    print("train : {}".format(len(df_train_x)))
    print("validation : {}".format(len(df_val_x)))
    # Step counts assume batches of BATCH_SIZE, so BATCH_SIZE must also be
    # passed to search()/fit(); otherwise Keras falls back to its default batch
    # size and each epoch/validation pass covers only part of the data.
    steps_per_epoch = len(df_train_x) // BATCH_SIZE
    validation_steps = len(df_val_x)// BATCH_SIZE

    # Keep the weights of the epoch with the best validation accuracy.
    checkpoint_path = os.path.join(save_dir, get_model_name(i, horizon, image_type))
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                  monitor='val_accuracy', verbose=1,
                                                  save_best_only=True, mode='max')

    # Stop early once the validation accuracy stops improving.
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3,  min_delta=0.001, verbose=1, restore_best_weights=True, start_from_epoch=3)

    # https://keras.io/api/keras_tuner/tuners/base_tuner/#tuner-class
    tuner = keras_tuner.GridSearch(
        hypermodel = build_model,
        objective="val_accuracy",
        seed=5,
        max_trials=100,
        overwrite=True,
        directory="model_{}_{}".format(horizon, image_type),
        project_name="tune_hypermodel_{}".format(i),)

    tuner.search(df_train_x, df_train_y,
                 epochs=EPOCHS,
                 batch_size=BATCH_SIZE,
                 steps_per_epoch=steps_per_epoch,
                 validation_data=(df_val_x, df_val_y),
                 validation_steps=validation_steps,
                 callbacks=[stop_early])
    # Get the optimal hyperparameters
    best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
    BEST_HYPS.append(best_hps.values)

    # Rebuild the model with the optimal hyperparameters and train it for up to EPOCHS epochs
    model = tuner.hypermodel.build(best_hps)
    history = model.fit(df_train_x, df_train_y,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        steps_per_epoch=steps_per_epoch,
                        validation_data=(df_val_x, df_val_y),
                        validation_steps=validation_steps,
                        callbacks=[checkpoint, stop_early])

    # Load the best checkpoint to evaluate performance on the validation block
    model.load_weights(checkpoint_path)
    scores = model.evaluate(df_val_x, df_val_y)
    print("{0}s: {1:.2f}%".format(model.metrics_names[1], scores[1]*100))
    results = dict(zip(model.metrics_names, scores))
    VALIDATION_ACCURACY.append(results['accuracy'])
    VALIDATION_LOSS.append(results['loss'])

    dict_results = {'VALIDATION_ACCURACY': VALIDATION_ACCURACY,
                    'VALIDATION_LOSS': VALIDATION_LOSS,
                    'BEST_HYPS': BEST_HYPS}
    # Use a dedicated name: assigning to `df` here would overwrite the dataset
    # frame that the earlier cells depend on.
    fold_results = pd.DataFrame(dict_results)
    fold_results.to_csv('model_{0}_{1}/results_{0}_{1}_fold{2}.csv'.format(horizon, image_type, i))

    # Good practice to explicitly close each
    # tensorflow session prior to starting a
    # new one in a loop for memory considerations
    tf.keras.backend.clear_session()
    clear_output(wait=True)


    

In [None]:
# Gather the one-row result file written for each fold into a single frame.
# The loop variable is called `fold` on purpose: naming it `split` would
# overwrite the train/test split index defined earlier in the notebook.
result_columns = ['VALIDATION_ACCURACY', 'VALIDATION_LOSS', 'BEST_HYPS']
frames = []
for fold in range(splits):
    fold_csv = 'model_{0}_{1}/results_{0}_{1}_fold{2}.csv'.format(horizon, image_type, fold)
    result = pd.read_csv(fold_csv)[result_columns]
    frames.append(list(result.iloc[0]))
results_df = pd.DataFrame(frames, columns=result_columns)
    

In [None]:
# Per-fold validation accuracy, validation loss and best hyperparameters.
results_df

In [None]:
# Alternative source (disabled): a single combined results file.
# results_df = pd.read_csv('./results_{}_{}.csv'.format(horizon, image_type))[['VALIDATION_ACCURACY', 'VALIDATION_LOSS', 'BEST_HYPS']]

# Plain lists, one entry per fold, for the model-selection cell below.
VALIDATION_ACCURACY, VALIDATION_LOSS, BEST_HYPS = (
    list(results_df[column])
    for column in ('VALIDATION_ACCURACY', 'VALIDATION_LOSS', 'BEST_HYPS')
)

In [None]:
# Window the scaled test closes the same way as the training data; each window is paired with the label that follows it.
X_test, y_test = make_timeseries_instances(list(df_test['close']), window_size, list(test_label))

In [None]:
# Load the checkpoint of the fold with the highest validation accuracy and
# score it on the test windows.
from numpy import loadtxt
from tensorflow.keras.models import load_model

top_accuracy = max(VALIDATION_ACCURACY)
best_model_index = VALIDATION_ACCURACY.index(top_accuracy)

# load model
# https://keras.io/guides/serialization_and_saving/
best_model_file = save_dir + 'model_{}_{}_{}.h5'.format(horizon, image_type, best_model_index)
model = load_model(best_model_file)

# summarize model.
model.summary()

# evaluate the model; `score` follows the order of model.metrics_names
score = model.evaluate(X_test, y_test, verbose=0)


In [None]:
# Hyperparameters of the selected fold's model.
BEST_HYPS[best_model_index]

In [None]:
# Results: the first four entries of `score`, each labelled with its metric
# name and multiplied by 100.
for position in range(4):
    metric_label = model.metrics_names[position]
    metric_value = score[position]
    print("{}: {}%".format(metric_label, metric_value*100))

In [None]:
### Making predictions with the model
# Get the predicted values for the test set:
# test_generator.filenames #LONG = 0, SHORT = 1

# One model output per test window.
y_pred = model.predict(X_test)

In [None]:
# Side-by-side view: true label (first column) and model output (second column).
print(np.column_stack((y_test, y_pred)))

In [None]:
# Two-column frame: true label next to the CNN output for every test window.
paired = np.column_stack((y_test, y_pred))
true_pred = pd.DataFrame(paired, columns=["true", "predicted"])
true_pred.head()

In [None]:
# Write the CNN true/predicted pairs to the data folder (path is relative to the working directory).
# NOTE(review): the file name says "2dCNN" although this notebook trains a 1D CNN — confirm what downstream readers expect before renaming.
true_pred.to_csv('../data/true_pred_2dCNN_{}_{}.csv'.format(horizon, image_type), index=False)