# HYPERPARAMETER TUNING SIMPLE MODEL

In [2]:
import pandas as pd
import modules.preprocessing.sampling as sampling
import modules.preprocessing.scaling as scaling
import modules.constants as const

import numpy as np
import modules.training.LSTMmodels as LSTMmodels
import torch.nn as nn
import torch.optim as optim
import modules.training.training as training

import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import torch

import modules.utils as utils
import modules.plot_utils as plutils
import modules.plot_constants as pltconst
from modules.plot_constants import uzh_colors

from tqdm import tqdm
from matplotlib.lines import Line2D

#### Load All Data

In [3]:
# Input features: one CSV per feature group, read in a fixed order and
# unpacked straight into the per-group data frames.
market_df, network_df, social_df, supply_df, technical_df = (
    pd.read_csv(const.input_X_dir + file_name)
    for file_name in (
        "Market.csv",
        "NetworkActivity.csv",
        "SocialNetworks.csv",
        "Supply.csv",
        "TechnicalIndicators.csv",
    )
)

# Target feature and the dates.
df_y = pd.read_csv(const.input_y_dir + "Target.csv")
dates_df = pd.read_csv(const.input_y_dir + "Dates.csv")

# Plain arrays extracted from the frames; the dates are flattened to 1-D.
y = df_y.values
dates = dates_df.values.flatten()

# Output folders for the figures and tables of the hyperparameter tuning.
tables_dir = const.tezos_results_dir + "Hyperparameter tuning/Tables/"
figures_dir = const.tezos_results_dir + "Hyperparameter tuning/Figures/"

# 1. SINGLE DATA GROUP

In [4]:
# Settings that stay fixed during the search.
# OUTPUT_DIM follows from the nature of the prediction task. The window and
# step sizes are values that are often used for similar tasks (such as stock
# price prediction), since a longer time window can better capture longer trends.
OUTPUT_DIM = 1
WINDOW_SIZE = 30
STEP_SIZE = 1

# Test-set size computed by the sampling helper from const.test_start_date.
TEST_SIZE = sampling.calculate_test_size_from_date(const.test_start_date)

In [5]:
# Hyperparameter tuning works on validation sets taken from the training data,
# so the training arrays created here are what the search cells split further.
X = market_df.values
n_features = X.shape[1]

X_train, y_train, X_test, y_test, scaler = sampling.prepare_input_data(
    X,
    y,
    test_size=TEST_SIZE,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
    do_segmentation=False,
)

In [4]:
# Candidate values explored in section (1.1).
n_hidden_options = [32, 64, 128, 256]
lr_options = [0.1, 0.01, 0.001]

# Training length used while searching.
n_epochs = 200

# Number of splits handed to TimeSeriesSplit for validation.
N_VALIDATION_SPLITS = 6

# Loss used both for training and for scoring the validation folds.
mse_loss = nn.MSELoss()

## (1.1) Hidden neuron numbers and learning rate

In [None]:
# Grid search over the number of hidden neurons and the learning rate.
# Every (n_hidden, lr) pair is trained from scratch on each time-ordered
# split of X_train and scored on the matching validation part.
tscv = TimeSeriesSplit(n_splits=N_VALIDATION_SPLITS)

# results[n_hidden][lr]      -> (mean final training loss, mean validation loss)
# *_curves_all[n_hidden][lr] -> loss curve averaged element-wise over the splits
results = {}
training_curves_all, validation_curves_all = {}, {}

for n_hidden in n_hidden_options:
    print("Hidden neurons: ", n_hidden)
    results[str(n_hidden)] = {}
    training_curves_all[str(n_hidden)] = {}
    validation_curves_all[str(n_hidden)] = {}

    for lr in lr_options:
        tr_loss, val_loss = [], []
        training_curves, validation_curves = [], []

        # TimeSeriesSplit yields expanding training windows, each followed by
        # the next chronological chunk as validation data. The split generator
        # has no length, so `total` is passed to give tqdm a real progress bar.
        for train_indexes, val_indexes in tqdm(tscv.split(X_train), total=tscv.get_n_splits()):
            X_tr, y_tr = X_train[train_indexes], y_train[train_indexes]
            X_val, y_val = X_train[val_indexes], y_train[val_indexes]

            model = LSTMmodels.LSTMSimple(input_size=X_tr.shape[2], hidden_size=n_hidden, output_size=OUTPUT_DIM)
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # One batch spans the whole training part of the split (full-batch training).
            data_loader = sampling.make_data_loader(X_tr, y_tr, batch_size=X_tr.shape[0])
            # Use the shared n_epochs setting rather than a duplicated literal,
            # so changing the setting actually changes this search.
            model, train_loss_curve, validation_loss_curve = training.train_model(
                model,
                data_loader,
                n_epochs=n_epochs,
                optimizer=optimizer,
                loss_fn=mse_loss,
                X_val=X_val,
                y_val=y_val,
            )

            training_curves.append(train_loss_curve)
            validation_curves.append(validation_loss_curve)
            predictions, val_loss_value = training.make_prediction(model, X_val, y_val, mse_loss)

            # The last entry of the training curve is taken as the final training loss.
            tr_loss.append(train_loss_curve[-1])
            val_loss.append(val_loss_value)

        results[str(n_hidden)][str(lr)] = (np.average(tr_loss), np.average(val_loss))
        training_curves_all[str(n_hidden)][str(lr)] = np.average(np.array(training_curves), axis=0)
        validation_curves_all[str(n_hidden)][str(lr)] = np.average(np.array(validation_curves), axis=0)

# Nested dicts become frames with one column per hidden size and one row per
# learning rate; reset_index() moves the learning rate into an "index" column.
df1 = pd.DataFrame.from_dict(results).reset_index()
df2 = pd.DataFrame.from_dict(training_curves_all).reset_index()
df3 = pd.DataFrame.from_dict(validation_curves_all).reset_index()

df1.to_csv(const.data_dir + "Temp/Results_of_benchmark_tuning1.csv", index=False)
df2.to_csv(const.data_dir + "Temp/Training_curves_benchmark1.csv", index=False)
df3.to_csv(const.data_dir + "Temp/Validation_curves_benchmark1.csv", index=False)

#### Chosen configuration 1: Parameters and Plots

Below is the configuration of the chosen model, based on the lowest validation error.

In [5]:
# Configuration carried forward from the hidden-size / learning-rate search.
CHOSEN_N_HIDDEN = 128
CHOSEN_LR = 0.001
CHOSEN_N_EPOCHS = 100

## (1.2) Batch size

In [8]:
# Setup for the batch-size search. Re-declares the settings this section uses;
# TEST_SIZE is still expected to come from an earlier cell.
WINDOW_SIZE = 30
STEP_SIZE = 1

X = market_df.values
n_features = X.shape[1]

X_train, y_train, X_test, y_test, scaler = sampling.prepare_input_data(
    X,
    y,
    test_size=TEST_SIZE,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
    do_segmentation=False,
)

# Model output size and loss function.
OUTPUT_DIM = 1
mse_loss = nn.MSELoss()

# Configuration selected in section (1.1).
CHOSEN_N_HIDDEN = 128
CHOSEN_LR = 0.001
CHOSEN_N_EPOCHS = 100

# Number of splits handed to TimeSeriesSplit for validation.
N_VALIDATION_SPLITS = 6

In [None]:
# Batch-size search with the configuration chosen in (1.1). For every batch
# size the model is retrained on each time-ordered split of X_train; the
# per-split validation curves are drawn in one figure per batch size.
batch_sizes = [32, 64, 128, 256]
tscv = TimeSeriesSplit(n_splits=N_VALIDATION_SPLITS)

results = []
for batch_size in batch_sizes:
    tr_loss, val_loss = [], []
    training_curves, validation_curves = [], []

    # TimeSeriesSplit yields expanding training windows, each followed by the
    # next chronological chunk as validation data. The split generator has no
    # length, so `total` is passed to give tqdm a real progress bar.
    for fold, (train_indexes, val_indexes) in enumerate(
        tqdm(tscv.split(X_train), total=tscv.get_n_splits()), start=1
    ):
        X_tr, y_tr = X_train[train_indexes], y_train[train_indexes]
        X_val, y_val = X_train[val_indexes], y_train[val_indexes]

        model = LSTMmodels.LSTMSimple(input_size=X_tr.shape[2], hidden_size=CHOSEN_N_HIDDEN, output_size=OUTPUT_DIM)
        optimizer = optim.Adam(model.parameters(), lr=CHOSEN_LR)
        data_loader = sampling.make_data_loader(X_tr, y_tr, batch_size=batch_size)
        model, train_loss_curve, validation_loss_curve = training.train_model(
            model,
            data_loader,
            n_epochs=CHOSEN_N_EPOCHS,
            optimizer=optimizer,
            loss_fn=mse_loss,
            X_val=X_val,
            y_val=y_val,
        )
        # Label each curve so the splits can be told apart in the figure.
        plt.plot(range(len(validation_loss_curve)), validation_loss_curve, label=f"Split {fold}")

        training_curves.append(train_loss_curve)
        validation_curves.append(validation_loss_curve)
        predictions, val_loss_value = training.make_prediction(model, X_val, y_val, mse_loss)

        # The last entry of the training curve is taken as the final training loss.
        tr_loss.append(train_loss_curve[-1])
        val_loss.append(val_loss_value)

    # One figure per batch size; the title identifies which one it belongs to.
    # The x-axis label assumes one curve entry per epoch — TODO confirm against train_model.
    plt.title(f"Validation loss per split (batch size {batch_size})")
    plt.xlabel("Epoch")
    plt.ylabel("Validation loss (MSE)")
    plt.legend()
    plt.show()

    results.append({
        "batch_size": batch_size,
        "train_loss": np.average(tr_loss),
        "validation_loss": np.average(val_loss),
        "training_curve": np.average(np.array(training_curves), axis=0),
        "validation_curve": np.average(np.array(validation_curves), axis=0),
    })

# NOTE: the curve columns hold numpy arrays, which to_csv writes as their
# string representation; readers of this file have to parse that text.
batches_df = pd.DataFrame.from_dict(results)
batches_df.to_csv(tables_dir + "Batch_size_variations2.csv", index=False)

## (1.3) Window size

In [15]:
# Setup for the window-size search on the market feature group.
X = market_df.values

# Fixed data settings; the window size itself is varied in the next cell.
STEP_SIZE = 1
OUTPUT_DIM = 1
TEST_SIZE = sampling.calculate_test_size_from_date(const.test_start_date)

# Number of splits handed to TimeSeriesSplit for validation.
N_VALIDATION_SPLITS = 6

mse_loss = nn.MSELoss()

# Model/training configuration used for every window size.
# NOTE(review): sections (1.1) and (1.2) set CHOSEN_N_HIDDEN = 128 — confirm
# that 256 is intended here.
CHOSEN_N_HIDDEN = 256
CHOSEN_LR = 0.001
CHOSEN_N_EPOCHS = 100

CHOSEN_BATCH_SIZE = 32

In [16]:
# Window-size search: the windowed samples are rebuilt for every candidate
# window size and the chosen model configuration is cross-validated on them.
window_sizes = [7, 14, 30, 45]
dest_file = tables_dir + "Window_size_variations.csv"

tscv = TimeSeriesSplit(n_splits=N_VALIDATION_SPLITS)

results = []
for window_size in window_sizes:
    # The samples depend on the window size, so the data is prepared again
    # for each candidate (this overwrites the notebook-level X_train etc.).
    X_train, y_train, X_test, y_test, scaler = sampling.prepare_input_data(
        X,
        y,
        test_size=TEST_SIZE,
        window_size=window_size,
        step_size=STEP_SIZE,
        do_segmentation=False,
    )
    tr_loss, val_loss = [], []
    training_curves, validation_curves = [], []

    # TimeSeriesSplit yields expanding training windows, each followed by the
    # next chronological chunk as validation data. The split generator has no
    # length, so `total` is passed to give tqdm a real progress bar.
    for train_indexes, val_indexes in tqdm(tscv.split(X_train), total=tscv.get_n_splits()):
        X_tr, y_tr = X_train[train_indexes], y_train[train_indexes]
        X_val, y_val = X_train[val_indexes], y_train[val_indexes]

        model = LSTMmodels.LSTMSimple(input_size=X_tr.shape[2], hidden_size=CHOSEN_N_HIDDEN, output_size=OUTPUT_DIM)
        optimizer = optim.Adam(model.parameters(), lr=CHOSEN_LR)

        data_loader = sampling.make_data_loader(X_tr, y_tr, batch_size=CHOSEN_BATCH_SIZE)
        model, train_loss_curve, validation_loss_curve = training.train_model(
            model,
            data_loader,
            n_epochs=CHOSEN_N_EPOCHS,
            optimizer=optimizer,
            loss_fn=mse_loss,
            X_val=X_val,
            y_val=y_val,
        )

        training_curves.append(train_loss_curve)
        validation_curves.append(validation_loss_curve)
        predictions, val_loss_value = training.make_prediction(model, X_val, y_val, mse_loss)

        # The last entry of the training curve is taken as the final training loss.
        tr_loss.append(train_loss_curve[-1])
        val_loss.append(val_loss_value)

    results.append({
        "window_size": window_size,
        "train_loss": np.average(tr_loss),
        "validation_loss": np.average(val_loss),
        "training_curve": np.average(np.array(training_curves), axis=0),
        "validation_curve": np.average(np.array(validation_curves), axis=0),
    })

# NOTE: the curve columns hold numpy arrays, which to_csv writes as their
# string representation; readers of this file have to parse that text.
windows_df = pd.DataFrame.from_dict(results)
windows_df.to_csv(dest_file, index=False)

6it [07:29, 74.93s/it]
6it [07:19, 73.32s/it]
6it [08:31, 85.22s/it] 
6it [08:59, 89.91s/it] 


# 2. MULTIPLE DATA GROUPS

In [47]:
# Fixed data settings for the multi-group experiments.
WINDOW_SIZE = 30
STEP_SIZE = 1
OUTPUT_DIM = 1

# Test-set size computed by the sampling helper from const.test_start_date.
TEST_SIZE = sampling.calculate_test_size_from_date(const.test_start_date)

# NOTE(review): the visible search loop in (2.1) trains with one batch per
# split and does not read BATCH_SIZE — confirm whether it is still needed.
BATCH_SIZE = 10

In [48]:
# Candidate values explored in section (2.1).
n_hidden_options = [64, 128, 256, 512]
lr_options = [0.01, 0.001]

# Training length used while searching.
n_epochs = 150

# Number of splits handed to TimeSeriesSplit for validation.
N_VALIDATION_SPLITS = 6

# Loss used both for training and for scoring the validation folds.
mse_loss = nn.MSELoss()

In [50]:
# The input features now come from several data groups: the market, network
# activity and supply frames are merged into a single input.
X = utils.merge_dataframes([market_df, network_df, supply_df])
n_features = X.shape[1]

X_train2, y_train2, X_test2, y_test2, scaler = sampling.prepare_input_data(
    X,
    y,
    test_size=TEST_SIZE,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
    do_segmentation=False,
)

40


## (2.1) Hidden neurons and learning rate

In [54]:
# Grid search over the number of hidden neurons and the learning rate on the
# merged multi-group input. Every (n_hidden, lr) pair is trained from scratch
# on each time-ordered split of X_train2 and scored on the validation part.
tscv = TimeSeriesSplit(n_splits=N_VALIDATION_SPLITS)

# One dict per configuration; turned into a table after the search.
results = []

for n_hidden in n_hidden_options:
    print("Hidden neurons: ", n_hidden)

    for lr in lr_options:
        result_row = {"learning_rate": lr, "hidden_neurons": n_hidden}
        tr_loss, val_loss = [], []
        training_curves, validation_curves = [], []

        # TimeSeriesSplit yields expanding training windows, each followed by
        # the next chronological chunk as validation data. The split generator
        # has no length, so `total` is passed to give tqdm a real progress bar.
        for train_indexes, val_indexes in tqdm(tscv.split(X_train2), total=tscv.get_n_splits()):
            X_tr, y_tr = X_train2[train_indexes], y_train2[train_indexes]
            X_val, y_val = X_train2[val_indexes], y_train2[val_indexes]

            model = LSTMmodels.LSTMSimple(input_size=X_tr.shape[2], hidden_size=n_hidden, output_size=OUTPUT_DIM)
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # One batch spans the whole training part of the split (full-batch training).
            data_loader = sampling.make_data_loader(X_tr, y_tr, batch_size=X_tr.shape[0])
            model, train_loss_curve, validation_loss_curve = training.train_model(
                model,
                data_loader,
                n_epochs=n_epochs,
                optimizer=optimizer,
                loss_fn=mse_loss,
                X_val=X_val,
                y_val=y_val,
            )

            training_curves.append(train_loss_curve)
            validation_curves.append(validation_loss_curve)
            predictions, val_loss_value = training.make_prediction(model, X_val, y_val, mse_loss)

            # The last entry of the training curve is taken as the final training loss.
            tr_loss.append(train_loss_curve[-1])
            val_loss.append(val_loss_value)

        result_row["train_loss"] = np.average(tr_loss)
        result_row["validation_loss"] = np.average(val_loss)

        # Loss curves averaged element-wise over the splits.
        result_row["training_curve"] = np.average(np.array(training_curves), axis=0)
        result_row["validation_curve"] = np.average(np.array(validation_curves), axis=0)

        results.append(result_row)

# Best configuration first. reset_index() keeps the pre-sort row position as
# an extra "index" column, which is therefore also written to the CSV.
# NOTE: the curve columns hold numpy arrays, which to_csv writes as their
# string representation; readers of this file have to parse that text.
df1 = pd.DataFrame.from_dict(results).sort_values(by="validation_loss", ascending=True).reset_index()
df1.to_csv(tables_dir + "Results_of_benchmark_tuning2.csv", index=False)


Hidden neurons:  64


6it [04:26, 44.50s/it]
6it [04:18, 43.01s/it]


Hidden neurons:  128


6it [04:16, 42.75s/it]
6it [04:32, 45.40s/it]


Hidden neurons:  256


6it [05:32, 55.49s/it]
6it [06:28, 64.67s/it]


Hidden neurons:  512


6it [09:29, 94.98s/it] 
6it [08:36, 86.15s/it]
