<a href="https://colab.research.google.com/github/leonardobocci/ml-stock-market/blob/main/tuning_neural_nets_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the third-party packages this notebook imports below.
# Only featurewiz is pinned to an exact version; tscv, neptune and the
# neptune Keras integration are installed at whatever version pip resolves.
!pip install featurewiz==0.1.996
!pip install tscv
!pip install neptune
!pip install neptune-tensorflow-keras

In [None]:
import pandas as pd
from featurewiz import FeatureWiz
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from dateutil.relativedelta import relativedelta
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
import numpy as np
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from tscv import GapRollForward
import neptune
from neptune.integrations.tensorflow_keras import NeptuneCallback
import plotly.express as px

Imported 0.1.996 version. Select nrows to a small number when running on huge datasets.
output = featurewiz(dataname, target, corr_limit=0.70, verbose=2, sep=',', 
		header=0, test_data='',feature_engg='', category_encoders='',
		dask_xgboost_flag=False, nrows=None)
Create new features via 'feature_engg' flag : ['interactions','groupby','target']



In [None]:
%%capture
# Mount Google Drive so the per-ETF CSV files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user and build a gspread client from the default credentials.
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

# The workbook has one worksheet per ETF; the worksheet titles are the ETF keys.
workbook = gc.open('all_etfs_OHLCV')
sheet_titles = [worksheet.title for worksheet in workbook.worksheets()]

# Download the raw cell values of every worksheet, keyed by worksheet title.
dict_of_sheets = {}
for sheet_title in sheet_titles:
  sheet = workbook.worksheet(sheet_title)
  values = sheet.get_all_values()
  dict_of_sheets.update({sheet_title: values})

keys = [*dict_of_sheets]

# Load one CSV per ETF from Drive, parse its 'date' column and index the frame by it.
etfs = {}
for etf in keys:
  etf_frame = pd.read_csv(f'/content/drive/MyDrive/Bocci_Machine_Learning_Returns/Data/{etf}.csv')
  etf_frame['date'] = pd.to_datetime(etf_frame['date'], format="%Y/%m/%d")
  etfs[etf] = etf_frame.set_index('date')

# Previously stored results; 'id' is the concatenation of the etf and model columns.
results_path='/content/drive/MyDrive/Bocci_Machine_Learning_Returns/Data/results_lstm.csv'
results=pd.read_csv(results_path)
results['id'] = results['etf'] + results['model']

In [None]:
def split_scale(df):
  """Split one ETF frame into train/test sets, standardize the features and
  publish everything as module-level globals.

  The test set is every row dated on or after ``split_point`` (three years
  before the latest index value, truncated to midnight). The training set is
  the most recent ``4 * len(test)`` rows before ``split_point``. Features are
  all columns except 'day', 'month' and 'log_returns'; the target is
  'log_returns'. The scaler is fitted on the training features only.

  Args:
    df: DataFrame indexed by date (the index must be named 'date', since the
      saved date frames are filtered through their 'date' column).

  Returns:
    None. Sets the globals x_train, x_test, y_train, y_test, x_train_scaled,
    x_test_scaled, split_point, dates_df, dates_test_df and dates_train_df.
    y_pred, x_train_norm and x_test_norm are declared global but are not
    assigned here.
  """
  global y_train, y_pred, y_test, x_train, x_test
  global x_train_scaled, x_train_norm, x_test_scaled, x_test_norm
  global split_point, dates_df, dates_test_df, dates_train_df

  # Predictors are everything except the calendar columns and the target.
  predictor_cols = df.columns.drop(['day', 'month', 'log_returns'])
  predictors = df.loc[:, predictor_cols]
  target = df.loc[:, 'log_returns']

  # Hold out the final three years; going through .date() drops any time-of-day part.
  newest = max(df.index)
  split_point = pd.to_datetime((newest - relativedelta(years = 3)).date())
  in_test = predictors.index >= split_point
  in_train = predictors.index < split_point

  x_test = predictors.loc[in_test].values
  y_test = target.loc[in_test].values

  # Train on the most recent window that is four times as long as the test set.
  n_train = len(x_test) * 4
  x_train = predictors.loc[in_train].tail(n_train).values
  y_train = target.loc[in_train].tail(n_train).values

  # Fit the scaler on training rows only, then apply it to both splits and
  # restore the column names that .values dropped.
  standardizer = preprocessing.StandardScaler().fit(x_train)
  x_train_scaled = pd.DataFrame(standardizer.transform(x_train), columns = predictor_cols)
  x_test_scaled = pd.DataFrame(standardizer.transform(x_test), columns = predictor_cols)
  y_test = pd.DataFrame(y_test, columns=['Log_Returns'])
  y_train= pd.DataFrame(y_train, columns=['Log_Returns'])

  # Keep the dates that line up row-for-row with the train and test arrays.
  dates_df = pd.DataFrame(df.index)
  is_test_date = dates_df.date >= split_point
  is_train_date = dates_df.date < split_point
  dates_test_df = dates_df.loc[is_test_date].reset_index(drop=True)
  dates_train_df = dates_df.loc[is_train_date].tail(n_train).reset_index(drop=True)

In [None]:
def featurewiz_selection():
  """Run FeatureWiz feature selection on the scaled global train/test features.

  Reads the globals ``x_train_scaled``, ``x_test_scaled`` and ``y_train``
  (set by ``split_scale``). The selector is fitted on the training features
  and labels only and then applied to the test features. FeatureWiz is
  configured with ``corr_limit=0.70`` and with feature engineering and
  category encoding switched off (empty strings).

  Returns:
    Tuple ``(train_features, test_features)`` as returned by
    ``FeatureWiz.fit_transform`` and ``FeatureWiz.transform``.
  """
  selector = FeatureWiz(corr_limit=0.70, feature_engg='', category_encoders='', dask_xgboost_flag=False, nrows=None, verbose=0)
  # Hand FeatureWiz copies so that it cannot modify the global frames in place.
  train_features = selector.fit_transform(x_train_scaled.copy(), y_train.copy())
  test_features = selector.transform(x_test_scaled.copy())
  return train_features, test_features

In [None]:
def validate_ff():
  """Fit and log a small grid of feed-forward regressors for the current ETF.

  Reads the module-level globals ``x_train_scaled``, ``x_test_scaled``,
  ``y_train``, ``y_test`` and ``etf``. For every (n_hid_layers, n_neurons)
  pair in the grid a Neptune run is opened, a model is trained with
  ``validation_split=0.2``, and the test RMSE plus the std/min/max/mean of
  the test predictions are logged to that run. The run is always stopped,
  even when training or prediction raises.

  Returns:
    None. Results are only written to Neptune.
  """
  import os

  # Only 'n_neurons' and 'n_hid_layers' are swept; the other entries are fixed.
  # 'weight_decay' is kept for reference but is NOT passed to the optimizer:
  # the model below is built with None for both weight decay and clipnorm.
  # A wider grid considered earlier also varied learning_rate, n_epochs,
  # dropout_rate, weight_decay, clipnorm and early stopping.
  params = {
        'learning_rate': 0.0005,
        'n_epochs': 75,
        'n_neurons': [16, 50],
        'n_hid_layers': [4, 12],
        'dropout_rate': 0.1,
        'weight_decay': 0.00001,
    }

  # Prefer the NEPTUNE_API_TOKEN environment variable.
  # NOTE(review): the literal fallback is a credential committed to source; it
  # is kept only so the notebook keeps running unchanged. Rotate the token and
  # delete the fallback.
  api_token = os.environ.get('NEPTUNE_API_TOKEN') or 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxM2Q3OTFlZi0zZWQ5LTRjY2UtOGUyMi1mOTY4ZWFkOWE4YTAifQ=='

  train_features = tf.convert_to_tensor(x_train_scaled)
  test_features = tf.convert_to_tensor(x_test_scaled)
  train_labels = tf.convert_to_tensor(y_train)

  def build_and_compile_ff(lr, clipping, decay, neurons, layers, dropout):
    """Build `layers` Dense(neurons)+Dropout pairs and a Dense(1) head, compiled with MSE loss and Adam.

    NOTE(review): the Dense layers are created without an activation argument,
    so apart from dropout the stack is linear - confirm this is intended.
    """
    model = Sequential()
    for _ in range(layers):
      model.add(Dense(neurons))
      model.add(Dropout(dropout))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error',
                  optimizer=tf.keras.optimizers.Adam(weight_decay=decay, clipnorm=clipping, learning_rate=lr))
    return model

  learning_rate = params['learning_rate']
  dropout_rate = params['dropout_rate']
  n_epochs = params['n_epochs']

  for n_hid_layers in params['n_hid_layers']:
    for n_neurons in params['n_neurons']:
      #Start Run
      run = neptune.init_run(
          project="ku-master-research/master-thesis-ff",
          api_token=api_token
      )  # Credentials to log experiments
      try:
        neptune_callback = NeptuneCallback(run=run)

        #Log Stuff
        run['mytracking/etf'] = etf
        run['mytracking/n_neurons'] = n_neurons
        run['mytracking/layers'] = n_hid_layers

        # clipnorm and weight decay are passed as None on purpose (see params note).
        ff = build_and_compile_ff(learning_rate, None, None, n_neurons, n_hid_layers, dropout_rate)
        ff.fit(
            train_features,
            train_labels,
            validation_split=0.2,
            verbose=0,
            epochs=n_epochs,
            callbacks=[neptune_callback]
        )

        #Make Predictions
        y_pred = ff.predict(test_features)
        run['mytracking/test_rmse'] = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        run['mytracking/pred_stdev'] = np.std(y_pred)
        run['mytracking/pred_min'] = np.min(y_pred)
        run['mytracking/pred_max'] = np.max(y_pred)
        run['mytracking/pred_mean'] = np.mean(y_pred)
      finally:
        #End Run - also when fit/predict fails, so no run is left open
        run.stop()

In [None]:
def prepare_data(window_size):
  """Turn the scaled global train/test data into 3-D tensors for the LSTM.

  Reads the globals ``x_train_scaled``, ``x_test_scaled``, ``y_train`` and
  ``y_test``.

  * ``window_size == 0``: every row becomes one sample reshaped to
    ``(n_features, 1)``; labels are the full ``y_train`` / ``y_test``.
  * otherwise: each sample is ``window_size`` consecutive feature rows and its
    label is the target of the row right after the window. The last
    ``window_size`` training rows are prepended to the test features so the
    first test label already has a full window.

  Args:
    window_size: number of rows per window, or 0 for the no-window layout.

  Returns:
    Tuple ``(train_features, test_features, train_labels, test_labels)`` of
    tensors.
  """
  if window_size == 0:
    from tensorflow.python.ops.numpy_ops import np_config
    # Switch on NumPy-style tensor behavior before calling .reshape on tensors.
    np_config.enable_numpy_behavior()
    train_x = tf.convert_to_tensor(x_train_scaled)
    train_x = train_x.reshape((train_x.shape[0], train_x.shape[1], 1))
    test_x = tf.convert_to_tensor(x_test_scaled)
    test_x = test_x.reshape((test_x.shape[0], test_x.shape[1], 1))
    train_y = tf.convert_to_tensor(y_train)
    test_y = tf.convert_to_tensor(y_test)
    return train_x, test_x, train_y, test_y

  horizon = 1

  # Training windows: rows [stop - window_size, stop) paired with the target at row `stop`.
  train_stops = range(window_size, len(x_train_scaled) - horizon + 1)
  train_x = np.array([x_train_scaled.iloc[stop - window_size:stop] for stop in train_stops])
  train_y = np.array([y_train.iloc[stop + horizon - 1:stop + horizon, 0] for stop in train_stops])

  # Prepend the tail of the training features so the first test target has a full window.
  train_tail = x_train_scaled.iloc[len(x_train_scaled)-window_size:,:]
  x_test_lstm = pd.concat([train_tail, x_test_scaled]).reset_index(drop=True)

  # Test windows: row `stop` of the padded frame corresponds to test row `stop - window_size`.
  test_stops = range(window_size, len(x_test_lstm) - horizon + 1)
  test_x = np.array([x_test_lstm.iloc[stop - window_size:stop] for stop in test_stops])
  test_y = np.array([y_test.iloc[stop - window_size:stop - window_size + 1, 0] for stop in test_stops])

  #Convert inputs to Tensors
  train_x = tf.convert_to_tensor(train_x)
  test_x = tf.convert_to_tensor(test_x)
  train_y = tf.convert_to_tensor(train_y)
  test_y = tf.convert_to_tensor(test_y)
  return train_x, test_x, train_y, test_y
  

def build_lstm(n_hid_layers, n_units, decay, clipping, lr, dropout=0.05):
  """Build and compile a stacked LSTM regressor.

  The network is ``n_hid_layers`` sequence-returning LSTM layers, each
  followed by Dropout, then one more LSTM that returns only its last output,
  and a Dense(1) head. So there are ``n_hid_layers + 1`` LSTM layers in total.

  Args:
    n_hid_layers: number of LSTM+Dropout pairs before the final LSTM.
    n_units: units in every LSTM layer.
    decay: ``weight_decay`` passed to Adam.
    clipping: ``clipnorm`` passed to Adam.
    lr: ``learning_rate`` passed to Adam.
    dropout: rate for every Dropout layer; defaults to 0.05, the value that
      was previously hard-coded.

  Returns:
    The compiled Sequential model (MSE loss, Adam optimizer).
  """
  lstm = Sequential()
  for _ in range(n_hid_layers):
    lstm.add(LSTM(n_units, return_sequences=True))
    lstm.add(Dropout(dropout))
  lstm.add(LSTM(n_units, return_sequences=False))
  lstm.add(Dense(1))
  lstm.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(weight_decay=decay, clipnorm=clipping, learning_rate=lr))
  return lstm

def validate_lstm():
  """Fit and log a grid of LSTM regressors for the current ETF.

  Reads the module-level globals ``etf`` and ``y_test`` and, through
  ``prepare_data``, the scaled train/test data. For every hyper-parameter
  combination a Neptune run is opened, a model is trained with
  ``validation_split=0.2``, and the test RMSE plus the std/min/max/mean of
  the test predictions are logged. The run is always stopped, even when
  training or prediction raises.

  The swept ``val_dropout`` value is now passed to the model, so the logged
  dropout matches the one actually used (the model previously always used
  0.05 regardless of the grid).

  Returns:
    None. Results are only written to Neptune.
  """
  import itertools
  import os

  # Every list-valued entry is swept; 'learning_rate' is a fixed scalar.
  params = {
      'learning_rate': 0.0005,
      'window_size': [0],
      'n_epochs': [1000],
      'n_units': [250],
      'n_hid_layers': [10],
      'val_dropout': [0.1],
      'weight_decay': [0.00001],
      'clipnorm': [1]
  }

  # Prefer the NEPTUNE_API_TOKEN environment variable.
  # NOTE(review): the literal fallback is a credential committed to source; it
  # is kept only so the notebook keeps running unchanged. Rotate the token and
  # delete the fallback.
  api_token = os.environ.get('NEPTUNE_API_TOKEN') or "eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIxM2Q3OTFlZi0zZWQ5LTRjY2UtOGUyMi1mOTY4ZWFkOWE4YTAifQ=="

  learning_rate = params['learning_rate']

  for window_size in params['window_size']:
    #Prepare Data into LSTM Format - depends only on window_size, so do it once per window
    train_features, test_features, train_labels, _ = prepare_data(window_size)

    # Same iteration order as nesting the loops in this key order.
    grid = itertools.product(
        params['n_epochs'],
        params['n_units'],
        params['n_hid_layers'],
        params['val_dropout'],
        params['weight_decay'],
        params['clipnorm'],
    )
    for n_epochs, n_units, n_hid_layers, val_dropout, weight_decay, clipnorm in grid:
      #Start Run
      run = neptune.init_run(
          project="ku-master-research/master-thesis-lstm",
          api_token=api_token,
      )  # Credentials to log experiments
      try:
        neptune_callback = NeptuneCallback(run=run)

        #Log Stuff
        run['namespace/field_name'] = f'etf_{etf}_win_{window_size}_epo_{n_epochs}_uni_{n_units}_lay_{n_hid_layers}'
        run['mytracking/etf'] = etf
        run['mytracking/dropout'] = val_dropout
        run['mytracking/weight_decay'] = weight_decay
        run['mytracking/clipnorm'] = clipnorm
        run['mytracking/window_size'] = window_size
        run['mytracking/epochs'] = n_epochs
        run['mytracking/units'] = n_units
        run['mytracking/layers'] = n_hid_layers

        #Build Model
        lstm = build_lstm(n_hid_layers, n_units, weight_decay, clipnorm, learning_rate, dropout=val_dropout)

        #Run Model
        lstm.fit(
            train_features,
            train_labels,
            validation_split=0.2,
            verbose=0,
            epochs=n_epochs,
            callbacks=[neptune_callback]
        )

        #Make Predictions
        y_pred = lstm.predict(test_features)
        run['mytracking/test_rmse'] = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
        run['mytracking/pred_stdev'] = np.std(y_pred)
        run['mytracking/pred_min'] = np.min(y_pred)
        run['mytracking/pred_max'] = np.max(y_pred)
        run['mytracking/pred_mean'] = np.mean(y_pred)
      finally:
        #End Run - also when fit/predict fails, so no run is left open
        run.stop()

In [None]:
# Tune one ETF at a time. The loop variable must stay named `etf`: the
# validate_* functions read it as a global when logging to Neptune.
for etf in keys:
    current_frame = etfs[etf]
    split_scale(current_frame)
    # Replace the scaled feature globals with the FeatureWiz-selected columns.
    selected_splits = featurewiz_selection()
    x_train_scaled, x_test_scaled = selected_splits
    validate_ff()
    # To tune the LSTM instead, call validate_lstm() here.

wiz = FeatureWiz(verbose=1)
        X_train_selected = wiz.fit_transform(X_train, y_train)
        X_test_selected = wiz.transform(X_test)
        wiz.features  ### provides a list of selected features ###            
        
############################################################################################
############       F A S T   F E A T U R E  E N G G    A N D    S E L E C T I O N ! ########
# Be judicious with featurewiz. Don't use it to create too many un-interpretable features! #
############################################################################################
Skipping feature engineering since no feature_engg input...
Skipping category encoding since no category encoders specified in input...
#### Single_Label Regression problem ####
    Loaded train data. Shape = (3132, 84)
#### Single_Label Regression problem ####
No test data filename given...
#######################################################################################
####################