<a href="https://colab.research.google.com/github/leonardobocci/ml-stock-market/blob/main/1.master_thesis_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Change FFNN to same style as LSTM which allows defining num_layers

Add cross validation to lstm and ffnn parameters

Pipeline Steps:
7. Load data for selected ETF
8. Split into train/test
9. Normalise scales/distributions
10. Feature selection
11. Run Models
12. Add Model Results to Results File

# Libraries and Data Loading

In [None]:
%%capture
!pip install feature_engine
!pip install featurewiz==0.1.996
!pip install tscv
!pip install lightgbm
!pip install pmdarima --upgrade
!pip install sklearn-genetic

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from google.colab import files
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn import metrics
from genetic_selection import GeneticSelectionCV
from tscv import GapRollForward
from tensorflow import keras

import pmdarima as pm
import tensorflow as tf

In [None]:
%%capture
# Mount Google Drive so the per-ETF CSV files and results.csv are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user and build a gspread client (gc) from the default credentials.
# gc is reused later by the save_* helpers to open other spreadsheets.
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
creds, _ = default()
gc = gspread.authorize(creds)

# Collect the worksheet titles of the 'all_etfs_OHLCV' workbook.
workbook = gc.open('all_etfs_OHLCV')
sheet_titles = []
for sheet in workbook.worksheets():
  sheet_titles.append(sheet.title)

# Download every worksheet's cell values, keyed by sheet title.
# NOTE(review): in the visible notebook only the keys of this dict are used (see `keys` below);
# the downloaded values themselves are not read again - confirm before removing the download.
dict_of_sheets = {}
for sheet_title in sheet_titles:
  sheet = workbook.worksheet(sheet_title)
  values = sheet.get_all_values()
  dict_of_sheets[sheet_title] = values

# One DataFrame per sheet title, read from a CSV of the same name and indexed by its parsed 'date' column.
keys = list(dict_of_sheets)
etfs = {}
for etf in keys:
  etfs[etf] = pd.read_csv(f'/content/drive/MyDrive/Bocci_Machine_Learning_Returns/Data/{etf}.csv')
  etfs[etf]['date'] = pd.to_datetime(etfs[etf]['date'], format="%Y/%m/%d")
  etfs[etf].set_index('date', inplace=True)

# Previously stored model results; 'id' (etf + model) is what the driver loop uses to
# detect etf/model combinations that have already been run.
results_path='/content/drive/MyDrive/Bocci_Machine_Learning_Returns/Data/results.csv'
results=pd.read_csv(results_path)
results['id'] = results.etf + results.model

In [None]:
#Import Models
from featurewiz import FeatureWiz
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from prophet import Prophet
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM

#Split and Scale

Splitting into test/train. A validation set is not necessary as hyper-parameters are tuned using cross validation instead.

The test set covers the final 3 years of data up to the last available date. The train set is trimmed to the most recent observations before that point, four times the length of the test set, which gives an 80:20 train:test split.

Scaling the data to have a mean of 0 and a unit standard deviation. The scaler is fitted on the training set only and then applied to both the training and the test set. Fitting the scaler on the test set, or scaling before splitting, would cause data leakage (using information from the test set).

Scaling was found to perform consistently and significantly better than normalising.

In [None]:
def split_scale(df):
  """Split one ETF frame into train/test sets and standardise the features.

  Nothing is returned; the results are published through module-level
  globals: x_train / x_test (raw arrays), x_train_scaled / x_test_scaled
  (DataFrames with the feature names as columns), y_train / y_test
  (single-column 'Log_Returns' DataFrames), split_point, and the
  dates_df / dates_test_df / dates_train_df frames.

  df is expected to have a datetime index named 'date' and to contain the
  'day', 'month' and 'log_returns' columns.
  """
  global y_train, y_pred, y_test, x_train, x_test, x_train_scaled, x_train_norm, x_test_scaled, x_test_norm, split_point, dates_df, dates_test_df, dates_train_df

  # Every column except the calendar helpers and the target is a feature.
  feature_names = df.columns.drop(['day', 'month', 'log_returns'])
  features = df.loc[:, feature_names]
  target = df.loc[:, 'log_returns']

  # The test window starts three years before the last available date.
  last_date = max(df.index)
  split_point = pd.to_datetime((last_date - relativedelta(years = 3)).date())
  in_test = features.index >= split_point

  x_test = features.loc[in_test].values
  y_test = target.loc[in_test].values

  # Keep only the most recent 4x-test-length rows before the split (80:20).
  train_length = 4 * len(x_test)
  x_train = features.loc[~in_test].tail(train_length).values
  y_train = target.loc[~in_test].tail(train_length).values

  # The scaler is fitted on the training rows only and then applied to both sets.
  scaler = preprocessing.StandardScaler().fit(x_train)
  x_train_scaled = pd.DataFrame(scaler.transform(x_train), columns = feature_names)
  x_test_scaled = pd.DataFrame(scaler.transform(x_test), columns = feature_names)

  # Wrap the targets so downstream code can address the 'Log_Returns' column.
  y_test = pd.DataFrame(y_test, columns=['Log_Returns'])
  y_train = pd.DataFrame(y_train, columns=['Log_Returns'])

  # Dates aligned row-for-row with the test and train sets.
  dates_df = pd.DataFrame(df.index)
  is_test_date = dates_df.date >= split_point
  dates_test_df = dates_df.loc[is_test_date].reset_index(drop=True)
  dates_train_df = dates_df.loc[~is_test_date].tail(train_length).reset_index(drop=True)

#Feature Selection

In [None]:
def genetic_selection(estimator):
  """Select features with a genetic search scored by roll-forward cross validation.

  Reads the module-level x_train_scaled, x_test_scaled and y_train.

  Args:
    estimator: regressor handed to GeneticSelectionCV to score candidate feature subsets.

  Returns:
    (train_features, test_features): copies of the scaled train and test
    frames restricted to the selected columns.
  """
  train_features = x_train_scaled.copy()
  test_features = x_test_scaled.copy()

  # Each CV window (train and test side) spans 16% of the training rows.
  splitter_size = int(0.16*len(train_features))
  splitter = GapRollForward(gap_size=0, min_test_size=splitter_size, min_train_size=splitter_size, max_test_size=splitter_size)

  selector = GeneticSelectionCV(
    estimator,
    cv=splitter,
    scoring='neg_mean_squared_error',
    n_population=100,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=40,
    crossover_independent_proba=0.1,
    mutation_independent_proba=0.05,
    tournament_size=3,
    n_gen_no_change=5,
    n_jobs=1,
  )
  # The selector only reads the labels, so no defensive copy is needed.
  selector = selector.fit(train_features, y_train.values.ravel())

  # support_ is a boolean mask over the input columns.
  selected_features = train_features.columns[selector.support_]
  return train_features[selected_features], test_features[selected_features]

In [None]:
def recursive_elimination(estimator):
  """Select features by recursive feature elimination with roll-forward CV.

  Reads the module-level x_train_scaled, x_test_scaled and y_train.

  Args:
    estimator: regressor handed to RFECV; one feature is removed per step.

  Returns:
    (train_features, test_features): copies of the scaled train and test
    frames restricted to the columns RFECV kept.
  """
  train_features = x_train_scaled.copy()
  test_features = x_test_scaled.copy()

  # Each CV window (train and test side) spans 16% of the training rows.
  splitter_size = int(0.16*len(train_features))
  splitter = GapRollForward(gap_size=0, min_test_size=splitter_size, min_train_size=splitter_size, max_test_size=splitter_size)

  selector = RFECV(estimator, step=1, cv=splitter, scoring='neg_mean_squared_error')
  # The selector only reads the labels, so no defensive copy is needed.
  selector.fit(train_features, y_train.values.ravel())

  # Boolean mask over the input columns, applied identically to both sets.
  selected_cols = selector.get_support()
  return train_features.iloc[:, selected_cols], test_features.iloc[:, selected_cols]

In [None]:
def featurewiz_selection():
  """Select features with FeatureWiz using a 0.70 correlation limit.

  Reads the module-level x_train_scaled, x_test_scaled and y_train. The
  selector is fitted on the training data only and then applied to the test
  data.

  Returns:
    (train_features, test_features): the transformed train and test frames.
  """
  train_features = x_train_scaled.copy()
  test_features = x_test_scaled.copy()
  train_labels = y_train.copy()

  # No feature engineering or category encoding is requested; selection only.
  features = FeatureWiz(corr_limit=0.70, feature_engg='', category_encoders='', dask_xgboost_flag=False, nrows=None, verbose=0)
  train_features = features.fit_transform(train_features, train_labels)
  test_features = features.transform(test_features)
  return train_features, test_features

# Models

##Useful Functions

In [None]:
# Names of the models evaluated for every ETF. The driver loop concatenates the ETF
# key with each name to look up existing results, and run_model() decides which run_*
# function handles a name, so a new entry here needs matching handling in run_model().
models = ['last_price', 'ols', 'ridge', 'lasso', 'elastic_net', 'decision_tree', 'random_forest', 'gradient_boost', 'xgboost', 'sv_rbf', 'lgbm', 'arima', 'ff_nn', 'lstm_nn', 'geometric_brownian']
# Starts empty; not referenced anywhere else in the visible part of the notebook.
model_predictions = {}

In [None]:
def save_results(etf, model_name, y_test, y_pred):
  """Append one model's test RMSE to the results CSV at results_path.

  The file is read, an 'id' column (etf + model) is recomputed for the stored
  rows, the new row is appended and the whole table is written back.

  Args:
    etf: ETF key the model was run for.
    model_name: name of the model.
    y_test: true test targets.
    y_pred: predicted test targets.
  """
  rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
  new_row = pd.DataFrame({'etf': etf, 'model': model_name, 'rmse': rmse}, index=[0])

  stored = pd.read_csv(results_path)
  stored['id'] = stored.etf + stored.model

  # The appended row carries no 'id'; it is rebuilt the next time the file is read.
  combined = pd.concat([stored, new_row]).reset_index(drop=True)
  with open(results_path, 'w', encoding = 'utf-8-sig') as handle:
    combined.to_csv(handle, index=False)
In [None]:
def save_fits_predictions(model_predictions, model_fits, model_name, etf):
  """Store a model's test predictions and train fits in Google Sheets.

  Predictions go to the 'predictions' workbook and fits to the 'fits'
  workbook, each in a worksheet named after the ETF. If that worksheet
  exists, the model's value and error columns are added to its current
  contents; otherwise a new worksheet is created with the dates and true
  values (taken from the module-level dates_*_df / y_* frames) first.

  Args:
    model_predictions: predicted test targets.
    model_fits: fitted train targets.
    model_name: prefix for the '<model>_predicted' / '<model>_fitted' and '<model>_error' columns.
    etf: ETF key, used as the worksheet title.
  """
  def _write_model_columns(workbook_name, values, value_label, dates, truth):
    # Adds `<model>_<value_label>` and `<model>_error` to the ETF sheet of one workbook.
    book = gc.open(workbook_name)
    existing_titles = [worksheet.title for worksheet in book.worksheets()]
    value_col = f'{model_name}_{value_label}'
    error_col = f'{model_name}_error'

    sheet = None
    if etf in existing_titles:
      sheet = book.worksheet(etf)
      table = pd.DataFrame(sheet.get_all_records())
    else:
      # First model for this ETF: start the table from the dates and true values.
      table = pd.DataFrame()
      table['date'] = dates['date'].astype(str)
      table['true'] = pd.DataFrame(truth)

    table[value_col] = pd.DataFrame(values)
    table[error_col] = table.true - table[value_col]

    if sheet is None:
      sheet = book.add_worksheet(etf, table.shape[0], table.shape[1])
    # Header row followed by the data rows.
    sheet.update([table.columns.values.tolist()] + table.values.tolist())

  _write_model_columns('predictions', model_predictions, 'predicted', dates_test_df, y_test)
  _write_model_columns('fits', model_fits, 'fitted', dates_train_df, y_train)
  

In [None]:
def save_params(model, model_name, etf):
  """Record a fitted model's parameters in the 'params' Google workbook.

  One worksheet per model name: if it already exists the new row is appended
  to its current contents, otherwise the worksheet is created with this row.

  Args:
    model: estimator exposing get_params().
    model_name: worksheet title to write to.
    etf: ETF key stored alongside the parameters.
  """
  row = pd.DataFrame(model.get_params(), index=[0])
  row['etf'] = etf
  # Missing parameter values are written as empty strings.
  row = row.fillna('')

  book = gc.open('params')
  titles = [worksheet.title for worksheet in book.worksheets()]

  if model_name not in titles:
    sheet = book.add_worksheet(model_name, row.shape[0], row.shape[1])
    table = row
  else:
    sheet = book.worksheet(f'{model_name}')
    previous = pd.DataFrame(sheet.get_all_records())
    table = pd.concat([previous, row]).reset_index(drop=True)

  # Header row followed by the data rows.
  sheet.update([table.columns.values.tolist()] + table.values.tolist())

In [None]:
def save_selected_cols(x_train_scaled, model_name, etf):
  """Record which feature columns a model was trained on.

  The column names of x_train_scaled are written, together with the ETF and
  model name, on top of the rows already present in 'Sheet1' of the
  'features' Google workbook.

  Args:
    x_train_scaled: training feature frame whose column names are recorded.
    model_name: model the features were used for.
    etf: ETF key the model was run for.
  """
  selected = pd.DataFrame({'features': list(x_train_scaled.columns)})
  selected['etf'] = etf
  selected['model'] = model_name

  sheet = gc.open('features').worksheet('Sheet1')
  stored = pd.DataFrame(sheet.get_all_records())

  # New rows first, previously stored rows after them.
  merged = pd.concat([selected, stored])
  sheet.update([merged.columns.values.tolist()] + merged.values.tolist())

In [None]:
def run_cross_val(estimator, parameters):
  """Grid-search hyper-parameters with roll-forward cross validation.

  Fits on the module-level x_train_scaled / y_train, scoring by negative
  mean squared error.

  Args:
    estimator: unfitted regressor to tune.
    parameters: parameter grid passed to GridSearchCV.

  Returns:
    A one-row DataFrame (index 0) with one column per tuned parameter holding
    the best value found.
  """
  # Each CV window (train and test side) spans 16% of the training rows.
  window = int(0.16*len(x_train_scaled))
  search = GridSearchCV(
      estimator=estimator,
      param_grid=parameters,
      cv=GapRollForward(gap_size=0, min_test_size=window, min_train_size=window, max_test_size=window),
      scoring='neg_mean_squared_error',
  )
  search.fit(x_train_scaled, y_train.values.ravel())
  return pd.DataFrame(search.best_params_, index=[0])

##Return=0

Models need to beat a simple returns=0 benchmark, where returns are assumed to follow a random walk with no drift.

In [None]:
def run_last_price():
  """Benchmark that predicts a log return of 0 for every period.

  Builds all-zero copies of the module-level y_test and y_train and stores
  them as the predictions and fits for the current global etf.
  """
  model_name = 'last_price'

  # Same shape and index as the targets, with every return replaced by 0.
  y_pred = y_test.assign(Log_Returns=0)
  y_fit = y_train.assign(Log_Returns=0)

  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)

##Geometric Brownian Motion

In [None]:
def run_geom_brown():
  """Geometric Brownian motion benchmark for the current global etf.

  Simulates 500 price paths starting at 1000, converts each path to
  per-period log returns and uses the mean across paths as the forecast.
  Test-period predictions use the mean and population standard deviation of
  the module-level y_train; the train-period "fit" uses the fixed values
  mu=0 and sigma=0.1.

  The simulation draws from NumPy's global random state and is therefore not
  reproducible unless the caller seeds it first.
  """
  model_name = 'geometric_brownian'
  #Starting Price
  So = 1000
  #Time increment
  dt = 1
  #Number of simulations
  scen_size = 500
  #Number of simulated steps for the test and train periods
  n_test = int(len(y_test) / dt)
  n_train = int(len(y_train) / dt)

  #Historical mean and volatility, read from the column by name. Indexing the
  #result of np.mean / np.std on the DataFrame by position breaks once the
  #reduction returns a scalar instead of a Series.
  mu = y_train['Log_Returns'].mean()
  sigma = y_train['Log_Returns'].std(ddof=0)
  #Fixed parameters used for the train-period paths
  mu_train = 0
  sigma_train = 0.1

  def _mean_simulated_log_returns(n_steps, drift_mean, volatility):
    #Mean per-period log return across scen_size simulated GBM paths.
    t = np.arange(1, n_steps + 1)
    #One row of standard-normal shocks per scenario
    shocks = np.random.normal(0, 1, (scen_size, n_steps))
    #Brownian path
    W = shocks.cumsum(axis=1)
    #Predicted scenarios, with the starting price prepended to every path
    S = So * np.exp((drift_mean - 0.5 * volatility**2) * t + volatility * W)
    S = np.hstack((np.full((scen_size, 1), float(So)), S))
    #Log returns along each path, then averaged across scenarios
    log_returns = np.diff(np.log(S), axis=1)
    return pd.DataFrame({'Log_Returns': log_returns.mean(axis=0)})

  y_pred = _mean_simulated_log_returns(n_test, mu, sigma)
  y_fit = _mean_simulated_log_returns(n_train, mu_train, sigma_train)

  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)

## Linear Regression

In [None]:
def run_ols():
  """Fit an ordinary least squares regression for the current global etf.

  Trains on the module-level x_train_scaled / y_train, predicts both sets and
  stores the parameters, test RMSE, fits/predictions and feature list.
  """
  model_name = 'ols'

  ols = LinearRegression()
  ols.fit(x_train_scaled, y_train)
  y_fit = ols.predict(x_train_scaled)
  y_pred = ols.predict(x_test_scaled)

  save_params(model=ols, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Ridge Regression

In [None]:
def run_ridge():
  """Tune, fit and store a Ridge regression for the current global etf.

  alpha is chosen by run_cross_val; the final model is trained on the
  module-level x_train_scaled / y_train and its parameters, test RMSE,
  fits/predictions and feature list are stored.
  """
  model_name = 'ridge'

  # Candidate regularisation strengths for the grid search.
  alpha_grid = {'alpha': [0.1, 1, 10, 100]}
  best = run_cross_val(Ridge(), alpha_grid)

  ridge = Ridge(alpha=best.loc[0, 'alpha'])
  ridge.fit(x_train_scaled, y_train)
  y_fit = ridge.predict(x_train_scaled)
  y_pred = ridge.predict(x_test_scaled)

  save_params(model=ridge, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Lasso Regression

In [None]:
def run_lasso():
  """Tune, fit and store a Lasso regression for the current global etf.

  alpha is chosen by run_cross_val; the final model is trained on the
  module-level x_train_scaled / y_train and its parameters, test RMSE and
  fits/predictions are stored.
  """
  model_name='lasso'

  las = Lasso()
  # The grid previously listed 100 twice, which only repeated the same CV fits.
  # NOTE(review): the duplicate may have been meant to be 10 (as in the ridge grid) - confirm.
  parameters = {
      'alpha': [0.01, 1, 100]
  }
  params = run_cross_val(las, parameters)

  las = Lasso(alpha=params.loc[0, 'alpha']).fit(x_train_scaled, y_train)
  y_pred = las.predict(x_test_scaled)
  y_fit = las.predict(x_train_scaled)

  save_params(model=las, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  # NOTE(review): unlike the other linear models, the feature list is not stored
  # via save_selected_cols here - confirm whether that omission is intentional.

## Elastic Net

In [None]:
%%capture
def run_elastic_net():
  """Tune, fit and store an ElasticNet regression for the current global etf.

  l1_ratio and alpha are chosen jointly by run_cross_val; the final model is
  trained on the module-level x_train_scaled / y_train and its parameters,
  test RMSE, fits/predictions and feature list are stored.
  """
  model_name='elastic_net'

  elnet = ElasticNet(max_iter=10000)
  parameters = {
      'l1_ratio': [0.3, 0.5, 0.7],
      'alpha': [0.01, 0.5, 1, 100]
  }
  params = run_cross_val(elnet, parameters)

  # Pass BOTH tuned values. The l1_ratio was selected together with alpha, so
  # refitting with the default alpha would not be the model the search picked.
  elnet = ElasticNet(alpha=params.loc[0, 'alpha'], l1_ratio=params.loc[0, 'l1_ratio'], max_iter=10000)
  elnet.fit(x_train_scaled, y_train.values.ravel())
  y_pred = elnet.predict(x_test_scaled)
  y_fit = elnet.predict(x_train_scaled)

  save_params(model=elnet, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Decision Tree Regressor

In [None]:
%%capture
def run_decision_tree():
  """Tune, fit and store a decision tree regressor for the current global etf.

  max_depth and min_samples_leaf are chosen by run_cross_val; the final tree
  is trained on the module-level x_train_scaled / y_train and its parameters,
  test RMSE, fits/predictions and feature list are stored.
  """
  model_name = 'decision_tree'

  search_space = {
      'max_depth': [2, 5, 10, 50],
      'min_samples_leaf': [5, 10, 20]
  }
  best = run_cross_val(DecisionTreeRegressor(random_state=0), search_space)

  # Each tuned value is read individually so it keeps its own dtype.
  depth = best.loc[0, 'max_depth']
  leaf_size = best.loc[0, 'min_samples_leaf']
  tree = DecisionTreeRegressor(random_state=0, max_depth=depth, min_samples_leaf=leaf_size)
  tree.fit(x_train_scaled, y_train)
  y_fit = tree.predict(x_train_scaled)
  y_pred = tree.predict(x_test_scaled)

  save_params(model=tree, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Random Forest Regressor

In [None]:
%%capture
def run_random_forest():
  """Tune, fit and store a random forest regressor for the current global etf.

  n_estimators, max_depth and min_samples_leaf are chosen by run_cross_val;
  the final forest is trained on the module-level x_train_scaled / y_train
  and its parameters, test RMSE, fits/predictions and feature list are stored.
  """
  model_name = 'random_forest'

  search_space = {
      'n_estimators': [8, 64, 128],
      'max_depth': [2, 5, 10, 50],
      'min_samples_leaf': [5, 10, 20]
  }
  best = run_cross_val(RandomForestRegressor(random_state=0), search_space)

  # Each tuned value is read individually so it keeps its own dtype.
  depth = best.loc[0, 'max_depth']
  leaf_size = best.loc[0, 'min_samples_leaf']
  n_trees = best.loc[0, 'n_estimators']
  forest = RandomForestRegressor(random_state=0, max_depth=depth, min_samples_leaf=leaf_size, n_estimators=n_trees)
  forest.fit(x_train_scaled, y_train.values.ravel())
  y_fit = forest.predict(x_train_scaled)
  y_pred = forest.predict(x_test_scaled)

  save_params(model=forest, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Gradient Boosting Regression

In [None]:
%%capture
def run_gradient_boost():
  """Tune, fit and store a gradient boosting regressor for the current global etf.

  learning_rate, n_estimators, max_depth and min_samples_leaf are chosen by
  run_cross_val; the final model is trained on the module-level
  x_train_scaled / y_train and its parameters, test RMSE, fits/predictions
  and feature list are stored.
  """
  model_name = 'gradient_boost'

  search_space = {
      'learning_rate': [0.025, 0.05],
      'n_estimators': [15, 30, 50],
      'max_depth': [2, 5, 10],
      'min_samples_leaf': [5, 15]
  }
  best = run_cross_val(GradientBoostingRegressor(random_state=0), search_space)

  # Each tuned value is read individually so it keeps its own dtype.
  depth = best.loc[0, 'max_depth']
  leaf_size = best.loc[0, 'min_samples_leaf']
  n_stages = best.loc[0, 'n_estimators']
  rate = best.loc[0, 'learning_rate']
  gboost = GradientBoostingRegressor(random_state=0, max_depth=depth, min_samples_leaf=leaf_size, n_estimators=n_stages, learning_rate=rate)
  gboost.fit(x_train_scaled, y_train.values.ravel())
  y_fit = gboost.predict(x_train_scaled)
  y_pred = gboost.predict(x_test_scaled)

  save_params(model=gboost, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Extreme Gradient Boosting

In [None]:
def run_xgboost():
  """Tune, fit and store an XGBoost regressor for the current global etf.

  learning_rate, n_estimators and max_depth are chosen by run_cross_val; the
  final model is trained on the module-level x_train_scaled / y_train and its
  parameters, test RMSE, fits/predictions and feature list are stored.
  """
  model_name = 'xgboost'

  search_space = {
      'learning_rate': [0.08, 0.1, 0.12],
      'n_estimators': [50, 80, 120],
      'max_depth': [2, 5, 10],
  }
  best = run_cross_val(XGBRegressor(random_state=0, objective = 'reg:squarederror'), search_space)

  # Each tuned value is read individually so it keeps its own dtype.
  rate = best.loc[0, 'learning_rate']
  depth = best.loc[0, 'max_depth']
  n_rounds = best.loc[0, 'n_estimators']
  exgboost = XGBRegressor(random_state=0, objective = 'reg:squarederror', learning_rate=rate, max_depth=depth, n_estimators=n_rounds)
  exgboost.fit(x_train_scaled, y_train.values.ravel())
  y_fit = exgboost.predict(x_train_scaled)
  y_pred = exgboost.predict(x_test_scaled)

  save_params(model=exgboost, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## Support Vector Machines

In [None]:
def run_sv_rbf():
  """Tune, fit and store an RBF-kernel support vector regressor for the current global etf.

  epsilon, gamma and C are chosen by run_cross_val; the final model is
  trained on the module-level x_train_scaled / y_train and its parameters,
  test RMSE, fits/predictions and feature list are stored.
  """
  model_name = 'sv_rbf'

  search_space = {
      'epsilon': [0.01, 0.05, 0.1],
      'gamma': ['scale', 'auto'],
      'C': [0.1, 1, 100]
  }
  best = run_cross_val(SVR(kernel="rbf"), search_space)

  # Each tuned value is read individually so it keeps its own dtype (gamma is a string).
  eps = best.loc[0, 'epsilon']
  kernel_gamma = best.loc[0, 'gamma']
  penalty = best.loc[0, 'C']
  svr = SVR(kernel="rbf", epsilon=eps, gamma=kernel_gamma, C=penalty)
  svr.fit(x_train_scaled, y_train.values.ravel())
  y_fit = svr.predict(x_train_scaled)
  y_pred = svr.predict(x_test_scaled)

  save_params(model=svr, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

##LightGBM

In [None]:
def run_lgbm():
  """Tune, fit and store a LightGBM regressor for the current global etf.

  learning_rate, n_estimators and max_depth are chosen by run_cross_val; the
  final model is trained on the module-level x_train_scaled / y_train and its
  parameters, test RMSE, fits/predictions and feature list are stored.
  """
  model_name = 'lgbm'

  search_space = {
      'learning_rate': [0.05, 0.1, 0.2],
      'n_estimators': [15, 30, 50, 100],
      'max_depth': [-1, 2, 5, 10]
  }
  # The search estimator is built without random_state; only the final model sets it.
  best = run_cross_val(LGBMRegressor(), search_space)

  # Each tuned value is read individually so it keeps its own dtype.
  depth = best.loc[0, 'max_depth']
  n_rounds = best.loc[0, 'n_estimators']
  rate = best.loc[0, 'learning_rate']
  lgbm = LGBMRegressor(random_state=0, max_depth=depth, n_estimators=n_rounds, learning_rate=rate)
  lgbm.fit(x_train_scaled, y_train.values.ravel())
  y_fit = lgbm.predict(x_train_scaled)
  y_pred = lgbm.predict(x_test_scaled)

  save_params(model=lgbm, model_name=model_name, etf=etf)
  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)
  save_selected_cols(x_train_scaled=x_train_scaled, model_name=model_name, etf=etf)

## ARIMA

In [None]:
%%capture
def run_arima():
  """Fit an auto-selected ARIMA model with exogenous features for the current global etf.

  The order is chosen by pm.auto_arima (non-seasonal, stationary, BIC) on the
  module-level y_train with x_train_scaled as exogenous regressors. Test
  predictions are forecasts over the test horizon using x_test_scaled; train
  fits are in-sample predictions. The test RMSE and the fits/predictions are
  stored.
  """
  model_name='arima'
  train_features = x_train_scaled.copy()
  test_features = x_test_scaled.copy()
  train_labels = y_train.copy()
  test_labels = y_test.copy()
  test_periods = len(test_labels)

  arimax = pm.auto_arima(y=train_labels, X=train_features, seasonal=False, stationary=True, information_criterion='bic')
  y_pred = arimax.predict(X=test_features, n_periods=test_periods)
  # In-sample fit of the training period. predict() always forecasts periods
  # AFTER the end of the training sample, so calling it with the training
  # features does not produce fitted values for the training rows.
  y_fit = arimax.predict_in_sample(X=train_features)
  # Normalise both outputs to 2-D numpy arrays for the save helpers.
  y_fit = pd.DataFrame(y_fit).to_numpy()
  y_pred = pd.DataFrame(y_pred).to_numpy()

  save_results(etf=etf, model_name=model_name, y_test=test_labels, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)

##Neural Networks

###Feed Forward NN (Sequential)

In [None]:
def run_ff_nn():
  """Train and store a feed-forward network for the current global etf.

  Builds a Sequential model of 4 Dense(50)+Dropout(0.1) blocks and a single
  output unit, trains it for 120 epochs on the module-level x_train_scaled /
  y_train with a 0.2 validation split, and stores the test RMSE and the
  fits/predictions.
  """
  model_name = 'ff_nn'
  train_features = tf.convert_to_tensor(x_train_scaled)
  test_features = tf.convert_to_tensor(x_test_scaled)
  train_labels = tf.convert_to_tensor(y_train)
  test_labels = tf.convert_to_tensor(y_test)

  def build_and_compile_ff(layers, neurons, dropout, decay, clipping, lr):
    # `layers` hidden Dense+Dropout blocks followed by a one-unit output layer.
    # NOTE(review): the Dense layers are created without an activation argument -
    # confirm that stacking them without a non-linearity is intended.
    model = Sequential()
    for _ in range(layers):
      model.add(Dense(neurons))
      model.add(Dropout(dropout))
    model.add(Dense(1))
    optimizer = tf.keras.optimizers.Adam(weight_decay=decay, clipnorm=clipping, learning_rate=lr)
    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

  feed_forward = build_and_compile_ff(4, 50, 0.1, 0.00001, None, 0.0005)
  feed_forward.fit(
      train_features,
      train_labels,
      validation_split=0.2,
      verbose=0,
      epochs=120
  )

  y_pred = feed_forward.predict(test_features)
  y_fit = feed_forward.predict(train_features)

  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)

###LSTM NN (Recurrent)

In [None]:
def run_lstm_nn():
  """Train and store an LSTM network (no windowing) for the current global etf.

  Each row of the module-level x_train_scaled / x_test_scaled is reshaped to
  (n_features, 1) and fed to a stack of 6 LSTM(180)+Dropout(0.05) blocks, a
  final LSTM(180) and a single output unit. The model is trained for 180
  epochs with a 0.2 validation split and Adam with clipnorm=1; the test RMSE
  and the fits/predictions are stored.
  """
  # Enables numpy-style methods on tensors; .reshape below relies on it.
  from tensorflow.python.ops.numpy_ops import np_config
  np_config.enable_numpy_behavior()

  model_name = 'lstm_nn'
  n_hid_layers = 6
  n_units = 180
  n_epochs = 180
  clipnorm = 1

  # Without windowing: (rows, features) becomes (rows, features, 1).
  train_features = tf.convert_to_tensor(x_train_scaled)
  train_features = train_features.reshape((train_features.shape[0], train_features.shape[1], 1))
  test_features = tf.convert_to_tensor(x_test_scaled)
  test_features = test_features.reshape((test_features.shape[0], test_features.shape[1], 1))
  train_labels = tf.convert_to_tensor(y_train)
  test_labels = tf.convert_to_tensor(y_test)

  lstm = Sequential()
  for _ in range(n_hid_layers):
    # Hidden recurrent layers pass the full sequence on to the next layer.
    lstm.add(LSTM(n_units, return_sequences=True))
    lstm.add(Dropout(0.05))
  # The last recurrent layer returns only its final output.
  lstm.add(LSTM(n_units, return_sequences=False))
  lstm.add(Dense(1))
  lstm.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(clipnorm=clipnorm))
  lstm.fit(
      train_features,
      train_labels,
      validation_split=0.2,
      verbose=0,
      epochs=n_epochs,
  )

  y_pred = lstm.predict(test_features)
  y_fit = lstm.predict(train_features)

  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)
  save_fits_predictions(model_predictions=y_pred, model_fits=y_fit, model_name=model_name, etf=etf)

In [None]:
def run_lstm_windowing_nn(n_hid_layers, n_units, n_epochs):
  """Train an LSTM on sliding windows of past observations and store its test RMSE.

  Every sample is the previous n_past (3) rows of scaled features; the label
  is the return of the row that follows the window. The last n_past training
  rows are prepended to the test features so that every test row gets a full
  window. Only the test RMSE is stored, for the current global etf.

  Args:
    n_hid_layers: number of LSTM+Dropout(0.1) blocks before the final LSTM layer.
    n_units: units per LSTM layer.
    n_epochs: training epochs.
  """
  model_name = 'lstm_nn_w'
  n_future = 1   # Forecast window length
  n_past = 3  # Past window length
  n_features = x_train_scaled.shape[1]

  # Reformat input data into a shape: (n_samples x timesteps x n_features)
  train_steps = range(n_past, len(x_train_scaled) - n_future + 1)
  train_features = np.array([x_train_scaled.iloc[i - n_past:i, 0:n_features] for i in train_steps])
  train_labels = np.array([y_train.iloc[i + n_future - 1:i + n_future, 0] for i in train_steps])

  # Add the periods of the training data that are part of the required window length to the test data
  x_test_lstm = pd.concat([x_train_scaled.iloc[len(x_train_scaled)-n_past:,:], x_test_scaled]).reset_index(drop=True)

  test_steps = range(n_past, len(x_test_lstm) - n_future + 1)
  test_features = np.array([x_test_lstm.iloc[i - n_past:i, 0:x_test_lstm.shape[1]] for i in test_steps])
  test_labels = np.array([y_test.iloc[i - n_past:i - n_past + 1, 0] for i in test_steps])

  train_features = tf.convert_to_tensor(train_features)
  test_features = tf.convert_to_tensor(test_features)
  train_labels = tf.convert_to_tensor(train_labels)
  test_labels = tf.convert_to_tensor(test_labels)

  lstm = Sequential()
  for _ in range(n_hid_layers):
    # Hidden recurrent layers pass the full sequence on to the next layer.
    lstm.add(LSTM(n_units, return_sequences=True))
    lstm.add(Dropout(0.1))
  # The last recurrent layer returns only its final output.
  lstm.add(LSTM(n_units, return_sequences=False))
  lstm.add(Dense(1))
  lstm.compile(loss='mean_squared_error', optimizer='adam')
  lstm.fit(
      train_features,
      train_labels,
      validation_split=0.2,
      verbose=0,
      epochs=n_epochs,
  )

  y_pred = lstm.predict(test_features)
  # The train fits are computed but not stored.
  y_fit = lstm.predict(train_features)

  save_results(etf=etf, model_name=model_name, y_test=y_test, y_pred=y_pred)

#Run Models and Save Results

Logic: For each ETF, check if model results are already stored in the results file. If not, run the models. If yes, move on to next ETF.

In [None]:
def run_model(model, etf):
  """Run the routine that fits and stores the model called `model`.

  Dispatch is by model NAME, so it does not depend on the order of the global
  `models` list: reordering or inserting into that list can no longer route a
  name to the wrong routine.

  Args:
    model: model name, e.g. 'ols' or 'lstm_nn'.
    etf: accepted for call-site symmetry; the run_* routines read the global
      `etf` themselves, so this argument is not forwarded.

  An unknown name only prints a reminder; nothing is run.
  """
  runners = {
      'last_price': run_last_price,
      'ols': run_ols,
      'ridge': run_ridge,
      'lasso': run_lasso,
      'elastic_net': run_elastic_net,
      'decision_tree': run_decision_tree,
      'random_forest': run_random_forest,
      'gradient_boost': run_gradient_boost,
      'xgboost': run_xgboost,
      'sv_rbf': run_sv_rbf,
      'lgbm': run_lgbm,
      'arima': run_arima,
      'ff_nn': run_ff_nn,
      'lstm_nn': run_lstm_nn,
      'geometric_brownian': run_geom_brown,
  }
  runner = runners.get(model)
  if runner is None:
    print('Add the model to the list of models and run_model function list')
  else:
    runner()

In [None]:
# For each ETF, run every model that has no stored result yet (id = etf + model).
for etf in keys:
    # Work out what is still missing BEFORE doing any expensive preparation.
    pending_models = [model for model in models if not (etf + model == results.id).any()]

    # Splitting, scaling and feature selection only matter when at least one model
    # will actually be fitted; skip them for ETFs that are already complete.
    if pending_models:
        split_scale(etfs[etf])
        x_train_scaled, x_test_scaled = featurewiz_selection()

    for model in models:
        print(model)
        if model in pending_models:
            run_model(model, etf)

wiz = FeatureWiz(verbose=1)
        X_train_selected = wiz.fit_transform(X_train, y_train)
        X_test_selected = wiz.transform(X_test)
        wiz.features  ### provides a list of selected features ###            
        
############################################################################################
############       F A S T   F E A T U R E  E N G G    A N D    S E L E C T I O N ! ########
# Be judicious with featurewiz. Don't use it to create too many un-interpretable features! #
############################################################################################
Skipping feature engineering since no feature_engg input...
Skipping category encoding since no category encoders specified in input...
#### Single_Label Regression problem ####
    Loaded train data. Shape = (3132, 84)
#### Single_Label Regression problem ####
No test data filename given...
#######################################################################################
####################