# ML Model pipeline

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

os.chdir("../marketdata")
import alpaca

os.chdir("../technicals")
import technicals


### Set model seeds

In [2]:
# The random seed
seed = 42

# Lookback
lookback = 14

# Set seeds
tf.random.set_seed(seed)
np.random.seed(seed)


In [3]:
test_tickers = ["ADSK"]
ohlcv_df = alpaca.ohlcv(test_tickers)
tech_ind = technicals.TechnicalAnalysis(ohlcv_df)
df = tech_ind.get_all_technicals(test_tickers[0], returns_period=14)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 384 entries, 2020-01-02 00:00:00-05:00 to 2021-07-12 00:00:00-04:00
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   open              384 non-null    float64
 1   high              384 non-null    float64
 2   low               384 non-null    float64
 3   close             384 non-null    float64
 4   volume            384 non-null    int64  
 5   rsi               384 non-null    float64
 6   williams          384 non-null    float64
 7   mfi               384 non-null    float64
 8   stoch_k           384 non-null    float64
 9   macd              384 non-null    float64
 10  ma_10             384 non-null    float64
 11  ma_50             384 non-null    float64
 12  ma_200            384 non-null    float64
 13  ema_7             384 non-null    float64
 14  ema_14            384 non-null    float64
 15  ema_21            384 non-null    float64


In [4]:
df.head(14)

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00-05:00,184.21,187.89,181.88,187.83,1379670,0.0,0.0,0.0,0.0,0.0,...,187.83,187.83,187.83,6.01,3.62,0.0,0.0,0.0,0.0,1.0
2020-01-03 00:00:00-05:00,184.49,186.41,183.78,184.96,635151,0.0,0.0,0.0,0.0,-0.064391,...,186.19,186.2925,186.326667,2.63,0.47,0.0,0.0,0.0,-0.01528,0.98472
2020-01-06 00:00:00-05:00,183.78,187.25,183.16,187.12,642260,5.101559,0.0,0.0,0.0,-0.015065,...,186.592162,186.608608,186.616677,4.09,3.34,0.0,0.0,17205.40508,0.011678,0.99622
2020-01-07 00:00:00-05:00,186.78,188.17,185.1,187.52,750003,6.052509,0.0,0.0,0.0,0.02218,...,186.931486,186.887428,186.875742,3.07,0.74,0.0,0.0,-5897.185512,0.002138,0.99835
2020-01-08 00:00:00-05:00,188.22,190.5,187.03,190.01,1080578,11.938087,0.0,0.0,0.0,0.141034,...,187.940576,187.702103,187.627387,3.47,1.79,0.0,0.0,12745.289664,0.013279,1.011606
2020-01-09 00:00:00-05:00,191.71,192.7,190.68,191.94,952496,16.293071,0.0,0.0,0.0,0.292359,...,189.156914,188.682676,188.527576,2.02,0.23,0.0,0.0,-4673.700426,0.010157,1.021881
2020-01-10 00:00:00-05:00,193.16,193.42,191.835,192.43,629895,17.404185,0.0,0.0,0.0,0.406567,...,190.101238,189.472317,189.256284,1.585,-0.73,0.0,0.0,-8066.797857,0.002553,1.02449
2020-01-13 00:00:00-05:00,190.0,193.37,189.97,192.44,1336587,17.428151,0.0,0.0,0.0,0.477085,...,190.750975,190.052752,189.798801,3.4,2.44,0.0,0.0,-1538.588697,5.2e-05,1.024543
2020-01-14 00:00:00-05:00,192.25,193.13,190.63,191.1,932926,16.731152,0.0,0.0,0.0,0.452177,...,190.845315,190.245575,190.004202,2.5,-1.15,0.0,0.0,-6565.617359,-0.006963,1.017409
2020-01-15 00:00:00-05:00,190.94,192.17,190.01,190.76,1282589,16.551204,0.0,0.0,0.0,0.410415,...,190.822713,190.335714,190.116023,2.16,-0.18,0.0,0.0,4214.211025,-0.001779,1.015599


In [5]:
df['daily_return'] = df['daily_return'].shift(-lookback).fillna(method='ffill')
df.head(14)

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00-05:00,184.21,187.89,181.88,187.83,1379670,0.0,0.0,0.0,0.0,0.0,...,187.83,187.83,187.83,6.01,3.62,0.0,0.0,0.0,0.01982,1.0
2020-01-03 00:00:00-05:00,184.49,186.41,183.78,184.96,635151,0.0,0.0,0.0,0.0,-0.064391,...,186.19,186.2925,186.326667,2.63,0.47,0.0,0.0,0.0,0.002971,0.98472
2020-01-06 00:00:00-05:00,183.78,187.25,183.16,187.12,642260,5.101559,0.0,0.0,0.0,-0.015065,...,186.592162,186.608608,186.616677,4.09,3.34,0.0,0.0,17205.40508,-0.016767,0.99622
2020-01-07 00:00:00-05:00,186.78,188.17,185.1,187.52,750003,6.052509,0.0,0.0,0.0,0.02218,...,186.931486,186.887428,186.875742,3.07,0.74,0.0,0.0,-5897.185512,0.020933,0.99835
2020-01-08 00:00:00-05:00,188.22,190.5,187.03,190.01,1080578,11.938087,0.0,0.0,0.0,0.141034,...,187.940576,187.702103,187.627387,3.47,1.79,0.0,0.0,12745.289664,-0.0013,1.011606
2020-01-09 00:00:00-05:00,191.71,192.7,190.68,191.94,952496,16.293071,0.0,0.0,0.0,0.292359,...,189.156914,188.682676,188.527576,2.02,0.23,0.0,0.0,-4673.700426,-0.003505,1.021881
2020-01-10 00:00:00-05:00,193.16,193.42,191.835,192.43,629895,17.404185,0.0,0.0,0.0,0.406567,...,190.101238,189.472317,189.256284,1.585,-0.73,0.0,0.0,-8066.797857,-0.010804,1.02449
2020-01-13 00:00:00-05:00,190.0,193.37,189.97,192.44,1336587,17.428151,0.0,0.0,0.0,0.477085,...,190.750975,190.052752,189.798801,3.4,2.44,0.0,0.0,-1538.588697,0.012751,1.024543
2020-01-14 00:00:00-05:00,192.25,193.13,190.63,191.1,932926,16.731152,0.0,0.0,0.0,0.452177,...,190.845315,190.245575,190.004202,2.5,-1.15,0.0,0.0,-6565.617359,0.023024,1.017409
2020-01-15 00:00:00-05:00,190.94,192.17,190.01,190.76,1282589,16.551204,0.0,0.0,0.0,0.410415,...,190.822713,190.335714,190.116023,2.16,-0.18,0.0,0.0,4214.211025,-0.009267,1.015599


In [6]:
df.tail(8)

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-30 00:00:00-04:00,296.67,296.8702,290.93,291.89,765928,57.798307,-23.15724,71.538834,41.297456,2.897262,...,289.993667,286.483805,284.903388,5.9402,-4.78,295.430682,268.745508,-30034.493659,-0.009108,1.554012
2021-07-01 00:00:00-04:00,290.87,295.22,290.61,293.69,613623,59.265211,-16.487955,64.286794,44.399203,3.198737,...,290.91775,287.444631,285.702171,4.61,2.82,296.965163,268.550075,19009.491406,-0.009108,1.563595
2021-07-02 00:00:00-04:00,295.46,298.31,295.03,297.73,477879,62.407493,-2.135541,64.443181,54.191472,3.720762,...,292.620813,288.816014,286.79561,3.28,2.27,298.957486,268.773942,2789.671084,-0.009108,1.585104
2021-07-06 00:00:00-04:00,298.19,301.0,294.12,295.61,1196378,59.813341,-18.057314,64.142621,37.862235,3.918237,...,293.368109,289.721879,287.596918,6.88,-2.58,300.320783,268.442074,-15092.568047,-0.009108,1.573817
2021-07-07 00:00:00-04:00,297.89,299.205,293.46,296.1,859874,60.222805,-16.41574,63.79646,34.189034,4.067389,...,294.051082,290.572295,288.369926,5.745,-1.79,301.710911,268.700518,9944.181957,-0.009108,1.576425
2021-07-08 00:00:00-04:00,292.3,296.13,290.04,294.31,1062354,57.913229,-25.740669,56.731872,27.187297,3.995103,...,294.115812,291.070656,288.909932,6.09,2.01,302.6272,269.274705,-7847.518831,-0.009108,1.566896
2021-07-09 00:00:00-04:00,294.03,296.96,290.49,295.36,787164,58.903771,-22.440794,56.903683,27.89289,3.976701,...,294.426859,291.642568,289.496302,6.47,1.33,303.322548,270.483166,9230.539637,-0.009108,1.572486
2021-07-12 00:00:00-04:00,297.35,298.7,292.385,292.67,198360,55.329324,-45.469432,57.04818,17.142038,3.702378,...,293.987644,291.779559,289.78482,6.315,-4.68,303.636438,271.614991,-4614.908436,-0.009108,1.558164


In [7]:
df = df.iloc[:-7]

In [8]:
df.tail(8)

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-21 00:00:00-04:00,278.54,283.5,275.8672,283.23,1014091,53.096922,-14.810218,57.274099,56.553824,-1.158335,...,278.775668,278.999582,279.599742,7.6328,4.69,291.367623,270.585235,11686.803626,-0.009108,1.507906
2021-06-22 00:00:00-04:00,282.97,286.41,282.7873,286.09,719142,55.719089,-2.097068,64.516569,69.216633,-0.495325,...,280.604251,279.944971,280.189765,3.6227,3.12,291.811208,270.558792,-11481.91962,-0.009108,1.523133
2021-06-23 00:00:00-04:00,286.93,286.93,282.68,283.68,794550,53.041822,-20.596474,64.360069,46.692252,-0.16248,...,281.373188,280.442975,280.50706,4.25,-3.25,291.304454,270.703641,-13954.980012,-0.009108,1.510302
2021-06-24 00:00:00-04:00,286.71,289.1,285.88,286.95,703947,56.1078,-11.978116,64.408488,55.277455,0.361003,...,282.767391,281.310578,281.092781,3.22,0.24,290.822573,270.945522,14807.675942,-0.009108,1.527711
2021-06-25 00:00:00-04:00,287.99,289.895,285.6018,288.78,746415,57.761412,-5.948443,71.610103,59.078902,0.913008,...,284.270543,282.306501,281.791619,4.2932,0.79,289.875859,271.536998,-3354.248078,-0.009108,1.537454
2021-06-28 00:00:00-04:00,291.2,295.0,289.27,293.37,840218,61.644675,-6.834553,71.699842,62.379766,1.70124,...,286.545408,283.781634,282.844199,5.73,2.17,291.390448,270.626694,8594.604902,-0.009108,1.561891
2021-06-29 00:00:00-04:00,292.0,298.14,290.13,297.81,978497,64.981395,-1.222702,78.837959,67.623321,2.653601,...,289.361556,285.652083,284.204727,8.01,5.81,294.136781,269.004171,1454.230169,-0.009108,1.585529
2021-06-30 00:00:00-04:00,296.67,296.8702,290.93,291.89,765928,57.798307,-23.15724,71.538834,41.297456,2.897262,...,289.993667,286.483805,284.903388,5.9402,-4.78,295.430682,268.745508,-30034.493659,-0.009108,1.554012


In [9]:
df['daily_return_bin'] = np.where(df['daily_return'] > 0, 1, 0)
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return,daily_return_bin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00-05:00,184.21,187.89,181.88,187.83,1379670,0.0,0.0,0.0,0.0,0.0,...,187.83,187.83,6.01,3.62,0.0,0.0,0.0,0.01982,1.0,1
2020-01-03 00:00:00-05:00,184.49,186.41,183.78,184.96,635151,0.0,0.0,0.0,0.0,-0.064391,...,186.2925,186.326667,2.63,0.47,0.0,0.0,0.0,0.002971,0.98472,1
2020-01-06 00:00:00-05:00,183.78,187.25,183.16,187.12,642260,5.101559,0.0,0.0,0.0,-0.015065,...,186.608608,186.616677,4.09,3.34,0.0,0.0,17205.40508,-0.016767,0.99622,0
2020-01-07 00:00:00-05:00,186.78,188.17,185.1,187.52,750003,6.052509,0.0,0.0,0.0,0.02218,...,186.887428,186.875742,3.07,0.74,0.0,0.0,-5897.185512,0.020933,0.99835,1
2020-01-08 00:00:00-05:00,188.22,190.5,187.03,190.01,1080578,11.938087,0.0,0.0,0.0,0.141034,...,187.702103,187.627387,3.47,1.79,0.0,0.0,12745.289664,-0.0013,1.011606,0


In [10]:
df.drop(columns=['daily_return','cum_daily_return','open','high','low','close','volume'], inplace=True)

In [11]:
df.head()

Unnamed: 0_level_0,rsi,williams,mfi,stoch_k,macd,ma_10,ma_50,ma_200,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return_bin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-01-02 00:00:00-05:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187.83,187.83,187.83,6.01,3.62,0.0,0.0,0.0,1
2020-01-03 00:00:00-05:00,0.0,0.0,0.0,0.0,-0.064391,0.0,0.0,0.0,186.19,186.2925,186.326667,2.63,0.47,0.0,0.0,0.0,1
2020-01-06 00:00:00-05:00,5.101559,0.0,0.0,0.0,-0.015065,0.0,0.0,0.0,186.592162,186.608608,186.616677,4.09,3.34,0.0,0.0,17205.40508,0
2020-01-07 00:00:00-05:00,6.052509,0.0,0.0,0.0,0.02218,0.0,0.0,0.0,186.931486,186.887428,186.875742,3.07,0.74,0.0,0.0,-5897.185512,1
2020-01-08 00:00:00-05:00,11.938087,0.0,0.0,0.0,0.141034,0.0,0.0,0.0,187.940576,187.702103,187.627387,3.47,1.79,0.0,0.0,12745.289664,0


### Train/test plit

In [12]:
split = int(0.8 * len(df.index))

df_train = df.iloc[: split - 1]
df_test = df.iloc[split:]

In [13]:
df_train.shape

(300, 17)

In [14]:
df_test.shape

(76, 17)

### Train/Validate split

In [15]:
from sklearn.model_selection import train_test_split
df_train, df_validate = train_test_split(df_train, train_size=0.8, random_state=seed)

In [16]:
target = "daily_return_bin"

In [17]:
# Get the feature matrix
X_train = df_train[np.setdiff1d(df_train.columns, [target])].values
X_val = df_validate[np.setdiff1d(df_validate.columns, [target])].values
X_test = df_test[np.setdiff1d(df_test.columns, [target])].values

# Get the target vector
y_train = df_train[target].values
y_val = df_validate[target].values
y_test = df_test[target].values

In [18]:
from sklearn.preprocessing import StandardScaler

# The StandardScaler
scaler = StandardScaler()

# Standardize the training data
X_train = scaler.fit_transform(X_train)

# Standardize the validation data
X_val = scaler.transform(X_val)

# Standardize the test data
X_test = scaler.transform(X_test)

### ML Model Pipeline Hyperparameter Tuning

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

models = {'lr': LogisticRegression(class_weight='balanced', random_state=seed),
          'mlpc': MLPClassifier(early_stopping=True, random_state=seed),
          'rfc': RandomForestClassifier(class_weight='balanced', random_state=seed),
          'hgbc': HistGradientBoostingClassifier(random_state=seed)}

In [20]:
# Pipeline dictionary

from sklearn.pipeline import Pipeline

pipeline_dict = {}

for model_name, model in models.items():
    pipeline_dict[model_name] = Pipeline([('model', model)])

In [21]:
from sklearn.model_selection import PredefinedSplit

# Code source: https://www.kaggle.com/arushik1994/wids-datathon-logistic-regression

def get_train_val_ps(X_train, y_train, X_val, y_val):
    """
    Get the:
    feature matrix and target velctor in the combined training and validation data
    target vector in the combined training and validation data
    PredefinedSplit
    
    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    X_val : the feature matrix in the validation data
    y_val : the target vector in the validation data  

    Return
    ----------
    The feature matrix in the combined training and validation data
    The target vector in the combined training and validation data
    PredefinedSplit
    """  

    # Combine the feature matrix in the training and validation data
    X_train_val = np.vstack((X_train, X_val))

    # Combine the target vector in the training and validation data
    y_train_val = np.vstack((y_train.reshape(-1, 1), y_val.reshape(-1, 1))).reshape(-1)

    # Get the indices of training and validation data
    train_val_idxs = np.append(np.full(X_train.shape[0], -1), np.full(X_val.shape[0], 0))

    # The PredefinedSplit
    ps = PredefinedSplit(train_val_idxs)

    return X_train_val, y_train_val, ps

In [22]:
# Used the implementation in pmlm_utilities.ipynb
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

In [23]:
param_grids = {}

In [24]:
# Logistic Regression Parameters
# The parameter grid of tol
tol_grid = [10 ** -5, 10 ** -4, 10 ** -3]

# The parameter grid of C
C_grid = [0.1, 1, 10]

# Update param_grids
param_grids['lr'] = [{'model__tol': tol_grid,
                      'model__C': C_grid}]

In [25]:
# MPL Classifier Parameters
#The grids for alpha
alpha_grids = [10 ** i for i in range(-7, -2)]

# The grids for learning_rate_init
learning_rate_init_grids = [8 ** i for i in range(-4, -1)]

# Update param_grids
param_grids['mlpc'] = [{'model__alpha': alpha_grids,
                        'model__learning_rate_init': learning_rate_init_grids}]

In [26]:
# Random Forest Classifier Parameters
# The grids for min_samples_split
min_samples_split_grids = [2, 20, 200]

# The grids for min_samples_leaf
min_samples_leaf_grids = [1, 20, 200]

# Update param_grids
param_grids['rfc'] = [{'model__min_samples_split': min_samples_split_grids,
                       'model__min_samples_leaf': min_samples_leaf_grids}]

In [27]:
# Histogram Based Gradient Boost Parameters
# The grids for learning_rate
learning_rate_grids = [10 ** i for i in range(-4, 2)]

# The grids for min_samples_leaf
min_samples_leaf_grids = [1, 20, 100]

# Update param_grids
param_grids['hgbc'] = [{'model__learning_rate': learning_rate_grids,
                        'model__min_samples_leaf': min_samples_leaf_grids}]

In [28]:
from sklearn.model_selection import GridSearchCV

model_dict = {}

for model_name in pipeline_dict.keys():

    grid = GridSearchCV(estimator=pipeline_dict[model_name],
                      param_grid=param_grids[model_name],
                      scoring='f1_macro',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True)
        
    # Fit the pipeline
    model_fit = grid.fit(X_train_val, y_train_val)
    
    # Update best_score_params_estimator_gs
    model_dict[model_name] = {'model_score' : grid.best_score_,
                              'model' : model_fit,
                              'params' : grid.best_params_, 
                              'estimator' : grid.best_estimator_}

In [29]:
model_params = {}
for key in model_dict:
    model_params[key] = model_dict[key]['model_score']
    
model_params 

{'lr': 0.5832175604334537,
 'mlpc': 0.53125,
 'rfc': 0.4831897749374826,
 'hgbc': 0.5000000000000001}

In [30]:
model_params = pd.DataFrame(model_params, index=['score']).transpose()

In [31]:
model_params

Unnamed: 0,score
lr,0.583218
mlpc,0.53125
rfc,0.48319
hgbc,0.5


In [32]:
best_model = model_dict[str(model_params.nlargest(1, columns='score').index[0])]

In [33]:
model = best_model['estimator']

### Load production test data

In [34]:
# set market data date range 
from datetime import date, datetime, timedelta

test_end_date  = datetime.now()
test_start_date  = (test_end_date - timedelta(days=lookback))

test_start_date = test_start_date.strftime('%Y-%m-%d')
test_end_date = test_end_date.strftime('%Y-%m-%d')

print(f"Start date : {test_start_date}")
print(f"End date : {test_end_date}")

Start date : 2021-06-28
End date : 2021-07-12


In [35]:
# load the dataset
test_tickers = ["ADSK"]
test_ohlcv_df = alpaca.ohlcv(test_tickers, start_date=test_start_date, end_date=test_end_date)
test_tech_ind = technicals.TechnicalAnalysis(test_ohlcv_df)

test_df = test_tech_ind.get_all_technicals(test_tickers[0])
test_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-28 00:00:00-04:00,291.2,295.0,289.27,293.37,840218,0.0,0.0,0.0,0.0,0.0,...,293.37,293.37,293.37,5.73,2.17,0.0,0.0,0.0,0.0,1.0
2021-06-29 00:00:00-04:00,292.0,298.14,290.13,297.81,978497,100.0,0.0,0.0,0.0,0.099615,...,295.907143,295.748571,295.695714,8.01,5.81,0.0,0.0,0.0,0.015134,1.015134
2021-06-30 00:00:00-04:00,296.67,296.8702,290.93,291.89,765928,91.304348,0.0,0.0,0.0,-0.056649,...,294.17,294.274584,294.304502,5.9402,-4.78,0.0,0.0,-30034.493659,-0.019878,0.994955
2021-07-01 00:00:00-04:00,290.87,295.22,290.61,293.69,613623,91.543893,0.0,0.0,0.0,-0.061345,...,293.994457,294.095743,294.128268,4.61,2.82,0.0,0.0,19009.491406,0.006167,1.001091
2021-07-02 00:00:00-04:00,295.46,298.31,295.03,297.73,477879,92.069268,0.0,0.0,0.0,0.098156,...,295.218912,295.043915,294.99202,3.28,2.27,0.0,0.0,2789.671084,0.013756,1.014862


In [36]:
test_df.head(7)

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-28 00:00:00-04:00,291.2,295.0,289.27,293.37,840218,0.0,0.0,0.0,0.0,0.0,...,293.37,293.37,293.37,5.73,2.17,0.0,0.0,0.0,0.0,1.0
2021-06-29 00:00:00-04:00,292.0,298.14,290.13,297.81,978497,100.0,0.0,0.0,0.0,0.099615,...,295.907143,295.748571,295.695714,8.01,5.81,0.0,0.0,0.0,0.015134,1.015134
2021-06-30 00:00:00-04:00,296.67,296.8702,290.93,291.89,765928,91.304348,0.0,0.0,0.0,-0.056649,...,294.17,294.274584,294.304502,5.9402,-4.78,0.0,0.0,-30034.493659,-0.019878,0.994955
2021-07-01 00:00:00-04:00,290.87,295.22,290.61,293.69,613623,91.543893,0.0,0.0,0.0,-0.061345,...,293.994457,294.095743,294.128268,4.61,2.82,0.0,0.0,19009.491406,0.006167,1.001091
2021-07-02 00:00:00-04:00,295.46,298.31,295.03,297.73,477879,92.069268,0.0,0.0,0.0,0.098156,...,295.218912,295.043915,294.99202,3.28,2.27,0.0,0.0,2789.671084,0.013756,1.014862
2021-07-06 00:00:00-04:00,298.19,301.0,294.12,295.61,1196378,88.961703,0.0,0.0,0.0,0.101541,...,295.337853,295.174897,295.121014,6.88,-2.58,0.0,0.0,-15092.568047,-0.007121,1.007635
2021-07-07 00:00:00-04:00,297.89,299.205,293.46,296.1,859874,89.053202,0.0,0.0,0.0,0.12416,...,295.557741,295.369836,295.303822,5.745,-1.79,0.0,0.0,9944.181957,0.001658,1.009306


In [37]:
test_df['daily_return_bin'] = np.where(test_df['daily_return'] > 0, 1, 0)
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return,cum_daily_return,daily_return_bin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-28 00:00:00-04:00,291.2,295.0,289.27,293.37,840218,0.0,0.0,0.0,0.0,0.0,...,293.37,293.37,5.73,2.17,0.0,0.0,0.0,0.0,1.0,0
2021-06-29 00:00:00-04:00,292.0,298.14,290.13,297.81,978497,100.0,0.0,0.0,0.0,0.099615,...,295.748571,295.695714,8.01,5.81,0.0,0.0,0.0,0.015134,1.015134,1
2021-06-30 00:00:00-04:00,296.67,296.8702,290.93,291.89,765928,91.304348,0.0,0.0,0.0,-0.056649,...,294.274584,294.304502,5.9402,-4.78,0.0,0.0,-30034.493659,-0.019878,0.994955,0
2021-07-01 00:00:00-04:00,290.87,295.22,290.61,293.69,613623,91.543893,0.0,0.0,0.0,-0.061345,...,294.095743,294.128268,4.61,2.82,0.0,0.0,19009.491406,0.006167,1.001091,1
2021-07-02 00:00:00-04:00,295.46,298.31,295.03,297.73,477879,92.069268,0.0,0.0,0.0,0.098156,...,295.043915,294.99202,3.28,2.27,0.0,0.0,2789.671084,0.013756,1.014862,1


In [38]:
test_df.drop(columns=['daily_return','cum_daily_return','open','high','low','close','volume'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [46]:
test_df.head(14)

Unnamed: 0_level_0,rsi,williams,mfi,stoch_k,macd,ma_10,ma_50,ma_200,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,daily_return_bin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-06-28 00:00:00-04:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,293.37,293.37,293.37,5.73,2.17,0.0,0.0,0.0,0
2021-06-29 00:00:00-04:00,100.0,0.0,0.0,0.0,0.099615,0.0,0.0,0.0,295.907143,295.748571,295.695714,8.01,5.81,0.0,0.0,0.0,1
2021-06-30 00:00:00-04:00,91.304348,0.0,0.0,0.0,-0.056649,0.0,0.0,0.0,294.17,294.274584,294.304502,5.9402,-4.78,0.0,0.0,-30034.493659,0
2021-07-01 00:00:00-04:00,91.543893,0.0,0.0,0.0,-0.061345,0.0,0.0,0.0,293.994457,294.095743,294.128268,4.61,2.82,0.0,0.0,19009.491406,1
2021-07-02 00:00:00-04:00,92.069268,0.0,0.0,0.0,0.098156,0.0,0.0,0.0,295.218912,295.043915,294.99202,3.28,2.27,0.0,0.0,2789.671084,1
2021-07-06 00:00:00-04:00,88.961703,0.0,0.0,0.0,0.101541,0.0,0.0,0.0,295.337853,295.174897,295.121014,6.88,-2.58,0.0,0.0,-15092.568047,0
2021-07-07 00:00:00-04:00,89.053202,0.0,0.0,0.0,0.12416,0.0,0.0,0.0,295.557741,295.369836,295.303822,5.745,-1.79,0.0,0.0,9944.181957,1
2021-07-08 00:00:00-04:00,86.254755,0.0,0.0,0.0,0.052198,0.0,0.0,0.0,295.211103,295.162548,295.134471,6.09,2.01,0.0,0.0,-7847.518831,0
2021-07-09 00:00:00-04:00,86.520966,0.0,0.0,0.0,0.053666,0.0,0.0,0.0,295.251349,295.198903,295.170072,6.47,1.33,0.0,0.0,9230.539637,1
2021-07-12 00:00:00-04:00,82.153536,0.0,0.0,0.0,-0.084633,0.0,0.0,0.0,294.567502,294.75578,294.800185,6.315,-4.68,0.0,0.0,-4614.908436,0


### Data processing

In [40]:
# Get the feature matrix
X_test = test_df[np.setdiff1d(test_df.columns, [target])].values

# Get the target vector
y_test = test_df[target].values

In [41]:
from sklearn.preprocessing import StandardScaler

# The StandardScaler
scaler = StandardScaler()

# Standardize the training data
X_train = scaler.fit_transform(X_train)

# Standardize the validation data
X_val = scaler.transform(X_val)

# Standardize the test data
X_test = scaler.transform(X_test)

In [42]:
predictions = model.predict(X_test)

In [47]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a DataFrame
results = pd.DataFrame(y_test)
results["Predicted Value"] = predictions

# Display sample data
results['Actual values'] = test_df['daily_return_bin']

In [48]:
results

Unnamed: 0,0,Predicted Value,Actual values
0,0,0,
1,1,0,
2,0,0,
3,1,1,
4,1,0,
5,0,0,
6,1,1,
7,0,0,
8,1,1,
9,0,0,
