# ML Model pipeline

In [5]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

os.chdir("../marketdata")
import yahoo
import alpaca

os.chdir("../technicals")
import technicals


### Set model seeds

In [6]:
# The random seed
seed = 42

# Set seeds
tf.random.set_seed(seed)
np.random.seed(seed)

In [7]:
test_tickers = ["ADSK"]
ohlcv_df = alpaca.ohlcv(test_tickers)
tech_ind = technicals.TechnicalAnalysis(ohlcv_df)
df = tech_ind.get_all_technicals(test_tickers[0], returns_period=14)
df['daily_return_bin'] = np.where(df['daily_return'] > 0, 1, 0)
df.drop(columns=["daily_return"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [8]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,rsi,williams,mfi,stoch_k,macd,...,ema_7,ema_14,ema_21,highlow,closeopen,bb_high,bb_low,pvt,cum_daily_return,daily_return_bin
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02 00:00:00-05:00,184.21,187.89,181.88,187.83,1379670,0.0,0.0,0.0,0.0,0.0,...,187.83,187.83,187.83,6.01,3.62,0.0,0.0,0.0,1.0,0
2020-01-03 00:00:00-05:00,184.49,186.41,183.78,184.96,635151,0.0,0.0,0.0,0.0,-0.064391,...,186.19,186.2925,186.326667,2.63,0.47,0.0,0.0,0.0,0.98472,0
2020-01-06 00:00:00-05:00,183.78,187.25,183.16,187.12,642260,5.101559,0.0,0.0,0.0,-0.015065,...,186.592162,186.608608,186.616677,4.09,3.34,0.0,0.0,17205.40508,0.99622,1
2020-01-07 00:00:00-05:00,186.78,188.17,185.1,187.52,750003,6.052509,0.0,0.0,0.0,0.02218,...,186.931486,186.887428,186.875742,3.07,0.74,0.0,0.0,-5897.185512,0.99835,1
2020-01-08 00:00:00-05:00,188.22,190.5,187.03,190.01,1080578,11.938087,0.0,0.0,0.0,0.141034,...,187.940576,187.702103,187.627387,3.47,1.79,0.0,0.0,12745.289664,1.011606,1


### Train/test plit

In [9]:
split = int(0.8 * len(df.index))

df_train = df.iloc[: split - 1]
df_test = df.iloc[split:]

In [10]:
df_train.shape

(305, 23)

In [11]:
df_test.shape

(77, 23)

### Train/Validate split

In [12]:
from sklearn.model_selection import train_test_split
df_train, df_validate = train_test_split(df_train, train_size=0.8, random_state=seed)

In [15]:
target = "daily_return_bin"

In [16]:
# Get the feature matrix
X_train = df_train[np.setdiff1d(df_train.columns, [target])].values
X_val = df_validate[np.setdiff1d(df_validate.columns, [target])].values
X_test = df_test[np.setdiff1d(df_test.columns, [target])].values

# Get the target vector
y_train = df_train[target].values
y_val = df_validate[target].values
y_test = df_test[target].values

In [17]:
from sklearn.preprocessing import StandardScaler

# The StandardScaler
scaler = StandardScaler()

# Standardize the training data
X_train = scaler.fit_transform(X_train)

# Standardize the validation data
X_val = scaler.transform(X_val)

# Standardize the test data
X_test = scaler.transform(X_test)

### ML Model Pipeline Hyperparameter Tuning

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

models = {'lr': LogisticRegression(class_weight='balanced', random_state=seed),
          'mlpc': MLPClassifier(early_stopping=True, random_state=seed),
          'rfc': RandomForestClassifier(class_weight='balanced', random_state=seed),
          'hgbc': HistGradientBoostingClassifier(random_state=seed)}

In [19]:
# Pipeline dictionary

from sklearn.pipeline import Pipeline

pipeline_dict = {}

for model_name, model in models.items():
    pipeline_dict[model_name] = Pipeline([('model', model)])

In [20]:
from sklearn.model_selection import PredefinedSplit

# Code source: https://www.kaggle.com/arushik1994/wids-datathon-logistic-regression

def get_train_val_ps(X_train, y_train, X_val, y_val):
    """
    Get the:
    feature matrix and target velctor in the combined training and validation data
    target vector in the combined training and validation data
    PredefinedSplit
    
    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    X_val : the feature matrix in the validation data
    y_val : the target vector in the validation data  

    Return
    ----------
    The feature matrix in the combined training and validation data
    The target vector in the combined training and validation data
    PredefinedSplit
    """  

    # Combine the feature matrix in the training and validation data
    X_train_val = np.vstack((X_train, X_val))

    # Combine the target vector in the training and validation data
    y_train_val = np.vstack((y_train.reshape(-1, 1), y_val.reshape(-1, 1))).reshape(-1)

    # Get the indices of training and validation data
    train_val_idxs = np.append(np.full(X_train.shape[0], -1), np.full(X_val.shape[0], 0))

    # The PredefinedSplit
    ps = PredefinedSplit(train_val_idxs)

    return X_train_val, y_train_val, ps

In [21]:
# Used the implementation in pmlm_utilities.ipynb
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

In [22]:
param_grids = {}

In [23]:
# Logistic Regression Parameters
# The parameter grid of tol
tol_grid = [10 ** -5, 10 ** -4, 10 ** -3]

# The parameter grid of C
C_grid = [0.1, 1, 10]

# Update param_grids
param_grids['lr'] = [{'model__tol': tol_grid,
                      'model__C': C_grid}]

In [24]:
# MPL Classifier Parameters
#The grids for alpha
alpha_grids = [10 ** i for i in range(-7, -2)]

# The grids for learning_rate_init
learning_rate_init_grids = [8 ** i for i in range(-4, -1)]

# Update param_grids
param_grids['mlpc'] = [{'model__alpha': alpha_grids,
                        'model__learning_rate_init': learning_rate_init_grids}]

In [25]:
# Random Forest Classifier Parameters
# The grids for min_samples_split
min_samples_split_grids = [2, 20, 200]

# The grids for min_samples_leaf
min_samples_leaf_grids = [1, 20, 200]

# Update param_grids
param_grids['rfc'] = [{'model__min_samples_split': min_samples_split_grids,
                       'model__min_samples_leaf': min_samples_leaf_grids}]

In [26]:
# Histogram Based Gradient Boost Parameters
# The grids for learning_rate
learning_rate_grids = [10 ** i for i in range(-4, 2)]

# The grids for min_samples_leaf
min_samples_leaf_grids = [1, 20, 100]

# Update param_grids
param_grids['hgbc'] = [{'model__learning_rate': learning_rate_grids,
                        'model__min_samples_leaf': min_samples_leaf_grids}]

In [30]:
from sklearn.model_selection import GridSearchCV

model_dict = {}

for model_name in pipeline_dict.keys():

    grid_model = GridSearchCV(estimator=pipeline_dict[model_name],
                      param_grid=param_grids[model_name],
                      scoring='f1_macro',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True)
        
    # Fit the pipeline
    model_fit = grid_model.fit(X_train_val, y_train_val)
    
    # Update best_score_params_estimator_gs
    model_dict[model_name] = {'report' : [grid_model.best_score_, grid_model.best_params_, grid_model.best_estimator_],
                              'model' : model_fit}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'lr': {'report': [0.8983333333333333, {'model__C': 10, 'model__tol': 1e-05}, Pipeline(steps=[('model',
                 LogisticRegression(C=10, class_weight='balanced',
                                    random_state=42, tol=1e-05))])], 'model': GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=Pipeline(steps=[('model',
                                        LogisticRegression(class_weight='balanced',
                                                           random_state=42))]),
             n_jobs=2,
             param_grid=[{'model__C': [0.1, 1, 10],
                          'model__tol': [1e-05, 0.0001, 0.001]}],
             return_train_score=True, scoring='f1_macro')}, 'mlpc': {'report': [0.8498769483182936, {'model__alpha': 1e-07, 'model__learning_rate_init': 0.015625}, Pipeline(steps=[('model',
                 MLPClassifier(alpha=1e-07, early_stopping=True,
                               learning_rate_init=0.015625, random_st