# ML Model pipeline

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

os.chdir("../marketdata")
import yahoo
import alpaca

os.chdir("../technicals")
import technicals

os.chdir("../backtests")
import backtest, optimizers

### Set model seeds

In [None]:
# One seed value reused by every seeded component in this notebook
seed = 42

# Seed NumPy and TensorFlow with it (the two calls are independent)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Ticker symbols handed to the market-data loader
test_tickers = ["AAPL"]
# Fetch the data through the project-local `alpaca` module
# NOTE(review): the following cell selects one ticker with .loc[:, <ticker>],
# so the columns are presumably keyed by ticker on the outer level -- confirm
ohlcv_df = alpaca.ohlcv(test_tickers)
# Technical-indicator features are currently disabled:
# tech_ind = technicals.TechnicalAnalysis(ohlcv_df)
# df = tech_ind.get_all_technicals(test_tickers[0], returns_period=14)

In [None]:
# Keep only the columns of the ticker under test. The ticker is read from
# test_tickers instead of a hard-coded literal so this cell keeps working when
# the ticker list is changed. .copy() gives an independent frame.
ohlcv_df = ohlcv_df.loc[:, test_tickers[0]].copy()

In [None]:
# Display the first rows of the selected ticker's data as a sanity check
ohlcv_df.head()

---
### Set Williams %R params:

In [None]:
# Williams %R search space (production ranges)
wr_period = [14]
wr_upperband = [-5, -10, -20, -30, -40]
wr_lowerband = [-50, -60, -70, -80, -90]

# Echo the ranges so the notebook output records what was searched
print(f"Williams %R Period : {wr_period}")
print(f"Williams %R Upper band : {wr_upperband}")
print(f"Williams %R Lower band : {wr_lowerband}")

---
### Set broker values

In [None]:
# Broker settings: commission and starting cash
broker_comm = 0.005
start_cash = 1_000_000.0

In [None]:
# set risk
# NOTE(review): forwarded as `risk=` to backtest.optimization; presumably the
# fraction of capital committed per trade -- confirm against that function
risk = 0.9

In [None]:
from datetime import datetime, timedelta
import pytz

# Timezone used when taking "now" for the backtest window.
# NOTE: despite the name `IST`, the zone configured here is America/New_York.
IST = pytz.timezone('America/New_York')
# strftime pattern for the date string (four-digit year, month, day)
dateformat = "%Y-%m-%d"

In [None]:
# Current time in the configured timezone
date = datetime.now(IST)
# End of the window, formatted as a string with `dateformat`
end_date = date.strftime(dateformat)
# Start of the window: 365 days before `date`.
# NOTE(review): this stays a timezone-aware datetime while end_date is a str --
# confirm that backtest.optimization accepts both forms
start_date = date - timedelta(days=365)

In [None]:
from optimizers import wr_cross_strat

# Run the Williams %R cross strategy over the parameter ranges and date window
# configured above; the result is bound to `df`, which the following cells
# split and model.
df = backtest.optimization(
    strat=wr_cross_strat,
    ohlcv=ohlcv_df,
    start_cash=start_cash,
    broker_comm=broker_comm,
    risk=risk,
    period=wr_period,
    upperband=wr_upperband,
    lowerband=wr_lowerband,
    start_date=start_date,
    end_date=end_date,
)

In [None]:
# Display the first rows returned by backtest.optimization
df.head()

### Train/test split

In [None]:
# Positional split: the first 80% of the rows are used for training and the
# remaining rows for testing.
split = int(0.8 * len(df.index))

# The end bound of an iloc slice is exclusive, so [:split] and [split:] cover
# every row exactly once. Slicing to `split - 1` would leave row `split - 1`
# in neither set.
df_train = df.iloc[:split]
df_test = df.iloc[split:]

In [None]:
# Display (rows, columns) of the training frame
df_train.shape

In [None]:
# Display (rows, columns) of the test frame
df_test.shape

### Train/Validate split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. The validation frame is
# bound to `df_val` because that is the name the feature-matrix cell reads;
# binding it only to `df_validate` raises NameError there.
df_train, df_val = train_test_split(df_train, train_size=0.8, random_state=seed)

# Backward-compatible alias for any code that still uses the previous name
df_validate = df_val

In [None]:
# Name of the column to predict; the feature-matrix cell treats every other
# column as a feature
target = "lagging_returns"

In [None]:
# For each split: X holds every column except the target, y holds the target.
# np.setdiff1d returns the remaining column names sorted, so the feature order
# is determined by name rather than by position in the frame.

# Training split
X_train = df_train[np.setdiff1d(df_train.columns, [target])].values
y_train = df_train[target].values

# Validation split
X_val = df_val[np.setdiff1d(df_val.columns, [target])].values
y_val = df_val[target].values

# Test split
X_test = df_test[np.setdiff1d(df_test.columns, [target])].values
y_test = df_test[target].values

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, so the validation and test
# splits do not contribute to the scaling statistics
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to all three splits
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

### ML Model Pipeline Hyperparameter Tuning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Candidate classifiers keyed by a short name; the same keys are reused for
# the pipelines and the parameter grids. Every estimator gets the shared seed.
models = {
    'lr': LogisticRegression(class_weight='balanced', random_state=seed),
    'mlpc': MLPClassifier(early_stopping=True, random_state=seed),
    'rfc': RandomForestClassifier(class_weight='balanced', random_state=seed),
    'hgbc': HistGradientBoostingClassifier(random_state=seed),
}

In [None]:
# Wrap every candidate model in a one-step Pipeline

from sklearn.pipeline import Pipeline

pipeline_dict = {}

# The step is named 'model', which is why the grid keys start with 'model__'
for model_name, model in models.items():
    pipeline_dict[model_name] = Pipeline(steps=[('model', model)])

In [None]:
from sklearn.model_selection import PredefinedSplit

# Code source: https://www.kaggle.com/arushik1994/wids-datathon-logistic-regression

def get_train_val_ps(X_train, y_train, X_val, y_val):
    """
    Stack the training and validation data and build the matching
    PredefinedSplit, in which the validation rows form the only held-out fold.

    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    X_val : the feature matrix in the validation data
    y_val : the target vector in the validation data

    Return
    ----------
    The feature matrix in the combined training and validation data
    The target vector in the combined training and validation data
    PredefinedSplit
    """

    n_train, n_val = X_train.shape[0], X_val.shape[0]

    # Training rows first, validation rows underneath
    X_train_val = np.vstack((X_train, X_val))

    # Targets in the same order, flattened to one dimension
    y_train_val = np.concatenate((y_train.reshape(-1), y_val.reshape(-1)))

    # Fold labels: -1 for the training rows, 0 for the validation rows
    fold_labels = np.repeat([-1, 0], [n_train, n_val])

    return X_train_val, y_train_val, PredefinedSplit(fold_labels)

In [None]:
# Used the implementation in pmlm_utilities.ipynb
# Combine the training and validation arrays and get the PredefinedSplit (`ps`)
# that GridSearchCV receives as its `cv` argument further down
X_train_val, y_train_val, ps = get_train_val_ps(X_train, y_train, X_val, y_val)

In [None]:
# Hyperparameter grids keyed by the same short model names used in `models`;
# filled in by the following cells
param_grids = {}

In [None]:
# Logistic Regression Parameters
# Candidate values for tol
tol_grid = [10 ** exponent for exponent in (-5, -4, -3)]

# Candidate values for C
C_grid = [0.1, 1, 10]

# Register the grid; the 'model__' prefix targets the pipeline step 'model'
param_grids['lr'] = [
    {
        'model__tol': tol_grid,
        'model__C': C_grid,
    }
]

In [None]:
# MLP Classifier Parameters
# Candidate values for alpha: 10^-7 ... 10^-3
alpha_grids = [10 ** exponent for exponent in range(-7, -2)]

# Candidate values for learning_rate_init: 8^-4, 8^-3, 8^-2
# NOTE(review): base 8 here while the other grids use base 10 -- confirm intended
learning_rate_init_grids = [8 ** exponent for exponent in range(-4, -1)]

# Register the grid; the 'model__' prefix targets the pipeline step 'model'
param_grids['mlpc'] = [
    {
        'model__alpha': alpha_grids,
        'model__learning_rate_init': learning_rate_init_grids,
    }
]

In [None]:
# Random Forest Classifier Parameters
# Candidate values for min_samples_split and min_samples_leaf
min_samples_split_grids = [2, 20, 200]
min_samples_leaf_grids = [1, 20, 200]

# Register the grid; the 'model__' prefix targets the pipeline step 'model'
param_grids['rfc'] = [
    {
        'model__min_samples_split': min_samples_split_grids,
        'model__min_samples_leaf': min_samples_leaf_grids,
    }
]

In [None]:
# Histogram Based Gradient Boost Parameters
# Candidate values for learning_rate: 10^-4 ... 10^1
learning_rate_grids = [10 ** exponent for exponent in range(-4, 2)]

# Candidate values for min_samples_leaf. This rebinds the name to a new list,
# so a list stored in an earlier grid under the same name is unaffected.
min_samples_leaf_grids = [1, 20, 100]

# Register the grid; the 'model__' prefix targets the pipeline step 'model'
param_grids['hgbc'] = [
    {
        'model__learning_rate': learning_rate_grids,
        'model__min_samples_leaf': min_samples_leaf_grids,
    }
]

In [None]:
from sklearn.model_selection import GridSearchCV

# The list of [best_score_, best_params_, best_estimator_] obtained by GridSearchCV
best_score_params_estimator_gs = []

# Columns of cv_results_ that are moved to the front of every report
# (defined once; the list does not depend on the model)
important_columns = ['rank_test_score',
                     'mean_test_score',
                     'std_test_score',
                     'mean_train_score',
                     'std_train_score',
                     'mean_fit_time',
                     'std_fit_time',
                     'mean_score_time',
                     'std_score_time']

# For each model
for model_name in pipeline_dict:
    # Exhaustive search over this model's grid, scored with macro F1 on the
    # split described by `ps`
    gs = GridSearchCV(estimator=pipeline_dict[model_name],
                      param_grid=param_grids[model_name],
                      scoring='f1_macro',
                      n_jobs=2,
                      cv=ps,
                      return_train_score=True)

    # Fit the pipeline
    gs = gs.fit(X_train_val, y_train_val)

    # Update best_score_params_estimator_gs
    best_score_params_estimator_gs.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

    # Sort cv_results in ascending order of 'rank_test_score' and 'std_test_score'
    cv_results = pd.DataFrame.from_dict(gs.cv_results_).sort_values(by=['rank_test_score', 'std_test_score'])

    # Move the important columns ahead; the rest follow in alphabetical order
    other_columns = sorted(set(cv_results.columns) - set(important_columns))
    cv_results = cv_results[important_columns + other_columns]

    # Write one cv_results file per model. A single fixed file name would be
    # overwritten on every iteration, keeping only the last model's results.
    cv_results.to_csv(f'cv_results_{model_name}.csv', index=False)

# Sort best_score_params_estimator_gs in descending order of the best_score_
best_score_params_estimator_gs = sorted(best_score_params_estimator_gs, key=lambda x: x[0], reverse=True)

# Print best_score_params_estimator_gs
pd.DataFrame(best_score_params_estimator_gs, columns=['best_score', 'best_param', 'best_estimator'])

In [None]:
# Get the best_score, best_params and best_estimator obtained by GridSearchCV
# (index 0 is the top entry because the list was sorted by score, descending)
best_score_gs, best_params_gs, best_estimator_gs = best_score_params_estimator_gs[0]