# Running Predictions w/ Trained models and packaging in CSVs for Testing.


`-- Leo Lonzarich, 28 May 2023 // [Revised 2-6-23] --`

In [85]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam

from pandas_datareader import data as pdr
import yfinance as yfin
import datetime as dt
import pickle
import tqdm
import os

# Yahoo API may have broken previous versions of pd_datareader,
# so this is a workaround.
yfin.pdr_override()
pd.options.mode.chained_assignment = None  # default='warn'

In [39]:
# Yahoo Finance stock scraping.
# **Careful with how many times you run this to avoid IP ban**

STOCKS = ['SPY', 'AAPL', 'PG', 'PFE', 'LMT', 'XOM'] # Stocks to predict.
START = dt.datetime(2012, 1, 1) # First day of model's training window
END = dt.datetime(2023, 1, 15) # Last day of training window
NUMDAYS = 40 # The number of days we want our model to predict.

TICKER = STOCKS[1] # We run one stock at a time.

stock = pdr.get_data_yahoo(TICKER, START, END).rename(columns= {'Adj Close': 'AdjClose'})
print(stock.shape[0], "unique points loaded with attributes: \n", stock.
      keys())
stock = stock.reset_index()

[*********************100%***********************]  1 of 1 completed
2777 unique points loaded with attributes: 
 Index(['Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume'], dtype='object')


In [40]:
def series_to_supervised(data, n_in=5, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
    data: Sequence of observations as a list or NumPy array.
    n_in: Number of lag observations as input (X).
    n_out: Number of observations as output (y).
    dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
    Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    if i == 0:
        names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
    else:
        names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
        
    return agg


def data_split(data, lag=60, days=1, train_ratio=0.70,
               validation=False, backtest=False):
    """
    Prepping stock data for neural net; scaling down 
    values and making train-test split.
    data: DataFrame, all stock data.
    lag: int, number of days used for prediction.
    days: int, number of days to predict.
    train_ratio: float, percentage of data for training.
    validation: bool, split data into train/valid/test when True.
    backtest: bool, only performs x-y split when True.
    Returns
        X_train: array, independent training features.
        y_train: array, objective training feature.
        X_test: array, independent test features.
        y_test: array, objective test feature.
        X_valid: array, independent validation features.
        y_valid: array, objective validation feature.
        X: array, independent features.
        y: array, target feature.
    """
    # Selecting 'AdjClose' prices as input and target feature for time series.
    data_adj = data.filter(['AdjClose']).values

    # Scaling data. Ensures quicker convergence to solution.
    scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = scaler.fit_transform(data_adj)

    # Splitting input features and target object, X and y.
    supervised_data = series_to_supervised(scaled_data, n_in=lag, n_out=days)
    X = supervised_data.loc[:, supervised_data.columns != 'var1(t)'] 
    y = supervised_data['var1(t)'] # Isolating target object.

    # Selecting converted data for train-test split.
    len_training = int(np.ceil(len(scaled_data) * train_ratio))

    X_train = X.iloc[0:len_training].to_numpy()
    y_train = y.iloc[0:len_training].to_numpy()
    # X_train, y_train = np.array(X_train), np.array(y_train)

    # Making validation/test split.
    if validation:
        len_valid = int((len(X) - len_training)/2)
        len_valid += len_training-60

        # We subtract lag days since they are needed to actually  
        X_valid = X.iloc[len_training-60:len_valid].to_numpy()
        y_valid = data_adj[len_training:len_valid]

        X_test = X.iloc[len_valid-60:].to_numpy()
        y_test = data_adj[len_valid:]

    else:
        X_test = X.iloc[len_training-60:].to_numpy()
        y_test = data_adj[len_training:]

    # Reshaping to obtain 3D reps (currently 2d) to pass into LSTM.
    # LSTM expects d1 # of samples, d2 # of timesteps, and d3 # of features.
    X_train = np.reshape(X_train, (X_train.shape[0],
                                   X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0],
                                 X_test.shape[1], 1))

    if len(X_test) != len(y_test):
        raise Warning('X, y length mismatch.')
    
    if validation:
        X_valid = np.reshape(X_valid, (X_valid.shape[0],
                                       X_valid.shape[1], 1))
        return X_train, y_train, X_valid, y_valid, X_test, y_test, scaler
    
    elif backtest:
        return X, y, scaler

    elif not backtest and not validation:
        return X_train, y_train, X_test, y_test, scaler
    
    else:
        ValueError(
            "Cannot simultaneously perform 'backtest' and 'validation'."
            )
        exit()


def create_model(trial, in_shape):
    '''
    A modification on a vanilla model function, where in
    this case we pass a trial object that Optuna uses both in
    its optimization routine, and for passing values for the
    hyperparameters in the case of model fitting.
    in_shape: int, gives the col shape (# of features) that the
        first LSTM node should expect to receive.
    '''
    units = trial.suggest_int('units', 64, 150, step=2)
    dropout = trial.suggest_float('dropout', 0, 1)
    classes = trial.suggest_int('classes', 13, 50, step=1)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    # activation = trial.suggest_categorical('dense_activation', [None, 'tanh', 'sigmoid'])
    # recurrent_dropout = trial.suggest_float('recurrent_droupout', 0, 1)

    model = Sequential()
    model.add(
        LSTM(
            units=units,
            activation='tanh',
            recurrent_activation='sigmoid',
            unroll=False,
            use_bias=True,
            dropout=dropout,
            # recurrent_dropout=recurrent_dropout,
            return_sequences=True,
            input_shape=(in_shape, 1)
        )
    )
    model.add(
        LSTM(
            units=int(units/2),
            activation='tanh',
            recurrent_activation='sigmoid',
            unroll=False,
            use_bias=True,
            dropout=dropout,
            # recurrent_dropout=recurrent_dropout,
            return_sequences=False,
        )
    )
    model.add(
        Dense(
            classes,
            activation=None,
            use_bias=True
        )
    )
    model.add(
        Dense(
            1,
            activation= None, # activation,
            use_bias=True
        )
    )

    # Only use 'accuracy' metric for classification.
    model.compile(
        loss='mean_squared_error',
        optimizer=Adam(
            learning_rate=learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07
        ),
        metrics=['mean_squared_error'] # ['mean_absolute_percentage_error']
    )
    
    return model


In [41]:
LAG = 60 # Number of days to use for predicting the following day(s).
DAYS = 1 # Number of days to predict with each lag period.

UNITS = 150
CLASSES = 50
BATCHSIZE= 128
EPOCHS = 10

split = data_split(stock, lag=LAG, days=DAYS, train_ratio=1)

X_train, y_train = split[0], split[1]
scaler = split[4]

if len(X_train) != len(stock) - LAG:
    raise ValueError('X_train is incorrectly formatted. Check input Stock Data.')

In [99]:
PATH = 'OptStudy_' + TICKER
PATH = 'mape6'
filepath = '/Users/leoglonz/Desktop/stock_analysis/opt_cache/' + PATH + '.pickle'


study = pickle.load(open(filepath, 'rb'))
print("Number of finished trials: %i" %len(study.trials))
print("Best trial:")

best_trial = study.best_trial

print("    RMSE Value: %.3e" %best_trial.value)
print("    Params: ")
for key, value in best_trial.params.items():
    if value == str(value) or value == None:
        print("    %s: %s" %(key, value))
    else:
        print("    %s: %.3e" %(key, value))

Number of finished trials: 40
Best trial:
    RMSE Value: 3.818e+00
    Params: 
    batchsize: 2.500e+01
    epochs: 8.000e+00
    units: 1.340e+02
    dropout: 3.478e-03
    classes: 3.000e+01
    learning_rate: 2.374e-03


In [100]:
in_shape = X_train.shape[1] # Number of input features.
model = create_model(best_trial, in_shape)

best_trial.params
model.fit(
        X_train,
        y_train,
        shuffle=True,
        batch_size=best_trial.params['batchsize'],
        epochs=best_trial.params['epochs'],
        verbose=True,
    )

Epoch 1/8


2023-06-02 11:36:44.224444: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:36:44.517497: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:36:44.761922: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:36:45.071061: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:36:45.487256: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x30c888910>

In [101]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 60, 134)           72896     
                                                                 
 lstm_5 (LSTM)               (None, 67)                54136     
                                                                 
 dense_4 (Dense)             (None, 30)                2040      
                                                                 
 dense_5 (Dense)             (None, 1)                 31        
                                                                 
Total params: 129,103
Trainable params: 129,103
Non-trainable params: 0
_________________________________________________________________


In [102]:
# Fetching available test period data. What data does not exist at time
# of writing (i.e., after 31 May) will be predicted manually.

START = dt.datetime(2023, 3, 3)
END = dt.datetime(2023, 5, 31) #dt.datetime(2023, 8, 1)


test_stock = pdr.get_data_yahoo(TICKER, START, END).rename(columns= {'Adj Close': 'AdjClose'})
print(test_stock.shape[0], "unique points loaded with attributes: \n", stock.
      keys())
test_stock = test_stock.reset_index()

[*********************100%***********************]  1 of 1 completed
61 unique points loaded with attributes: 
 Index(['Date', 'Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume'], dtype='object')


In [103]:
# Making predictions on Test Period data. 

y_preds = np.zeros([NUMDAYS])

for i in tqdm.tqdm(range(NUMDAYS)):
    if i == 0: 
        # Generating first instance of test data.
        split = data_split(test_stock, lag=LAG, days=DAYS, train_ratio=1)
        X_test = split[0]

        if len(X_test) != len(test_stock) - LAG:
            raise ValueError('X_test is incorrectly formatted. Check input Stock Data.')
        
        # print("Starting with day %i prediction." %len(X_test))

    else:
        # Update test data; drop 1st value, append latest prediction to end.
        X_test = np.append(X_test[0][1:], pred, axis=0).reshape((1,60,1))
    
    pred = model.predict(X_test, verbose=0)
    y_preds[i] = pred

y_preds = scaler.inverse_transform(y_preds.reshape(-1,1))

  0%|          | 0/40 [00:00<?, ?it/s]2023-06-02 11:48:13.790545: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:48:13.889369: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-02 11:48:14.031006: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
100%|██████████| 40/40 [00:02<00:00, 18.06it/s]


In [104]:
# Prepping and offloading to CSV.

dates = pd.bdate_range(start='5/31/2023', end='7/25/2023')

predictions = pd.DataFrame({
    'Date': np.array(dates).T,
    'Predictions': y_preds.reshape(len(y_preds))}
    )


outname = TICKER + '_1mo_0623_preds.csv'
outdir = './Lit_Predictions'

if not os.path.exists(outdir):
    os.mkdir(outdir)

fullname = os.path.join(outdir, outname)   

predictions.to_csv(fullname, sep=',', encoding='utf-8')

In [90]:
test_stock

Unnamed: 0,Date,Open,High,Low,Close,AdjClose,Volume
0,2023-03-03,148.039993,151.110001,147.330002,151.029999,150.821381,70732300
1,2023-03-06,153.789993,156.300003,153.460007,153.830002,153.617523,87558000
2,2023-03-07,153.699997,154.029999,151.130005,151.600006,151.390610,56182000
3,2023-03-08,152.809998,153.470001,151.830002,152.869995,152.658844,47204800
4,2023-03-09,153.559998,154.539993,150.229996,150.589996,150.381989,53833600
...,...,...,...,...,...,...,...
56,2023-05-23,173.130005,173.380005,171.279999,171.559998,171.559998,50747300
57,2023-05-24,171.089996,172.419998,170.520004,171.839996,171.839996,45143500
58,2023-05-25,172.410004,173.899994,171.690002,172.990005,172.990005,56058300
59,2023-05-26,173.320007,175.770004,173.110001,175.429993,175.429993,54835000


In [105]:
predictions

Unnamed: 0,Date,Predictions
0,2023-05-31,151.449783
1,2023-06-01,151.198529
2,2023-06-02,150.237218
3,2023-06-05,148.899704
4,2023-06-06,147.379644
5,2023-06-07,145.785003
6,2023-06-08,144.17353
7,2023-06-09,142.574855
8,2023-06-12,141.003301
9,2023-06-13,139.465133
