In [None]:
#save the file as .h5

In [None]:
#!pip install yfinance plotly

import os

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import TimeSeriesSplit
import joblib, pickle, dill, keras

# Download window and ticker used by the rest of the notebook
start = '2000-01-01'
end = '2024-02-01'
stock = 'MSFT'

# Daily price history for `stock`; dates are passed by keyword so they cannot be
# mistaken for other positional options of yf.download.
data = yf.download(stock, start=start, end=end)

# NOTE(review): some yfinance releases return (field, ticker) MultiIndex columns
# even for a single ticker. Flatten to the field level so `data.Close` stays a
# Series for the rolling-mean and plotting cells below. No-op for flat columns.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Fail here with a clear message instead of deep inside the scaler/model cells.
if data.empty:
    raise ValueError(f"No price data returned for {stock!r} between {start} and {end}")

# prints the DF
# display(data)

In [None]:
# 100-day moving average of the close: mean of the current and previous 99
# closing prices. The result is a Series aligned with data.index and is NaN
# until 100 observations are available.
ma_100_days = data.Close.rolling(window=100).mean()

# Figure 1: closing price against its 100-day moving average
fig1 = go.Figure(data=[
    go.Scatter(x=data.index, y=ma_100_days, mode='lines', name='MA100'),
    go.Scatter(x=data.index, y=data.Close, mode='lines', name='Close Price'),
])
fig1.update_layout(title='Price vs MA100', xaxis_title='Date', yaxis_title='Price')
fig1.show()

# 200-day moving average, same construction with a 200-observation window
ma_200_days = data.Close.rolling(window=200).mean()

# Figure 2: both moving averages and the closing price, one fixed colour each
fig2 = go.Figure()
for series, label, colour in (
    (ma_100_days, 'MA100', 'red'),
    (ma_200_days, 'MA200', 'blue'),
    (data.Close, 'Close Price', 'green'),
):
    fig2.add_trace(go.Scatter(x=data.index, y=series, mode='lines', name=label, line=dict(color=colour)))
fig2.update_layout(title='Price vs MA100 vs MA200', xaxis_title='Date', yaxis_title='Price')
fig2.show()

In [None]:
# This cell:
#   1. removes rows containing NaNs from `data` (in place),
#   2. splits the closing prices chronologically into train and test frames,
#   3. creates the (still unfitted) MinMaxScaler used for normalisation.

data.dropna(inplace=True)

# Row position where the first 80% of the observations ends
split_at = int(len(data) * 0.80)

# Earliest 80% of closing prices -> training frame; remaining 20% -> test frame
data_train = pd.DataFrame(data.Close.iloc[:split_at])
data_test = pd.DataFrame(data.Close.iloc[split_at:])

# Min-max normalisation: once fitted, the fitted data's minimum maps to 0 and
# its maximum maps to 1.
scaler = MinMaxScaler(feature_range=(0, 1))

In [None]:
# Build the supervised training set with a sliding window that advances one
# row at a time: every sample is 100 consecutive scaled closes and its target
# is the scaled close that immediately follows them.

# Fit the scaler on the training closes and scale them in one step
data_train_scale = scaler.fit_transform(data_train)

n_rows = data_train_scale.shape[0]

# Row i of the scaled data is a target; rows i-100 .. i-1 are its inputs
x_train = np.array([data_train_scale[i - 100:i, 0] for i in range(100, n_rows)])
y_train = np.array([data_train_scale[i, 0] for i in range(100, n_rows)])

print(f"x_train.shape {x_train.shape}\ny_train.shape {y_train.shape}\n")
print(f"x_train\n{x_train[:2]}\ny_train\n{y_train[:2]}\n")

In [None]:
# Build x_test / y_test with the same 100-row sliding window used for training.
#
# The first test-day target needs the 100 closes that precede it, and those
# live at the end of data_train, so they are prepended to the test closes
# before windowing.
x_test = []
y_test = []

# Last 100 training closes: the input window for the first test target
pas_100_days = data_train.tail(100)

# Number of genuine test rows produced by the train/test split
n_test_rows = len(data) - len(data_train)

# Prepend the history to the test closes. Taking .tail(n_test_rows) keeps only
# the genuine test rows, so re-running this cell does not stack another copy of
# the 100-day history on top of an already-extended data_test.
data_test = pd.concat([pas_100_days, data_test.tail(n_test_rows)], ignore_index=True)

# Scale with the scaler that was fitted on the training data. Re-fitting here
# (fit_transform) would map test prices with a different min/max than the one
# the model was trained on and would leak test-set statistics.
# Scaled test values may therefore fall outside [0, 1].
data_test_scale = scaler.transform(data_test)

# Row i is a target; rows i-100 .. i-1 are its inputs
for i in range(100, data_test_scale.shape[0]):
    x_test.append(data_test_scale[i-100:i, 0])
    y_test.append(data_test_scale[i, 0])

x_test, y_test = np.array(x_test), np.array(y_test)

## function: make_results

In [None]:
def make_results(model_name:str, model_object, metric:str):
    '''
    Summarise the cross-validation scores of the best candidate in a grid search.

    Arguments:
    model_name (string): label written to the 'model' column of the output
    model_object: a fit GridSearchCV object; its cv_results_ must contain the
        columns mean_test_neg_mean_absolute_error,
        mean_test_neg_mean_squared_error and mean_test_r2
    metric (string): neg_mean_absolute_error, neg_mean_squared_error or r2;
        the candidate with the highest mean value of this metric is reported

    Returns a one-row pandas df with columns model, neg_mean_absolute_error,
    neg_mean_squared_error and r2 for the candidate with the best mean
    'metric' score across all validation folds.

    Raises KeyError if 'metric' is not one of the three supported names.
    '''

    # Maps the user-facing metric name to its column name in cv_results_
    metric_dict = {'neg_mean_absolute_error': 'mean_test_neg_mean_absolute_error',
                 'neg_mean_squared_error': 'mean_test_neg_mean_squared_error',
                 'r2': 'mean_test_r2',
                 }

    if metric not in metric_dict:
        raise KeyError(f"metric must be one of {sorted(metric_dict)}, got {metric!r}")

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # idxmax() returns an index *label*, so select the row with .loc (label
    # based) rather than .iloc (position based).
    best_label = cv_results[metric_dict[metric]].idxmax()
    best_estimator_results = cv_results.loc[best_label]

    # Create the one-row table from the three mean test scores of that candidate
    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [best_estimator_results['mean_test_neg_mean_absolute_error']],
                        'neg_mean_squared_error': [best_estimator_results['mean_test_neg_mean_squared_error']],
                        'r2': [best_estimator_results['mean_test_r2']],
                        },
                       )

    return table

## function: get_test_scores

In [None]:
def get_test_scores(model_name:str, preds, y_test_data):
    '''
    Generate a one-row table of test scores.

    In:
    model_name (string): how the model will be named in the output table
    preds: numpy array of test predictions
    y_test_data: numpy array of y_test data

    Out:
    table: a pandas df with columns model, neg_mean_absolute_error,
        neg_mean_squared_error and r2. The two error columns hold the
        *negated* MAE and MSE, so they follow the same "higher is better"
        convention as scikit-learn's neg_* scorers and can be compared
        directly with the rows produced by make_results().
    '''
    # mean_absolute_error / mean_squared_error return positive errors; negate
    # them so the values actually match the neg_* column names.
    negative_mean_absolute_error = -mean_absolute_error(y_test_data, preds)
    negative_mean_squared_error = -mean_squared_error(y_test_data, preds)
    r2 = r2_score(y_test_data, preds)

    table = pd.DataFrame({'model': [model_name],
                        'neg_mean_absolute_error': [negative_mean_absolute_error],
                        'neg_mean_squared_error': [negative_mean_squared_error],
                        'r2': [r2]
                        })

    return table

# LinearRegression and GridSearchCV

In [None]:
# Instantiate linear regression model
lr = LinearRegression()

# Hyperparameters to tune. `copy_X` is deliberately not searched: it only
# controls whether X is copied before fitting, so it would double the number of
# fits without being a meaningful modelling choice.
cv_params = {'fit_intercept': [True, False],
             'positive': [True, False],
}

# Scoring metrics recorded for every candidate
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# The samples are chronologically ordered price windows, so use forward-chaining
# splits: each validation fold comes strictly after the data used to train on.
# Plain k-fold would fit on later prices and validate on earlier ones.
# The best candidate by neg_mean_absolute_error is refit on all training data.
lr1 = GridSearchCV(lr, cv_params, scoring=scoring, cv=TimeSeriesSplit(n_splits=4),
                   refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [None]:
# Sanity check: show the shapes of the arrays about to be passed to fit()
print(
    f"x_train dims BEFORE fit => {x_train.shape}\n",
    f"y_train dims BEFORE fit => {y_train.shape}\n",
    sep="\n",
)

In [None]:
%%time

# Run the LinearRegression grid search on the windowed training data.
# %%time (which must stay the first line of the cell) reports how long it takes.
lr1.fit(x_train, y_train)

In [None]:
# check the dims
# print(f"x_train AFTER dims => {x_train.shape}\n")
# print(f"y_train AFTER dims => {y_train.shape}\n")

In [None]:
# Save the fitted LinearRegression grid search to disk with pickle.
# open() does not create missing folders, so make sure the target directory
# exists first (no error if it is already there).
os.makedirs('./pickledModels', exist_ok=True)

# NOTE: pickle files can execute code when loaded; only unpickle files you trust.
with open('./pickledModels/grid_search_lr_model.pkl', 'wb') as file:
    pickle.dump(lr1, file)

In [None]:
# Hyperparameter combination selected by the grid search, i.e. the candidate
# ranked best on the metric passed as `refit` (neg_mean_absolute_error)

lr1.best_params_

In [None]:
# Start the results table with the cross-validation scores of the best
# LinearRegression candidate (selected by neg_mean_absolute_error)

results = make_results(model_name='LR CV', model_object=lr1, metric='neg_mean_absolute_error')
results

In [None]:
# Predict the (scaled) test targets with the refit best LinearRegression
# estimator; the scores themselves are computed in the next cell

lr_preds = lr1.best_estimator_.predict(x_test)

In [None]:
# Score the LinearRegression test predictions and append them as a new row
# under the cross-validation row already in `results`

lr_test_scores = get_test_scores(model_name='LR test', preds=lr_preds, y_test_data=y_test)
results = pd.concat(objs=[results, lr_test_scores])
results

# TensorFlow neural-network regressor and GridSearchCV

In [None]:
# Important packages for Tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.regularizers import l2
from scikeras.wrappers import KerasRegressor # Needed for GridSearchCV
from tensorflow.keras.callbacks import EarlyStopping # for use early stopping to avoid overfitting

# Small feed-forward regressor:
#   input  - one feature per column of x_train (the window of past scaled closes)
#   hidden - 100 ReLU units with a light L2 weight penalty
#   output - a single linear unit (the next scaled close)
model = Sequential()
model.add(Input(shape=(x_train.shape[1],)))
model.add(Dense(100, activation='relu', kernel_regularizer=l2(0.0001)))
model.add(Dense(1, activation='linear'))

# Pass the configured optimizer object to compile(); passing the string 'adam'
# would build a separate default optimizer and silently ignore `opt`.
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mae'])

# Wrap the model using the KerasRegressor
# Needed to allow compatibility between the Tensorflow object and scikit-learn's GridSearchCV.
# validation_split holds out part of each fit's training data so that Keras
# reports `val_loss`, which the EarlyStopping callback below monitors.
tf_model = KerasRegressor(model=model, validation_split=0.2, verbose=1)

# Stop a fit once val_loss has not improved for 10 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

In [None]:
# Hyperparameters to tune. Each list holds the candidate values for one
# KerasRegressor parameter; 'callbacks' has a single candidate, the
# EarlyStopping callback defined with the model.
cv_params = {'batch_size': [32, 64, 128, 256, 512],
             'epochs': [50, 100, 200, 300, 400],
             'callbacks': [early_stopping]
            }

# Scoring metrics recorded for every candidate
scoring = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

# The samples are chronologically ordered price windows, so use forward-chaining
# splits: each validation fold comes strictly after the data used to train on.
# Plain k-fold would fit on later prices and validate on earlier ones.
# The best candidate by neg_mean_absolute_error is refit on all training data.
tf_model1 = GridSearchCV(tf_model, cv_params, scoring=scoring, cv=TimeSeriesSplit(n_splits=4),
                         refit='neg_mean_absolute_error', n_jobs=-1, verbose=1)

In [None]:
%%time

# Run the Keras grid search on the windowed training data. One fit is run per
# (batch_size, epochs) combination and CV fold, so this cell can take a long time.
# %%time (which must stay the first line of the cell) reports how long it takes.
tf_model1.fit(x_train, y_train)

In [None]:
# Save the fitted Keras grid search to disk with dill.
# open() does not create missing folders, so make sure the target directory
# exists first (no error if it is already there).
os.makedirs('./pickledModels', exist_ok=True)

# NOTE: dill files, like pickle files, can execute code when loaded; only load
# files you trust.
with open('./pickledModels/lrTensorFlow.dill', 'wb') as file:
    dill.dump(tf_model1, file)

In [None]:
# Mean cross-validated score of the best candidate on the refit metric
# (neg_mean_absolute_error, so values closer to 0 are better)

tf_model1.best_score_

In [None]:
# Hyperparameter combination selected by the grid search, i.e. the candidate
# ranked best on the metric passed as `refit` (neg_mean_absolute_error)

tf_model1.best_params_

In [None]:
# Summarise the cross-validation scores of the best Keras candidate (selected
# by neg_mean_absolute_error) and append them to the running results table

tf_cv_results = make_results(model_name='TF CV', model_object=tf_model1, metric='neg_mean_absolute_error')
results = pd.concat(objs=[results, tf_cv_results])
results

In [None]:
# Predict the (scaled) test targets with the refit best Keras estimator;
# the scores themselves are computed in the next cell

tf_preds = tf_model1.best_estimator_.predict(x_test)

In [None]:
# Score the Keras test predictions and append them as the final row of the
# running results table

tf_test_scores = get_test_scores(model_name='TF test', preds=tf_preds, y_test_data=y_test)
results = pd.concat(objs=[results, tf_test_scores])
results