In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
#setup
#code
name = 'NVDA'
ticker = yf.Ticker(name) #import data
aapl_df = ticker.history(period="5y") #get data from 5 year period in dataframe
aapl_df.drop(['High','Close','Volume','Dividends','Stock Splits'], axis=1, inplace=True) #only have two columns, not seven
open = np.empty(shape = (1259), dtype = float)
close = np.empty(shape = (1259), dtype = float)

open=aapl_df[['Open']].to_numpy()

#when training ml model you have a set of steps, and you have to specify those steps

#**STEPS**
1. Get a dataset
2. Clean the data
3. Split the dataset (into train, test, validation) train the dataset on the training dataset, validate it and test it on the testing dataset to see how well the model handles unseen data (or else, the model will overfit to the training data)
4. Choose your model (which models you will use: like linear regression, random forests, decision trees, neural networks) how to combine and explain the models (about the parameters and the evaluation of this part)
5. Use MAE/MSE to evaluate models (MAE is mean absolute error, mean of the absolute values of the differences between predicted and actual. MSE is mean square error, the mean of the squared deviations from your predicted value and the actual value)

In [27]:
#opening prices for each day over the past 5 years
open

array([[ 10.41566827],
       [  9.84255572],
       [ 10.1986341 ],
       ...,
       [172.75      ],
       [171.33999634],
       [169.52000427]])

In [28]:
X = np.zeros((1260,3))
Y = [0]*1260
#sliding window with a fixed window of 3 days, x is the 3 days, y is the next day outside the window (our prediction)
for i in range(1250):
  X[i] = [open[i][0],open[i+1][0],open[i+2][0]]
  Y[i] = open[i+3][0]

#gives you 3 consecutive days as input and the next day as output
print(X[0], Y[0])

[10.41566827  9.84255572 10.1986341 ] 10.340169088040522


In [29]:
#next step: check for empty values/discrepancies in the dataset


#then split dataset
from sklearn.model_selection import train_test_split
from sklearn import linear_model

#shuffle so that we won't be biased and we want the general pattern of the last 5 years
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [30]:
lin_regr = linear_model.LinearRegression()
#.fit trains model
lin_regr.fit(X_train, y_train)


In [31]:
lin_regr.coef_
#weights * day 1 + weights*day 2 + weights*day 3 = prediction

array([-0.00813918,  0.13551521,  0.87262914])

In [32]:
from sklearn.metrics import mean_squared_error
Y_pred_linear = lin_regr.predict(X_test)
mean_squared_error(y_test, Y_pred_linear)

5.431896721995262

In [33]:
Y_predTrain_linear = lin_regr.predict(X_train)
mean_squared_error(y_train, Y_predTrain_linear)

5.677559355917761

Using DecisionTree

In [34]:
from sklearn.tree import DecisionTreeRegressor
dec_tree = DecisionTreeRegressor(random_state = 0)
dec_tree.fit(X_train, y_train)

In [35]:
Y_pred_tree = dec_tree.predict(X_test)
mean_squared_error(Y_pred_tree, y_test)

9.670424685164017

In [36]:
#Y_predTrain_tree = dec_tree.predict(X_test)
#mean_squared_error(y_train, Y_predTrain_tree)

### Introducing two more models:

Random Forest and MLP Regressor.

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

forest = RandomForestRegressor(n_estimators = 100, random_state = 0)
forest.fit(X_train, y_train)

mlp = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)

In [38]:
#random forest predictions on training data
Y_predTrain_forest = forest.predict(X_train)
mean_squared_error(y_train, Y_predTrain_forest)

1.0454508576648802

In [39]:
#random forest predictions on test data
Y_pred_forest = forest.predict(X_test)
mean_squared_error(y_test, Y_pred_forest)

6.789062939963886

In [40]:
#MLP predictions on training data
Y_predTrain_mlp = mlp.predict(X_train)
mean_squared_error(y_train, Y_predTrain_mlp)

6.291488022985977

In [41]:
#MLP predictions on testing data
Y_pred_mlp = mlp.predict(X_test)
mean_squared_error(y_test, Y_pred_mlp)

6.2789460874188325

### K-Nearest Neighbors Model

In [42]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)

In [43]:
#Training accuracy
Y_predTrain_knn = knn.predict(X_train)
print(f"Training Accuracy: {mean_squared_error(y_train, Y_predTrain_knn)}")

#test accuracy
Y_pred_knn = knn.predict(X_test)
print(f"Test Accuracy: {mean_squared_error(y_test, Y_pred_knn)}")

Training Accuracy: 4.0188464388423135
Test Accuracy: 7.418581216844686


### Training vs. Test vs. Validation

Validation set - why is it important?
It is important because you want to separate testing for the model and for post-training.

How to identify base parameters? We use validation set (unseen data) to define them.

With validation, your model will be less susceptible to overfitting (biased estimate of real world performance of the model)

**validation set simulates the testing set**

Validation - Hyperparameter tuning

### Hidden layers:
Layers between input and output,

Encoder and decoder - find numerical representation of this

attention: input is large, when you provide text or picture, attention pulls only the important information (and not excess information that is not required/needed).

attention mechanism focuses on important parts of the data.

### positional embedding

The order matters a lot (especially in time series or NLP tasks)

forces the model to remember when the prices

## Model:

LSTM model (Long short term memory)
has forget gate, input gate, output gate

https://deeplearning.cs.cmu.edu/S23/document/readings/LSTM.pdf

# Using our models to simulate buying and selling

In [64]:
def simpleSimulateBuySell(predictions, amount, interval, stocks, days):
  for i in range(days):
    if (predictions[i+1] < Y[i] and stocks > 0):
      stocks -= interval
      amount += Y[i] * interval
    elif (predictions[i + 1] > Y[i] and amount > Y[i]):
      stocks += interval
      amount -= Y[i] * interval

  amount += Y[i] * stocks
  print(f"Investing in {name}, Final Amount: ${str(amount.round(2))}")

#def realisticSimulateBuySell():


In [67]:
#using best model (linear model) to predict and simulate buying and selling

predictions = lin_regr.predict(X)
amount = 50000
stocks = 0

simpleSimulateBuySell(predictions, amount, 3, stocks, 1200)

Investing in NVDA, Final Amount: $254441.29


## Experiment with different numbers and write down results.

Baseline to beat (if we had just held the stock throughout the time period):



In [74]:
import math
print(f"If we had just held the {name} stock from the beginning to the end, we would have made ${str((math.floor(50000/Y[0]) * Y[1200] - Y[0]).round(2))}.")

If we had just held the NVDA stock from the beginning to the end, we would have made $546548.48.
