<a href="https://colab.research.google.com/github/ianm101/stock-model/blob/main/StockModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install yfinance with a shell-escaped pip call before the imports below use it.
# NOTE(review): the version is unpinned, so a fresh run may pull a different
# yfinance release than the one this notebook was developed against — consider pinning.
!pip install yfinance
# This message is printed unconditionally; it does not check pip's exit status.
print("yfinance installed")

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/7a/e8/b9d7104d3a4bf39924799067592d9e59119fcfc900a425a12e80a3123ec8/yfinance-0.1.55.tar.gz
Collecting lxml>=4.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/bd/78/56a7c88a57d0d14945472535d0df9fb4bbad7d34ede658ec7961635c790e/lxml-4.6.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 6.4MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.55-py2.py3-none-any.whl size=22616 sha256=c38eb0e86929fa6aae964d9dd2c817de87465edf5bedd1d8ea8b669448071dc1
  Stored in directory: /root/.cache/pip/wheels/04/98/cc/2702a4242d60bdc14f48b4557c427ded1fe92aedf257d4565c
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfully

In [47]:
import yfinance as yf
import pandas as pd
#from pandas import Series as pds
import numpy as np
import matplotlib.pyplot as plt
import os, datetime

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential

In [48]:
# Fetch the complete daily price history for the 'ibm' ticker via yfinance.
ticker = yf.Ticker('ibm')
historicals = ticker.history(period='max', interval='1d')

In [49]:


# Add simple moving averages of the closing price over 7, 14, 25, 50 and 200
# rows. Each row is one trading day (the history is downloaded with a daily
# interval), so these are trading-day windows rather than calendar weeks.
for window in (7, 14, 25, 50, 200):
  historicals[f'SMA_{window:02d}'] = historicals.Close.rolling(window).mean()

# Drop rows with missing values; the leading rows have no value for the
# longer rolling windows yet.
historicals.dropna(inplace=True)

# Remove the corporate-action columns. errors='ignore' keeps this cell safe to
# re-run once the columns are already gone, without a bare `except:` that
# would also hide unrelated failures.
historicals.drop(columns=['Dividends', 'Stock Splits'], inplace=True, errors='ignore')
historicals = historicals.convert_dtypes()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which adds a stray "None" line).
historicals.info()
print(historicals.head())





<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14659 entries, 1962-10-15 to 2021-01-08
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Open     14659 non-null  float64
 1   High     14659 non-null  float64
 2   Low      14659 non-null  float64
 3   Close    14659 non-null  float64
 4   Volume   14659 non-null  Int64  
 5   SMA_07   14659 non-null  float64
 6   SMA_14   14659 non-null  float64
 7   SMA_25   14659 non-null  float64
 8   SMA_50   14659 non-null  float64
 9   SMA_200  14659 non-null  float64
dtypes: Int64(1), float64(9)
memory usage: 1.2 MB
None
                Open      High       Low  ...    SMA_25    SMA_50   SMA_200
Date                                      ...                              
1962-10-15  1.155774  1.188148  1.155774  ...  1.188924  1.224515  1.447901
1962-10-16  1.188147  1.193003  1.157391  ...  1.185751  1.223063  1.444433
1962-10-17  1.157391  1.181672  1.150916  ...  1.182886  1.

In [50]:


# Normalize the data: scale every column to the range [0, 1].
# NOTE(review): the scaler is fitted on the complete history, so any hold-out
# evaluation added later would be scaled with statistics from its own period —
# fit on the training portion only once a train/test split is introduced.
scaler = MinMaxScaler(feature_range=(0, 1))
normalized = scaler.fit_transform(historicals)

normal_data = pd.DataFrame(data=normalized, columns=historicals.columns)
print(normal_data.shape)

# Number of consecutive closing prices fed to the model for each prediction.
WINDOW_SIZE = 14

# Build supervised samples with a sliding window over the scaled close:
#   X[i] = close[i : i + WINDOW_SIZE]   (model input)
#   y[i] = close[i + WINDOW_SIZE]       (value to predict)
# e.g. [1,2,3] -> [4], [2,3,4] -> [5]
# The windows are sliced by position directly, so the result does not depend
# on cross-validation fold-size arithmetic or on the frame's index labels.
close = normal_data['Close'].to_numpy()
n_windows = len(close) - WINDOW_SIZE
if n_windows <= 0:
  raise ValueError("Need more than WINDOW_SIZE rows to build training windows")

# Row i of window_index holds the positions i .. i + WINDOW_SIZE - 1.
window_index = np.arange(n_windows)[:, None] + np.arange(WINDOW_SIZE)
X = close[window_index]                  # shape (n_windows, WINDOW_SIZE)
y = close[WINDOW_SIZE:].reshape(-1, 1)   # shape (n_windows, 1)

(14659, 10)


In [51]:
# Quick look at the scaled frame: first rows, then its (rows, columns) shape.
print(normal_data.head(), normal_data.shape, sep="\n")

       Open      High       Low  ...    SMA_25    SMA_50   SMA_200
0  0.000555  0.000615  0.000723  ...  0.000342  0.000352  0.001473
1  0.000760  0.000646  0.000733  ...  0.000321  0.000342  0.001450
2  0.000565  0.000574  0.000692  ...  0.000303  0.000337  0.001426
3  0.000647  0.000554  0.000684  ...  0.000279  0.000326  0.001402
4  0.000516  0.000390  0.000537  ...  0.000246  0.000314  0.001379

[5 rows x 10 columns]
(14659, 10)


In [52]:
# Stacked SimpleRNN regressor: takes windows shaped [WINDOW_SIZE, num_variables]
# (currently univariate) and outputs one scalar per window.
# The recurrent layers take 3-D input: [BATCH_SIZE, WINDOW_SIZE, NUMBER_FEATURES],
# so the 2-D (number of samples, WINDOW_SIZE) array is reshaped below.
print(X.shape)
num_variables = 1
X = X.reshape((-1, WINDOW_SIZE, num_variables))

# Use the model/layer classes imported at the top of the notebook; the bare
# name `keras` is never imported, so referencing it fails on a fresh kernel.
model = Sequential([
  SimpleRNN(10, return_sequences=True, input_shape=(WINDOW_SIZE, num_variables)),
  Dropout(0.2),
  SimpleRNN(10, return_sequences=False),
  Dropout(0.2),
  Dense(1, activation='linear')
])
print("Model built")

# Compile the model (`learning_rate` replaces the deprecated `lr` argument name)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0015)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])

model.summary()

(14645, 14)
Model built
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_4 (SimpleRNN)     (None, 14, 10)            120       
_________________________________________________________________
dropout_4 (Dropout)          (None, 14, 10)            0         
_________________________________________________________________
simple_rnn_5 (SimpleRNN)     (None, 10)                210       
_________________________________________________________________
dropout_5 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 341
Trainable params: 341
Non-trainable params: 0
_________________________________________________________________


In [53]:
%load_ext tensorboard
# callbacks are objects that can perform actions at various training stages - using them to graph model accuracy in training and validation
logdir = os.path.join('logs', datetime.datetime.now().strftime("%Y%m%d - %H%M"))
my_callbacks = [tf.keras.callbacks.TensorBoard(log_dir=logdir)]


# Fit model
model.fit(X, y, epochs = 30, shuffle=False, callbacks=my_callbacks)



The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f37ade41be0>

In [None]:
# Launch TensorBoard from the notebook, reading runs from the `logs` directory.
%tensorboard --logdir logs

In [None]:
# Spot-check one prediction against its true next value.
# Each row of X is already a full WINDOW_SIZE-step window, so exactly one row
# is selected (a slice keeps the batch dimension) and compared with the target
# at the same index; window i is paired with y[i].
sample_index = 0

sample = X[sample_index:sample_index + 1].reshape(-1, WINDOW_SIZE, 1)
pred = model.predict(sample)
truth = y[sample_index]

print("Sample: {0} : ({1},\t{2})".format(sample, pred, truth))


In [56]:
# Print the target array used for training (long arrays are abbreviated with "...").
print(y)

[[5.58920605e-04]
 [6.47644372e-04]
 [6.42518489e-04]
 ...
 [8.12347818e-01]
 [8.10447246e-01]
 [8.05093743e-01]]
