In [None]:
!pip install imageio git+https://github.com/tensorflow/docs XlsxWriter tensorflow_addons mplfinance yfinance &> /dev/null

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# Later cells read CSVs from the relative path "drive/MyDrive/...", which
# presumably relies on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# from google.colab import auth
# auth.authenticate_user()

In [None]:
# !mkdir historical
# !gsutil -m cp -n gs://ganstick_project/historical/*.png historical/ &> /dev/null

In [None]:
import glob
import pandas as pd
import numpy as np

import random
from datetime import datetime

import yfinance as yf
import mplfinance as mpf

import math
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg') # remain in headless environment

from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

import keras
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Layer
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import LayerNormalization
from keras.layers import MultiHeadAttention
from keras.layers import Conv1D
from keras.layers import Softmax
from keras.layers import Permute
from keras.layers import Reshape
from keras.layers import Lambda
from keras.layers import Dot
from keras.layers import Activation
from keras.layers import Concatenate
from keras.layers import Embedding
from keras.layers import LayerNormalization
from keras.layers import RepeatVector
from keras.layers import TimeDistributed


from keras.layers import *

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

In [None]:
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
# Green candles for up days, red for down days; edges and wicks inherit the
# body colour.
candle_style = mpf.make_marketcolors(
    up='green',
    down='red',
    edge='inherit',
    wick='inherit',
)
make_style = mpf.make_mpf_style(marketcolors=candle_style)

# create candlestick images from selected dates of specific stocks
# Module-level figure that generate_candlestick_images() clears after each render.
fig = mpf.figure()


def generate_candlestick_images(df):
    """Render `df` as a candlestick chart and return the image file name.

    The chart is always saved to 'current_image.png' (relative to the working
    directory), so every call overwrites the image from the previous call.

    `df` is handed straight to `mpf.plot`, so it must be in the form
    mplfinance accepts for candle charts.
    """
    fname = 'current_image.png'
    plot_options = {
        'type': 'candle',
        # presumably 0.56in at matplotlib's default dpi yields the 56x56 image
        # that build_cnn() expects - confirm.
        'figsize': (0.56, 0.56),
        'savefig': fname,
        'style': make_style,
        'ylabel': '',
        'scale_padding': 0.0,
        'scale_width_adjustment': {'candle': 0.8},
        'axisoff': True,
    }
    mpf.plot(df, **plot_options)
    fig.clf()
    return fname

In [None]:
# Load every macro-economic CSV, index it by date and (for all series except
# the COVID one) forward-fill it onto a daily calendar so it can be joined
# onto daily stock prices later.
macro_data = []
# sorted(): glob returns files in arbitrary order; sorting keeps the order of
# macro_data - and therefore the column order of the joined feature matrix -
# identical from run to run.
macro_csvs = sorted(glob.glob("drive/MyDrive/ganstick_project/lstm_data/*.csv"))
for f in macro_csvs:
  # file stem, e.g. ".../vix.csv" -> "vix"
  name = f.split("/")[-1].split(".")[0]
  df = pd.read_csv(f)
  df['date'] = pd.to_datetime(df['date'])
  df = df.set_index('date').sort_index()
  # cutting off where our covid data ends/no longer compiled sources for it.
  # .copy() detaches the slice so the column assignment in the vix branch
  # does not raise SettingWithCopyWarning.
  df = df.loc[df.index <= '2021-03-01'].copy()
  if name == 'vix':
    vix = df
    # 10-row rolling mean of the open; rows without a full window (NaN) become 0
    vix['rolling_vix_mean'] = vix['open'].rolling(window=10).mean().fillna(0)
    vix = vix[['rolling_vix_mean']]
    # some duplicate in the download data
    vix = vix[~vix.index.duplicated(keep='first')]
    vix = vix.resample('D').ffill()
    macro_data.append(vix)
  elif name == 'cpi':
    cpi = df.resample('D').ffill()
    macro_data.append(cpi)
  elif name == 'civpart':
    civpart = df.resample('D').ffill()
    macro_data.append(civpart)
  elif name == 'homepriceindex':
    homepriceindex = df.resample('D').ffill()
    macro_data.append(homepriceindex)
  elif name == 'covidrates':
    covidrates = df
    keep_cols = ['death', 'hospitalizedCurrently']
    covidrates = covidrates[keep_cols]
    covidrates.columns = ['covid_deaths', 'covid_current_hospitalized']

    # cutting off ends where covid data is either lacking or non-existent/inaccurate
    covidrates = covidrates.loc[covidrates.index > '2020-03-17']
    macro_data.append(covidrates)
  elif name == 'fedfunds':
    fedfunds = df.resample('D').ffill()
    macro_data.append(fedfunds)

In [None]:
# handling treasury yield spread separately (to not include in the overall list)
# Start from None so a missing CSV is reported explicitly below instead of
# surfacing as a NameError (or silently reusing a value left in the kernel).
thirty_year_rates = None
ten_year_rates = None
for f in macro_csvs:
  name = f.split("/")[-1].split(".")[0]
  # Only the two treasury series are needed here; skip the other CSVs rather
  # than reading and parsing them a second time.
  if name not in ('30yearrates', '10yearrates'):
    continue
  df = pd.read_csv(f)
  df['date'] = pd.to_datetime(df['date'])
  df = df.set_index('date').sort_index()
  # cutting off where our covid data ends/no longer compiled sources for it.
  df = df.loc[df.index <= '2021-03-01']
  # forward-fill onto a daily calendar
  df = df.resample('D').ffill()
  if name == '30yearrates':
    thirty_year_rates = df
    thirty_year_rates.columns = ['thirty_year_rates']
  else:
    ten_year_rates = df
    ten_year_rates.columns = ['ten_year_rates']

if thirty_year_rates is None or ten_year_rates is None:
  raise FileNotFoundError(
      "30yearrates.csv and 10yearrates.csv must both be present in the lstm_data folder")

# get 10-30 year spread
first_common_date = "1987-01-02" # universal beginning date
lastdate = "2021-03-01"
yield_spread_df = pd.DataFrame()
# NOTE(review): both bounds are strict, so `lastdate` itself is excluded here
# while the other cut-offs in this notebook use `<=` - confirm this is intended.
tenyear = ten_year_rates.loc[(ten_year_rates.index > first_common_date) & (ten_year_rates.index < lastdate)].copy()
thirtyyear = thirty_year_rates.loc[(thirty_year_rates.index > first_common_date) & (thirty_year_rates.index < lastdate)].copy()
# spread = 30-year rate minus 10-year rate, aligned on date
yield_spread_df['spread'] = thirtyyear['thirty_year_rates'] - tenyear['ten_year_rates']

In [None]:
def get_historical(ticker, beginning_date="1987-01-02", final_date="2021-03-01"):
    """Download the daily price history of `ticker` and clip it to a date range.

    @param str ticker:         symbol passed to yfinance
    @param str beginning_date: exclusive lower bound, 'YYYY-MM-DD'
    @param str final_date:     inclusive upper bound, 'YYYY-MM-DD'
    @return DataFrame:         columns Open, High, Low, Close, Volume (in that
                               order) with a DatetimeIndex
    @raise ValueError:         if no rows fall inside the requested range
    """
    stock = yf.Ticker(ticker)
    stock_df = pd.DataFrame(stock.history(period='max', interval='1D'))
    # Reduce the index to plain 'YYYY-MM-DD' strings so the bounds below can be
    # compared as strings.
    stock_df.index = pd.to_datetime(stock_df.index).strftime('%Y-%m-%d')

    # Keep exactly the price/volume columns in a fixed order. create_dataset()
    # reads columns 0-3 as Open/High/Low/Close, and every ticker must yield the
    # same number of columns for the arrays to stack. Selecting by name (rather
    # than dropping 'Dividends'/'Stock Splits') does not depend on which extra
    # columns the download happens to contain.
    stock_df = stock_df[['Open', 'High', 'Low', 'Close', 'Volume']]

    # the latest common date is 1987-01-02
    in_range = (stock_df.index > beginning_date) & (stock_df.index <= final_date)
    stock_df = stock_df.loc[in_range]
    if stock_df.empty:
        raise ValueError(
            f"no price history for {ticker!r} between {beginning_date} and {final_date}")
    print(stock_df.index[0], stock_df.index[-1])
    stock_df.index = pd.to_datetime(stock_df.index)
    return stock_df

# updated version (future `horizon` days, five by default)
# create windowed version of dataset, set look_back to be the window length (in rows)
def create_dataset(dataset, cnn, col_indexes, pred_col_index, look_back=1, horizon=5):
    """Slice `dataset` into (input window, future labels) pairs.

    For every window the first four selected columns are drawn as a
    candlestick image, the image is passed through `cnn`, and the resulting
    feature vector is appended to every row of the window.

    @param ndarray dataset:     2-D array, one row per day
    @param cnn:                 object whose predict() maps a (1, H, W, 3) image
                                batch to features; assumes the output is
                                (1, 1, 1, C) so it can be tiled per row - TODO confirm
    @param list col_indexes:    input column indexes; the first four must be
                                Open, High, Low, Close
    @param int pred_col_index:  column index of the label
    @param int look_back:       rows per input window
    @param int horizon:         number of future rows whose labels are returned
    @return tuple:              (x, y) with x of shape
                                (n, look_back, len(col_indexes) + C) and integer
                                y of shape (n, horizon)
    """
    x_data, y_data = [], []
    # dummy dates, only needed for mplfinance; one per row of the window
    dummy_dates = pd.date_range(start="1990-01-01", periods=look_back)
    # The trailing -1 makes the bound slightly conservative; it is kept so the
    # number of samples produced does not change.
    for i in range(len(dataset) - look_back - horizon - 1):
        window = dataset[i:(i + look_back)][:, col_indexes] # chain indexing for non-contiguous columns

        ohlc = pd.DataFrame(window[:, [0, 1, 2, 3]], columns=['Open', 'High', 'Low', 'Close'])
        ohlc.index = dummy_dates

        fname = generate_candlestick_images(ohlc)
        img_array = img_to_array(load_img(fname))[None, :, :]  # add batch axis

        features = cnn.predict(img_array)
        features = features[0, 0, :]
        # duplication of features for each day. Uses the `look_back` argument;
        # the previous code read the global `lookback`, which only worked when
        # both happened to be equal.
        features = np.tile(features, (look_back, 1))

        window = np.concatenate((window, features), axis=1)
        target = dataset[(i + look_back):(i + look_back + horizon)][:, pred_col_index]

        x_data.append(window)
        y_data.append(target)
    return np.array(x_data), np.array(y_data).astype(int)
 
def bollinger(close_price, sma, sigma, alpha):
    '''
    Flag whether a close price lies outside its Bollinger band.

    @param float close_price: current close price (k + 1)
    @param float sma:         simple moving average
    @param float sigma:       standard deviation
    @param int alpha:         multiplicative factor for standard deviations
    @return int:              1 if the price is strictly above sma + alpha * sigma
                              or strictly below sma - alpha * sigma, otherwise 0
    '''
    band = alpha * sigma
    above_upper = close_price > (sma + band)
    below_lower = close_price < (sma - band)
    # A breakout in either direction is flagged the same way.
    return int(above_upper or below_lower)

# flag each row whose close breaks out of the Bollinger band built from the previous (lookback) rows
def trend(df, lookback, alpha):
    '''
    based on look-back period, set boolean for exceeding volatility

    @param DataFrame df:  frame with a 'Close' column
    @param int lookback:  number of preceding rows used for the mean/std
    @param alpha:         band width in standard deviations (see bollinger)
    @return DataFrame:    copy of df with an integer 'up_down' column appended;
                          the input frame is left untouched
    '''
    def _breakout(window):
        # .iloc makes the positional access explicit. Plain window[-1] on a
        # date-indexed Series relies on a deprecated integer fallback that
        # newer pandas releases no longer support.
        history = window.iloc[:-1]
        return bollinger(window.iloc[-1], history.mean(), history.std(), alpha)

    new_df = df.copy()
    # Window = `lookback` previous rows plus the current one. min_periods=1
    # lets the first rows through; with too little history the mean/std are
    # NaN, the comparisons in bollinger() are False and the flag is 0.
    new_df['up_down'] = (
        new_df['Close']
        .rolling(lookback + 1, min_periods=1)
        .apply(_breakout, raw=False)
        .astype(int)
    )
    return new_df

def build_cnn():
    """Assemble the convolutional feature extractor for 56x56 RGB chart images.

    Three strided convolutions followed by one max-pooling layer. No weights
    are loaded here, so the returned model carries its initial weights.
    For a 56x56 input the spatial size presumably collapses to 1x1 with 256
    channels - confirm with model.summary().
    """
    input_image = Input(shape=(56, 56, 3))

    # (filters, kernel size) for each convolution; all use stride 3 and
    # 'valid' padding.
    conv_specs = [
        (64, (5, 5)),
        (128, (3, 3)),
        (256, (3, 3)),
    ]
    feature = input_image
    for filters, kernel_size in conv_specs:
        feature = Conv2D(filters, kernel_size, strides=(3, 3), padding='valid')(feature)
    feature = MaxPooling2D((2, 2), strides=(2, 2))(feature)

    return Model(inputs=input_image, outputs=feature)

def build_df(df, lookback, alpha):
    """Join macro features onto a stock frame, label it, and split it.

    Reads the module-level `macro_data`, `yield_spread_df` and `covidrates`.
    Only rows dated before the first COVID observation are kept, and the two
    COVID columns are removed.

    Returns (dataset_train, dataset_test, data): two numpy arrays and the
    labelled DataFrame they were taken from.
    """
    joined = df
    for macro_df in macro_data:
        joined = joined.join(macro_df)
    joined = joined.join(yield_spread_df)

    # divide sequence into only 2: non-covid and covid sequences
    covid_start = covidrates.index[0]

    # keep the pre-covid rows and drop the covid-only columns
    precovid_df = (
        joined.loc[joined.index < covid_start]
        .drop(columns=['covid_deaths', 'covid_current_hospitalized'])
    )

    # # for covid only
    # covid_df = df.loc[df.index >= covid_start].copy()

    # add our boolean up or down by lookback days (trend() works on a copy)
    data = trend(precovid_df, lookback, alpha)
    dataset = data.to_numpy()

    # First 1/3.3 of the rows -> test set, the remainder -> training set.
    # NOTE(review): assuming rows are in chronological order, the test set is
    # the *earliest* data and training uses later data - confirm this is intended.
    split_ratio = 3.3
    split_index = int(len(dataset) // split_ratio)

    return dataset[split_index:], dataset[:split_index], data

In [None]:
# Settings read as globals by build_array().
lookback = 5   # rows per input window; passed to build_df() and create_dataset()
alpha = 1.5    # Bollinger band width in standard deviations; passed to build_df()
# One feature extractor, built once and shared by every ticker. build_cnn()
# loads no weights, so this model keeps its initial weights.
cnn = build_cnn()

def build_array(ticker):
    """Build the train/test windows for one ticker.

    Uses the module-level `lookback`, `alpha` and `cnn`.
    Returns the tuple (x_train, y_train, x_test, y_test).
    """
    print("working on:", ticker)
    history_df = get_historical(ticker)
    dataset_train, dataset_test, data = build_df(history_df, lookback, alpha)

    # The last column is the label; every column before it is an input.
    y_col_index = len(data.columns) - 1
    x_col_indexes = list(range(y_col_index))

    train_pair = create_dataset(dataset_train, cnn, x_col_indexes, y_col_index, lookback)
    test_pair = create_dataset(dataset_test, cnn, x_col_indexes, y_col_index, lookback)

    return (*train_pair, *test_pair)

In [None]:
# tech = ['MSFT', 'AAPL', 'INTC','XOM', 'WMT', 'BP']
tech = ['MSFT', 'XOM', 'AAPL']
techtest = ['MSFT']

# Build (x_train, y_train, x_test, y_test) for each ticker, regroup the
# results by kind, and stack each kind across tickers along the first axis.
x_train, y_train, x_test, y_test = (
    np.vstack(parts)
    for parts in zip(*[build_array(ticker) for ticker in tech])
)

In [None]:
# Sanity check: array shapes and the first label row.
print(x_train.shape, y_train.shape, y_train[0])

In [None]:
scaler = StandardScaler()


def _scale_windows(windows, transform):
    """Apply a scaler method to the last axis of `windows`, keeping its shape.

    scaler only accepts 2 dim, so the array is flattened to
    (rows, columns) before the call and reshaped back afterwards.
    """
    flat = windows.reshape(-1, windows.shape[-1])
    return transform(flat).reshape(windows.shape)


# Fit the column statistics on the training windows only, then reuse them
# for the test windows.
x_train_std = _scale_windows(x_train, scaler.fit_transform)
x_test_std = _scale_windows(x_test, scaler.transform)

In [None]:
'''
Walk-Forward Validation
Input                   Predict
[t-4 . . . t]          [t+1 . . . t+5]
[t-9 . . . t]          [t+1 . . . t+5]
[t-14 . . . t]         [t+1 . . . t+5]

In our case, we have to set timesteps to multiples of 5 (5-day candlestick patterns)

'''

In [None]:
# shape=[samples, timesteps, features]
timesteps = lookback

# training windows
samples, features = x_train_std.shape[0], x_train_std.shape[-1]
x_train_std = x_train_std.reshape((samples, timesteps, features))

# test windows (samples/features are rebound to the test-set values)
samples, features = x_test_std.shape[0], x_test_std.shape[-1]
x_test_std = x_test_std.reshape((samples, timesteps, features))

In [None]:
# Cast both input arrays to single precision.
x_train_std, x_test_std = (
    windows.astype('float32') for windows in (x_train_std, x_test_std)
)

In [None]:
# Sanity check: shapes of the model inputs and labels.
print(x_train_std.shape, y_train.shape)

In [None]:
# Model dimensions taken from the prepared arrays.
n_timesteps, n_features = x_train_std.shape[1:3]
n_outputs = y_train.shape[1]


def lstm():
    """Build the bidirectional LSTM encoder/decoder.

    Reads the module-level n_timesteps, n_features and n_outputs. The encoder
    compresses the input window into one vector, which is repeated n_outputs
    times and decoded into one sigmoid value per output step.
    """
    input_feature = Input(shape=(n_timesteps, n_features))

    # encoder: whole window -> single vector
    encoder_cell = LSTM(
        40,
        activation='relu',
        input_shape=(n_timesteps, n_features),
        return_sequences=False,
    )
    encoded = Bidirectional(encoder_cell)(input_feature)

    # one copy of the encoding per output step
    repeated = RepeatVector(n_outputs)(encoded)

    # decoder: one hidden state per output step, then one unit per step
    decoded = Bidirectional(LSTM(n_outputs, return_sequences=True))(repeated)
    per_step = TimeDistributed(Dense(1))(decoded)
    out = Activation('sigmoid')(per_step)

    return keras.Model(inputs=input_feature, outputs=out)


In [None]:
from keras import metrics
# Earlier regression losses, kept for reference.
# MSE = keras.losses.MeanSquaredError()
# MAE = keras.losses.MeanAbsoluteError()

# Loss object passed to model.compile().
# NOTE(review): lstm() ends in a one-unit sigmoid per output step and the
# labels come from a 0/1 column, whereas categorical cross-entropy is normally
# paired with one-hot class targets - confirm this is the intended loss (a
# BinaryCrossentropy alternative is commented out below).
CC = keras.losses.CategoricalCrossentropy()

# sgd_opt = keras.optimizers.SGD(learning_rate=4e-4)

# BCE = keras.losses.BinaryCrossentropy()
# rmsprop_opt = keras.optimizers.RMSprop(learning_rate=1e-4)

# learning rate reduce
# rmsprop_opt = keras.optimizers.RMSprop(learning_rate=1e-6)
# Optimizer passed to model.compile().
# NOTE(review): 1e-7 is a very small step size - confirm it is intended for
# training from freshly initialised weights.
adam_opt = keras.optimizers.Adam(learning_rate=1e-7)

In [None]:
model = lstm()
# model = lstm_attention()
# model = gru()
# The network ends in an independent sigmoid per forecast day and the labels
# are 0/1 flags per day, i.e. a multi-label binary problem:
#  * binary cross-entropy scores every day separately; categorical
#    cross-entropy would instead treat the output as one class distribution.
#  * BinaryAccuracy thresholds the sigmoid output at 0.5; metrics.Accuracy()
#    checks exact equality between predictions and labels, which a float
#    sigmoid output practically never satisfies.
# The metric is named 'accuracy' so history.history['accuracy'] stays valid.
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=adam_opt,
    metrics=[metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
model.summary()

In [None]:
# Train on the standardized windows; `history` keeps the per-epoch metrics.
# NOTE(review): no validation data is passed and the imported EarlyStopping
# callback is not used, so all 1000 epochs always run - confirm this is intended.
history = model.fit(x_train_std, y_train, epochs=1000, batch_size=24, verbose=2, shuffle=True)

In [None]:
# need this to show plot in colab
%matplotlib inline

In [None]:
# Training curves: one line per recorded series, drawn on the same axes.
for history_key, curve_label in (('loss', 'train'), ('accuracy', 'accuracy')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Score the trained model on the held-out windows.
# results[0] is the loss, results[1] the metric given at compile time.
results = model.evaluate(
    x_test_std,
    y_test,
    batch_size=24,
    verbose=2,
)
test_loss, test_accuracy = results[0], results[1]
print("Testing loss: ", test_loss)
print("Testing accuracy: ", test_accuracy)