In [114]:
import ta
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import tensorflow as tf

In [115]:
volumes = pd.read_csv('Data/stock_volumes.csv', index_col = 'date', parse_dates = True)
prices = pd.read_csv('Data/stock_prices.csv', index_col = 'date', parse_dates = True)
info = pd.read_csv('Data/stock_info.csv')
sp_listings = pd.read_csv('Data/sp500_listings.csv')                   

In [116]:
stocks = ['AMZN']
info[info["MNEM"].str.contains('|'.join(stocks), na= False)]

Unnamed: 0,Instrument,MNEM,RIC,ISIN,NAME,ESTAT
883,891399,@AMZN,AMZN.O,US0231351067,AMAZON.COM,ACT.


In [117]:
df = pd.DataFrame(prices['891399']).loc['2000-05-1':'2021-05-31']

In [118]:
df['volume'] = pd.DataFrame(volumes['891399']).loc['2000-05-1':'2021-05-31']
df = df.rename(columns={"891399": "close"})
df.head()

Unnamed: 0_level_0,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-05-01,59.9375,8578.2
2000-05-02,56.125,7427.6
2000-05-03,54.125,5424.8
2000-05-04,55.0625,4725.2
2000-05-05,58.5,4060.9


**Add targets**

The target to be predicted in the $i^{th}$ day is calculated as follows:

$$target_i = sign(P_{i+d} - P_i)$$

Where $d$ is the number of days after which the prediction is to be made

In [119]:
df['label'] = df.close.shift(-1) - df.close
df.label = df.label.apply(lambda x: 0 if x < 0 else 1)
df.head()

Unnamed: 0_level_0,close,volume,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-05-01,59.9375,8578.2,0
2000-05-02,56.125,7427.6,0
2000-05-03,54.125,5424.8,1
2000-05-04,55.0625,4725.2,1
2000-05-05,58.5,4060.9,0


**Features: Technical Indicators**

In [120]:
# Momentum indicators
df['roc'] = ta.momentum.roc(close=df.close) # Rate of Change (ROC)
df['rsi'] = ta.momentum.rsi(close=df.close) # Relative Strength Index (RSI)
df['tsi'] = ta.momentum.tsi(close=df.close) # True strength index (TSI)

# Volatility indicators
bb_indicator = ta.volatility.BollingerBands(close=df.close)
df['bb_bbhi'] = bb_indicator.bollinger_hband_indicator() # Bollinger Band high indicator
df['bb_bbli'] = bb_indicator.bollinger_lband_indicator() # Bollinger Band low indicator

# Trend indicators
aroon_indicator = ta.trend.AroonIndicator(close=df.close)
macd_indicator = ta.trend.MACD(close=df.close)
kst_indicator = ta.trend.KSTIndicator(close=df.close)
df['aroon_down'] = aroon_indicator.aroon_down() # Aroon Down Channel
df['aroon'] = aroon_indicator.aroon_indicator() # Aroon Indicator
df['aroon_up'] = aroon_indicator.aroon_up() # Aroon Up Channel
df['macd_line'] = macd_indicator.macd() # MACD Line
df['macd_hist'] = macd_indicator.macd_diff() # MACD Histogram
df['macd_signal'] = macd_indicator.macd_signal() # MACD Signal Line
df['kst'] = kst_indicator.kst() # Know Sure Thing (KST)
df['kst_diff'] = kst_indicator.kst_diff() # Diff Know Sure Thing (KST)
df['kst_signal'] = kst_indicator.kst_sig() # Signal Line Know Sure Thing (KST)
df['dpo'] = ta.trend.dpo(close=df.close) # Detrended Price Oscillator (DPO)
df['trix'] = ta.trend.trix(close=df.close) # Trix (TRIX)
df['sma_10'] = ta.trend.sma_indicator(close=df.close, window=10) # SMA n=10
df['sma_20'] = ta.trend.sma_indicator(close=df.close, window=20) # SMA n=20
df['sma_30'] = ta.trend.sma_indicator(close=df.close, window=30) # SMA n=30
df['sma_60'] = ta.trend.sma_indicator(close=df.close, window=60) # SMA n=60
df['ema_10'] = ta.trend.sma_indicator(close=df.close, window=10) # EMA n=10
df['ema_20'] = ta.trend.sma_indicator(close=df.close, window=20) # EMA n=20
df['ema_30'] = ta.trend.sma_indicator(close=df.close, window=30) # EMA n=30
df['ema_60'] = ta.trend.sma_indicator(close=df.close, window=60) # EMA n=60

# Volume indicators
df['obv'] = ta.volume.on_balance_volume(close=df.close, volume=df.volume) # On Balance Volume (OBV)
df['vpt'] = ta.volume.volume_price_trend(close=df.close, volume=df.volume) # Volume-price trend (VPT)
df['fi'] = ta.volume.force_index(close=df.close, volume=df.volume) # Force Index (FI)
df['nvi'] = ta.volume.negative_volume_index(close=df.close, volume=df.volume) # Negative Volume Index (NVI)

df.tail()

Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,...,sma_30,sma_60,ema_10,ema_20,ema_30,ema_60,obv,vpt,fi,nvi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-24,3244.99,2422.8,1,-1.856417,48.172664,-6.814223,0.0,0.0,68.0,-36.0,...,3313.388,3215.229,3219.044,3288.1235,3313.388,3215.229,1505136.7,-24.671503,-23573.005285,670.966455
2021-05-25,3259.05,3261.1,1,-0.989182,49.577056,-6.018646,0.0,0.0,64.0,-36.0,...,3308.689667,3217.110833,3222.558,3280.2045,3308.689667,3217.110833,1508397.8,45.830399,-13655.280816,670.966455
2021-05-26,3265.16,2384.0,0,2.340393,50.208486,-5.186664,0.0,0.0,60.0,-36.0,...,3306.428333,3219.954667,3233.88,3270.5375,3306.428333,3219.954667,1510781.8,18.599276,-9623.634985,672.224369
2021-05-27,3230.11,2561.2,0,0.192313,46.603165,-5.458353,0.0,0.0,56.0,-36.0,...,3301.462333,3223.7065,3240.744,3258.4775,3301.462333,3223.7065,1508220.6,-23.023837,-21073.124273,672.224369
2021-05-28,3223.07,2331.5,1,2.256705,45.890381,-5.877039,0.0,0.0,52.0,-36.0,...,3295.583333,3227.798167,3240.761,3246.26,3295.583333,3227.798167,1505889.1,-32.574798,-20407.500805,670.759261


**Features: Datetime cyclical encoding**

$$x_{sin} = \sin(\frac{2*\pi*x}{max(x)})$$

$$x_{cos} = \cos(\frac{2*\pi*x}{max(x)})$$

In [121]:
df['datetime'] = df.index.to_pydatetime()
df['day_sin'] = np.sin(2 * np.pi * df.datetime.dt.day / 30)
df['day_cos'] = np.cos(2 * np.pi * df.datetime.dt.day / 30)
df['month_sin'] = np.sin(2 * np.pi * df.datetime.dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df.datetime.dt.month / 12)

df = df.drop(['datetime'], axis=1)
print(df.shape)
df.tail()

(5304, 35)


Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,...,ema_30,ema_60,obv,vpt,fi,nvi,day_sin,day_cos,month_sin,month_cos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-24,3244.99,2422.8,1,-1.856417,48.172664,-6.814223,0.0,0.0,68.0,-36.0,...,3313.388,3215.229,1505136.7,-24.671503,-23573.005285,670.966455,-0.951057,0.309017,0.5,-0.866025
2021-05-25,3259.05,3261.1,1,-0.989182,49.577056,-6.018646,0.0,0.0,64.0,-36.0,...,3308.689667,3217.110833,1508397.8,45.830399,-13655.280816,670.966455,-0.866025,0.5,0.5,-0.866025
2021-05-26,3265.16,2384.0,0,2.340393,50.208486,-5.186664,0.0,0.0,60.0,-36.0,...,3306.428333,3219.954667,1510781.8,18.599276,-9623.634985,672.224369,-0.743145,0.669131,0.5,-0.866025
2021-05-27,3230.11,2561.2,0,0.192313,46.603165,-5.458353,0.0,0.0,56.0,-36.0,...,3301.462333,3223.7065,1508220.6,-23.023837,-21073.124273,672.224369,-0.587785,0.809017,0.5,-0.866025
2021-05-28,3223.07,2331.5,1,2.256705,45.890381,-5.877039,0.0,0.0,52.0,-36.0,...,3295.583333,3227.798167,1505889.1,-32.574798,-20407.500805,670.759261,-0.406737,0.913545,0.5,-0.866025


In [122]:
df_na = df.dropna(axis=0)
print(df_na.shape)
df_na.head()

(5245, 35)


Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,...,ema_30,ema_60,obv,vpt,fi,nvi,day_sin,day_cos,month_sin,month_cos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-07-25,37.625,27743.9,0,4.152249,43.173187,-14.062078,0.0,0.0,60.0,-56.0,...,39.695833,46.492968,-56541.9,-1139.15773,-629.318075,659.593185,-0.866025,0.5,-0.5,-0.866025
2000-07-26,36.0625,15452.1,0,2.85205,40.134541,-15.064725,0.0,0.0,56.0,-52.0,...,39.264583,46.095052,-71994.0,-1447.166569,-3988.544957,632.201442,-0.743145,0.669131,-0.5,-0.866025
2000-07-27,31.375,23576.7,0,-5.283019,32.699086,-18.375261,0.0,1.0,100.0,-36.0,...,38.7625,45.682552,-95570.7,-3706.260897,-19206.72157,632.201442,-0.587785,0.809017,-0.5,-0.866025
2000-07-28,30.0,12444.2,1,-14.438503,30.891204,-21.659706,0.0,1.0,100.0,-40.0,...,38.21875,45.280468,-108014.9,-3609.925738,-18907.300632,604.495403,-0.406737,0.913545,-0.5,-0.866025
2000-07-31,30.125,9478.4,1,-13.928571,31.263266,-24.163073,0.0,0.0,96.0,-40.0,...,37.689583,44.864843,-98536.5,-505.870013,-16037.000541,607.014133,0.207912,0.978148,-0.5,-0.866025


In [123]:
labels = df_na.label
df_na = df_na.drop(['label'], axis=1)

**Writing data to csv**

In [135]:
df_na.to_csv('amzn_cleaned')

In [124]:
df_na.columns

Index(['close', 'volume', 'roc', 'rsi', 'tsi', 'bb_bbhi', 'bb_bbli',
       'aroon_down', 'aroon', 'aroon_up', 'macd_line', 'macd_hist',
       'macd_signal', 'kst', 'kst_diff', 'kst_signal', 'dpo', 'trix', 'sma_10',
       'sma_20', 'sma_30', 'sma_60', 'ema_10', 'ema_20', 'ema_30', 'ema_60',
       'obv', 'vpt', 'fi', 'nvi', 'day_sin', 'day_cos', 'month_sin',
       'month_cos'],
      dtype='object')

**Splitting the dataset**

In [125]:
X_train, X_test, y_train, y_test = train_test_split(df_na.values, labels.values, test_size=0.05, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4982, 34), (263, 34), (4982,), (263,))

**Scaling**

In [126]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [127]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Principal Component Analysis (PCA)**

In [128]:
pca = PCA(n_components=0.8, random_state=42)
pca.fit(X_train_scaled)

PCA(n_components=0.8, random_state=42)

In [129]:
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [130]:
X_train_pca.shape, X_test_pca.shape

((4982, 9), (263, 9))

**Logistic Regression**

In [131]:
model = LogisticRegression()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

LogisticRegression: acc=49.81% - f1=61.63% - roc=52.73%


**Decision Trees**

In [132]:
model = DecisionTreeClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

LogisticRegression: acc=54.37% - f1=55.56% - roc=54.34%


**Random Forest**

In [133]:
model = RandomForestClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'RandomForestClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

RandomForestClassifier: acc=52.09% - f1=53.68% - roc=51.85%


**Gradient Boosting**

In [134]:
model = GradientBoostingClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'GradientBoostingClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

GradientBoostingClassifier: acc=55.89% - f1=62.82% - roc=53.48%
