In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import sklearn as sk
import yfinance as yf


## Data Retrieval

In [2]:
# Tickers analysed throughout the notebook.
symbols = ["AAPL", "MSFT", "INTC", "AMZN", "GS"]

# Reference end-of-day data: first CSV column becomes a parsed date index,
# and any row containing a missing value is discarded.
raw = (
    pd.read_csv("tr_eikon_eod_data.csv", index_col=0, parse_dates=True)
    .dropna()
)
raw

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,SPY,.SPX,.VIX,EUR=,XAU=,GDX,GLD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04,30.572827,30.950,20.88,133.90,173.08,113.33,1132.99,20.04,1.4411,1120.00,47.71,109.80
2010-01-05,30.625684,30.960,20.87,134.69,176.14,113.63,1136.52,19.35,1.4368,1118.65,48.17,109.70
2010-01-06,30.138541,30.770,20.80,132.25,174.26,113.71,1137.14,19.16,1.4412,1138.50,49.34,111.51
2010-01-07,30.082827,30.452,20.60,130.00,177.67,114.19,1141.69,19.06,1.4318,1131.90,49.10,110.82
2010-01-08,30.282827,30.660,20.83,133.52,174.31,114.57,1144.98,18.13,1.4412,1136.10,49.84,111.37
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.15,221.54,271.00,2717.07,17.33,1.1702,1265.00,22.01,119.89
2018-06-26,184.430000,99.080,49.67,1691.09,221.58,271.60,2723.06,15.92,1.1645,1258.64,21.95,119.26
2018-06-27,184.160000,97.540,48.76,1660.51,220.18,269.35,2699.63,17.91,1.1552,1251.62,21.81,118.58
2018-06-28,185.500000,98.630,49.25,1701.45,223.42,270.89,2716.31,16.85,1.1567,1247.88,21.93,118.22


In [3]:
# Date window handed to the price download below.
start = '2010-01-04'
end = '2018-06-30'

In [4]:

def fetch_stock_data(symbols, start_date, end_date, downloader=None):
    """Download price history for each symbol, one request per symbol.

    Parameters
    ----------
    symbols : iterable of str
        Ticker symbols to request.
    start_date, end_date : str
        Forwarded to the downloader as ``start=`` and ``end=``.
    downloader : callable, optional
        ``downloader(symbol, start=..., end=...)`` returning a DataFrame.
        Defaults to ``yf.download``; pass another callable to read from a
        cache or a different data source.

    Returns
    -------
    dict or None
        ``{symbol: DataFrame}`` covering every requested symbol, or ``None``
        if any symbol failed (the error is printed).
    """
    if downloader is None:
        downloader = yf.download

    stock_data_dict = {}
    try:
        for symbol in symbols:
            stock_data = downloader(symbol, start=start_date, end=end_date)
            # A failed download typically comes back as an empty frame rather
            # than an exception, so the except branch alone would never see
            # it. Treat "no rows" as a failure instead of passing an empty
            # frame to the cells that index into it.
            if stock_data is None or len(stock_data) == 0:
                raise ValueError(f"no rows returned for {symbol!r}")
            stock_data_dict[symbol] = stock_data
        return stock_data_dict
    except Exception as e:
        print(f"Error fetching data: {e}")
        return None

In [5]:
# Maps each ticker to its downloaded frame (None if the fetch reported an error).
df = fetch_stock_data(symbols, start_date=start, end_date=end)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [6]:
# Bind each ticker's downloaded frame to a variable named after the ticker.
AAPL, MSFT, INTC, AMZN, GS = (
    df[ticker] for ticker in ('AAPL', 'MSFT', 'INTC', 'AMZN', 'GS')
)

In [7]:
# Adjust stock prices
# Rescale the downloaded AAPL and AMZN price columns by fixed factors. After
# scaling, AAPL's Close lines up with the AAPL.O column of `raw`
# (e.g. 30.57 on 2010-01-04).
# NOTE(review): the factors 4 and 20 presumably undo later stock splits baked
# into the downloaded prices -- confirm. Volume is left unscaled; verify that
# is intended.
PRICE_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Adj Close']

# Start from a copy of the downloaded frame each time: scaling the frame in
# place would compound the factor whenever this cell is re-run, and would also
# silently alter the frames stored in `df`.
AAPL = df['AAPL'].copy()
AAPL[PRICE_COLUMNS] = AAPL[PRICE_COLUMNS] * 4

AMZN = df['AMZN'].copy()
AMZN[PRICE_COLUMNS] = AMZN[PRICE_COLUMNS] * 20

In [8]:
# Round every numeric column of each frame to 2 decimal places.
# `round` returns new frames, so the names are rebound rather than mutated.
AAPL, AMZN, MSFT, INTC, GS = (
    frame.round(2) for frame in (AAPL, AMZN, MSFT, INTC, GS)
)

In [9]:
# Binary label per row: 1 if the next row's Close is higher than this row's
# Close, otherwise 0 (lower or unchanged).
# NOTE(review): the final row has no next Close; shift(-1) yields NaN there and
# the comparison is False, so that row is always labelled 0.
for frame in (AAPL, AMZN, MSFT, INTC, GS):
    close_change = frame['Close'].shift(-1) - frame['Close']
    frame['Next Day Close'] = close_change.apply(lambda delta: int(delta > 0))


## Generate Technical Indicators

In [10]:
import talib as ta
from talib.abstract import *

In [11]:
# Display the AAPL frame, now including the 'Next Day Close' label column.
AAPL

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Next Day Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,30.50,30.64,30.34,30.57,25.85,493728200,1
2010-01-05,30.68,30.80,30.46,30.63,25.89,601904016,0
2010-01-06,30.63,30.75,30.11,30.14,25.48,552158376,0
2010-01-07,30.24,30.29,29.86,30.08,25.43,477129296,1
2010-01-08,30.06,30.29,29.87,30.28,25.60,447876324,0
...,...,...,...,...,...,...,...
2018-06-25,183.40,184.92,180.73,182.17,173.18,126652384,1
2018-06-26,182.99,186.53,182.54,184.43,175.33,98276804,0
2018-06-27,185.23,187.28,184.03,184.16,175.07,101141312,1
2018-06-28,184.10,186.21,183.80,185.50,176.35,69460940,0


In [16]:
def create_features(df):
    """Compute technical-indicator columns for one price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Open', 'High', 'Low', 'Close' and 'Volume' columns.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of: the STOCH output plus 'K - D', the
        BBANDS output plus 'band width', the MACD output, 'rsi' and 'obv'.
    """
    # The indicator functions are given the price columns under lower-case names.
    prices = df[['Open', 'High', 'Low', 'Close', 'Volume']].rename(
        columns={
            'Open': 'open',
            'High': 'high',
            'Low': 'low',
            'Close': 'close',
            'Volume': 'volume',
        }
    )

    # Stochastic oscillator (KD), plus the gap between the slow K and slow D lines.
    stoch = STOCH(prices, fastk_period=9, slowk_period=3, slowd_period=3)
    stoch['K - D'] = stoch['slowk'] - stoch['slowd']

    # Bollinger bands, plus band width relative to the middle band.
    bands = BBANDS(prices, timeperiod=5, nbdevup=2.0, nbdevdn=2.0, matype=0)
    band_range = bands['upperband'] - bands['lowerband']
    bands['band width'] = band_range / bands['middleband']

    # MACD, RSI and OBV, each called with no extra arguments.
    macd_frame = MACD(prices)
    rsi_frame = pd.DataFrame(RSI(prices), columns=['rsi'])
    obv_frame = pd.DataFrame(OBV(prices), columns=['obv'])

    return pd.concat([stoch, bands, macd_frame, rsi_frame, obv_frame], axis=1)
    




In [36]:
# Sequential split 80/20
# With n_splits=4 each test fold is about one fifth of the rows, so the last
# split the splitter yields trains on roughly the first 80% of rows and tests
# on the final 20%.
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=4)

# Features: GS prices plus its technical indicators, with the label removed.
X = pd.concat([GS, create_features(GS)], axis=1).drop(columns=['Next Day Close'])

# The target must come from the same symbol as the features; it was previously
# taken from MSFT, pairing GS features with MSFT labels.
y = GS['Next Day Close']

In [37]:
# Only the last split yielded by the splitter is kept for train/test.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [39]:
# Write each split to its own CSV, tagged with the symbol it belongs to.
symbol = 'GS'
split_outputs = (
    (X_train, f'X_train_{symbol}.csv'),
    (X_test, f'X_test_{symbol}.csv'),
    (y_train, f'y_train_{symbol}.csv'),
    (y_test, f'y_test_{symbol}.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)
