##### Imports and Config

In [1]:
import sys
import os
import pandas as pd
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import yfinance as yf
from datetime import datetime
from src.components import data_ingestion, data_transformation, model_trainer

In [2]:
plt.style.use("fivethirtyeight")
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
CWD_PATH = "/home/james/Projects/stock_price_prediction/"
DATA_PATH = "/home/james/Projects/stock_price_prediction/artifacts/datasets"
sys.path.append(CWD_PATH)

##### Query data

In [3]:
AAPL = data_ingestion.get_stock_data("AAPL", start_date="2004-07-01", end_date="2024-07-01", save_as="")

[*********************100%%**********************]  1 of 1 completed


In [6]:
AAPL.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-01-02,0.39375,0.416071,0.392143,0.416071,529496800
2002-01-03,0.410714,0.424107,0.406607,0.421071,612007200
2002-01-04,0.416786,0.427679,0.410536,0.423036,409976000
2002-01-07,0.423571,0.428571,0.40625,0.408929,444584000
2002-01-08,0.40625,0.411607,0.401071,0.40375,450038400


In [7]:
from json import loads, dumps
result = AAPL.to_json(orient="index")
parsed = loads(result)
dumps(parsed, indent=4)

'{\n    "1009929600000": {\n        "Open": 0.3937500119,\n        "High": 0.4160709977,\n        "Low": 0.3921430111,\n        "Close": 0.4160709977,\n        "Volume": 529496800\n    },\n    "1010016000000": {\n        "Open": 0.4107140005,\n        "High": 0.4241069853,\n        "Low": 0.406607002,\n        "Close": 0.4210709929,\n        "Volume": 612007200\n    },\n    "1010102400000": {\n        "Open": 0.4167859852,\n        "High": 0.4276790023,\n        "Low": 0.4105359912,\n        "Close": 0.4230360091,\n        "Volume": 409976000\n    },\n    "1010361600000": {\n        "Open": 0.4235709906,\n        "High": 0.4285709858,\n        "Low": 0.40625,\n        "Close": 0.4089289904,\n        "Volume": 444584000\n    },\n    "1010448000000": {\n        "Open": 0.40625,\n        "High": 0.4116069973,\n        "Low": 0.401071012,\n        "Close": 0.4037500024,\n        "Volume": 450038400\n    },\n    "1010534400000": {\n        "Open": 0.4071429968,\n        "High": 0.4094640017

In [3]:
macro = data_ingestion.get_macro_data("EFFR UNRATE UMCSENT", start_date="01-01-2002")

In [4]:
macro.head(30)

Unnamed: 0,EFFR,UNRATE,UMCSENT
2002-01-01,1.92,5.7,93.0
2002-01-02,1.92,5.7,93.0
2002-01-03,1.72,5.7,93.0
2002-01-04,1.61,5.7,93.0
2002-01-07,1.61,5.7,93.0
2002-01-08,1.61,5.7,93.0
2002-01-09,1.74,5.7,93.0
2002-01-10,1.81,5.7,93.0
2002-01-11,1.71,5.7,93.0
2002-01-14,1.78,5.7,93.0


##### Format data for LSTM

In [40]:
name = "SP500"

In [50]:
working_df = pd.read_pickle(os.path.join(DATA_PATH, f"{name}/{name}_20y.pkl"))
# working_df["Open_dff"] = data_transformation.close_open_diff(working_df)
macro_df = pd.read_pickle(os.path.join(DATA_PATH, "macro/macro_20y.pkl"))
working_df = working_df.merge(macro_df, how="inner", left_index=True, right_index=True)


In [51]:
data_transformation.get_technical_indicators(working_df, inplace=True)

In [52]:
working_df.head()

Unnamed: 0,Open,High,Low,Close,Volume,EFFR,UNRATE,UMCSENT,^VIX,DX-Y.NYB,MACD,ATR,RSI
2004-07-01,1140.839966,1140.839966,1123.060059,1128.939941,1495700000,1.4,5.5,96.7,15.2,88.790001,0.0,17.779907,
2004-07-02,1128.939941,1129.150024,1123.26001,1125.380005,1085000000,1.25,5.5,96.7,15.08,87.970001,0.283984,11.614778,-0.0
2004-07-06,1125.380005,1125.380005,1113.209961,1116.209961,1283300000,1.3,5.5,96.7,16.25,88.120003,1.234755,11.81374,-0.0
2004-07-07,1116.209961,1122.369995,1114.920044,1118.329956,1328600000,1.26,5.5,96.7,15.81,87.650002,1.796474,10.598698,-19.981131
2004-07-08,1118.329956,1119.119995,1108.719971,1109.109985,1401100000,1.26,5.5,96.7,16.200001,87.540001,2.95159,10.552867,-10.690871


In [53]:
working_df.drop(columns=["Open", "High", "Low"], inplace=True)

In [55]:
working_df.corr()

Unnamed: 0,Close,Volume,EFFR,UNRATE,UMCSENT,^VIX,DX-Y.NYB,MACD,ATR,RSI
Close,1.0,0.145856,0.196423,-0.500507,-0.092746,-0.103582,0.76856,-0.250057,0.663943,-0.016256
Volume,0.145856,1.0,-0.29723,0.294812,-0.452644,0.663984,0.047474,0.191838,0.470283,0.005415
EFFR,0.196423,-0.29723,1.0,-0.567089,0.043574,-0.252733,0.239582,-0.069143,0.028163,-0.020295
UNRATE,-0.500507,0.294812,-0.567089,1.0,-0.367131,0.361199,-0.58623,-0.026569,-0.175842,0.011561
UMCSENT,-0.092746,-0.452644,0.043574,-0.367131,1.0,-0.53759,0.166752,-0.110941,-0.402976,0.014636
^VIX,-0.103582,0.663984,-0.252733,0.361199,-0.53759,1.0,-0.094293,0.476199,0.519556,0.001298
DX-Y.NYB,0.76856,0.047474,0.239582,-0.58623,0.166752,-0.094293,1.0,-0.066672,0.571282,-0.016216
MACD,-0.250057,0.191838,-0.069143,-0.026569,-0.110941,0.476199,-0.066672,1.0,0.300614,-0.003653
ATR,0.663943,0.470283,0.028163,-0.175842,-0.402976,0.519556,0.571282,0.300614,1.0,-0.009606
RSI,-0.016256,0.005415,-0.020295,0.011561,0.014636,0.001298,-0.016216,-0.003653,-0.009606,1.0


In [56]:
train_df, val_df, test_df = data_transformation.train_val_test_ordered_split(working_df)

In [57]:
train_df.to_pickle(os.path.join(DATA_PATH, f"{name}/train_df.pkl"))
val_df.to_pickle(os.path.join(DATA_PATH, f"{name}/val_df.pkl"))
test_df.to_pickle(os.path.join(DATA_PATH, f"{name}/test_df.pkl"))