In [6]:
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from keras import backend as K

In [24]:
file_path = "./30_yr_stock_market_data.csv"
num_columns = len(np.genfromtxt(file_path, delimiter=',', max_rows=1))
load_string = lambda s: str(s)[2:-1]

value_converters = {i: lambda s: float(s or 0) for i in range(1,num_columns)}
value_converters[0] = load_string
name_converters = {i: load_string for i in range(num_columns)}

names = np.genfromtxt(
    file_path, 
    delimiter=',',
    converters = name_converters,
    max_rows=1
    ).item()

raw_data = np.genfromtxt(
    file_path, 
    delimiter=',', 
    skip_header=1,
    converters = value_converters,
    )

dates = [_data[0] for _data in raw_data]
dow_jones = [_data[1] for _data in raw_data]
sp500 = [_data[4] for _data in raw_data]
nasdaq = [_data[6] for _data in raw_data]
nyse = [_data[7] for _data in raw_data]
russell = [_data[8] for _data in raw_data]
cboe_volitility = [_data[9] for _data in raw_data]

data = [dates, dow_jones, sp500, nasdaq, nyse, russell, cboe_volitility]
selected_name = ["dates", "dow_jones", "sp500", "nasdaq", "nyse", "russell", "cboe_volitility"]

invalid_index = []
invalidations = {}
for i in range(len(dates)):
    if sp500[i] == 0:
        invalid_index.append(i)
        invalidations[dates[i]] = i

invalid_index.reverse()
for ind in invalid_index:
    for j in range(len(data)):
        stock_data = data[j]
        stock_data.pop(ind)
     

In [12]:
def get_day_of_year(date_str: str):
    date_object = datetime.strptime(date_str, "%Y-%m-%d")
    return (date_object - datetime(date_object.year, 1, 1)).days + 1

def get_predicting_date(date_str: str, predict_period: int):
    date_object = datetime.strptime(date_str, "%Y-%m-%d")
    return date_object + timedelta(days=predict_period)


def get_date_diff(predict_date: str, latest_date: str):
    predict_date_object = datetime.strptime(predict_date, "%Y-%m-%d")
    latest_date_object = datetime.strptime(latest_date, "%Y-%m-%d")

    return (predict_date_object - latest_date_object).days - 10

def calculate_growth(base: int, target: int):
    return target/base

def calculate_acc(y_predicted, y_test):
    cnt = 0
    for i in range(len(y_predicted)):
        y_p = y_predicted[i]
        y_t = y_test[i]
        if y_p > 1 and y_t <= 1:
            cnt += 1
        if y_p <= 1 and y_t > 1:
            cnt += 1
    
    print(cnt, "/", len(y_test))
    return 100 - cnt/len(y_test)*100


In [13]:
"""
PROCESS DATA

X_data: proportion vs expected y_value
Y_data: proportion vs previous day y_value
"""
INPUT_SIZE = 90
PREDICT_PERIOD = 30
DATA_LENGTH = len(sp500)
X_data = []
Y_data = []


for i in range(DATA_LENGTH - INPUT_SIZE - PREDICT_PERIOD + 1):
    stack_data = []
    
    start_train_idx = i
    end_train_idx = start_train_idx + INPUT_SIZE - 1 
    predict_idx = end_train_idx + PREDICT_PERIOD
    
    
    y_based_value = sp500[end_train_idx]
    doy = get_day_of_year(dates[end_train_idx])
    
    for stock_data in data[1:]:
        interval_data = [doy] + [calculate_growth(x, y_based_value) for x in stock_data[start_train_idx : start_train_idx+INPUT_SIZE]]
        stack_data.append(interval_data)

    X_data.append(stack_data)
    
    y_growth = calculate_growth(sp500[start_train_idx], sp500[predict_idx])
    
    Y_data.append(y_growth)


X_data = np.array(X_data)
Y_data = np.array(Y_data)
x_train, x_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2)

In [14]:
pre_ft_model = keras.models.load_model("./models/prod_ft.h5")

fine_tuned_test_loss = pre_ft_model.evaluate(x_test, y_test)
print(f"Fine Tuned Mean Absoluted Error on Test Data: {fine_tuned_test_loss}")

# Make predictions
fine_tuned_predictions = pre_ft_model.predict(x_test)

Fine Tuned Mean Absoluted Error on Test Data: 0.009391771629452705


2023-12-07 15:31:26.969733: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [15]:
y_predicted = np.array([predict[0] for predict in fine_tuned_predictions])
ft_acc = calculate_acc(y_predicted, y_test)
print("Acc:", ft_acc)

39 / 1487
Acc: 97.37726967047747


In [33]:
"""
Process data to predict
"""
INPUT_SIZE = 90
PREDICT_PERIOD = 30
DATA_LENGTH = len(sp500)


PREDICT_DATE = "2023-11-17"


DATE_DIFF = get_date_diff(PREDICT_DATE, dates[-1])
predict_idx = DATA_LENGTH + PREDICT_PERIOD + DATE_DIFF - 1
end_train_idx = predict_idx - PREDICT_PERIOD
start_train_idx = end_train_idx - INPUT_SIZE + 1

    
predict_data = []
print(len(sp500), end_train_idx)
y_based_value = sp500[end_train_idx]
doy = get_day_of_year(dates[end_train_idx])

for stock_data in data[1:]:
    interval_data = [doy] + [calculate_growth(x, y_based_value) for x in stock_data[start_train_idx : start_train_idx + INPUT_SIZE]]
    predict_data.append(interval_data)


print(start_train_idx, end_train_idx, predict_idx)
predictions = pre_ft_model.predict([predict_data])
print("Predict date:", get_predicting_date(dates[end_train_idx], PREDICT_PERIOD))
print("OG:", sp500[start_train_idx])
print("Predicted growth:", predictions[0][0])
print("Predicted value:", sp500[start_train_idx] * predictions[0][0])

7554 7543
7454 7543 7573
Predict date: 2023-12-03 00:00:00
OG: 4396.43994140625
Predicted growth: 1.0704906
Predicted value: 4706.347624930437


In [174]:
# FT_RANGE = 0
# _y_test = y_test[FT_RANGE:]
# _y_predicted = fine_tuned_predictions[FT_RANGE:]

# data_range = range(FT_RANGE, len(_y_test))
# plt.figure(figsize=(70, 10))

# plt.plot(data_range, _y_test, label='Y TEST')
# plt.plot(data_range, _y_predicted, label='Y PREDICTED')
# plt.axhline(y=1, color='r', linestyle='-', label='y=1')
# plt.legend(loc='upper right')
# plt.title('Test and Predicted')
# plt.savefig('output_plot.png')

# plt.show()

In [38]:
def predict_future(predict_dates: list[str]):
    INPUT_SIZE = 90
    PREDICT_PERIOD = 30
    DATA_LENGTH = len(sp500)
    res = {}
    
    for date in predict_dates:
        DATE_DIFF = get_date_diff(date, dates[-1])
        print(DATE_DIFF)
        predict_idx = DATA_LENGTH + PREDICT_PERIOD + DATE_DIFF - 1
        end_train_idx = predict_idx - PREDICT_PERIOD
        start_train_idx = end_train_idx - INPUT_SIZE + 1
            
        predict_data = []
        print(date)
        y_based_value = sp500[end_train_idx]
        doy = get_day_of_year(dates[end_train_idx])
        
        for crypto_data in data[1:]:
            interval_data = [doy] + [calculate_growth(x, y_based_value) for x in crypto_data[start_train_idx : start_train_idx + INPUT_SIZE]]
            predict_data.append(interval_data)
        
        predictions = pre_ft_model.predict([predict_data])
        res[date] = sp500[start_train_idx] * predictions[0][0]
    return res

In [39]:
print("Latest:", dates[-1])
predict_dates = [f"2023-11-{day}" for day in range(1,30)]
future = predict_future(predict_dates)
future

Latest: 2023-11-17
-26
2023-11-1
-25
2023-11-2
-24
2023-11-3
-23
2023-11-4
-22
2023-11-5
-21
2023-11-6
-20
2023-11-7
-19
2023-11-8
-18
2023-11-9
-17
2023-11-10
-16
2023-11-11
-15
2023-11-12
-14
2023-11-13
-13
2023-11-14
-12
2023-11-15
-11
2023-11-16
-10
2023-11-17
-9
2023-11-18
-8
2023-11-19
-7
2023-11-20
-6
2023-11-21
-5
2023-11-22
-4
2023-11-23
-3
2023-11-24
-2
2023-11-25
-1
2023-11-26
0
2023-11-27
1
2023-11-28


IndexError: list index out of range

In [215]:
low = future["2023-12-2"]
high = future["2023-12-9"]
gain = (high - low)/low
invest = 50
print(invest*gain)

6.946726504257004


In [27]:
print(len(invalidations))
for k,v in invalidations.items():
    print(k, v)

198
1993-11-17 0
1993-11-25 6
1993-12-24 27
1994-02-21 68
1994-04-27 114
1994-05-30 137
1994-07-04 162
1994-09-05 207
1994-11-24 265
1995-01-02 291
1995-02-20 326
1995-05-29 395
1995-07-04 421
1995-09-04 465
1995-11-23 523
1996-02-19 583
1996-05-27 652
1996-07-04 680
1996-09-02 722
1996-11-28 785
1997-02-17 840
1997-05-26 909
1997-07-04 938
1997-09-01 979
1997-11-27 1042
1998-01-19 1077
1998-02-16 1097
1998-05-25 1166
1998-07-03 1195
1998-09-07 1241
1998-11-26 1299
1999-01-18 1334
1999-02-15 1354
1999-05-31 1428
1999-07-05 1453
1999-09-06 1498
1999-11-25 1556
1999-12-24 1577
2000-01-17 1593
2000-02-21 1618
2000-05-29 1687
2000-07-04 1713
2000-09-04 1757
2000-11-23 1815
2001-01-15 1850
2001-02-19 1875
2001-05-28 1944
2001-07-04 1971
2001-09-03 2014
2001-09-11 2020
2001-09-12 2021
2001-09-13 2022
2001-09-14 2023
2001-11-22 2072
2002-01-21 2112
2002-02-18 2132
2002-05-27 2201
2002-07-04 2229
2002-09-02 2271
2002-11-28 2334
2003-01-20 2369
2003-02-17 2389
2003-05-26 2458
2003-07-04 2487
20