In [1]:
import os
import datetime

from utils.visualization import plot_predictions

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sqlalchemy import create_engine, sql

In [2]:
# matplotlib settings
mpl.rcParams["figure.figsize"] = (12, 3)
mpl.rcParams["axes.grid"] = False

# constnats
PRINT_RED = '\033[91m'

In [3]:
#--------------------------------------------------------------------------
# Database connection setup
#--------------------------------------------------------------------------

# get postgres environment variables
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DB_NAME = os.getenv("PG_DB_NAME")
PG_USERNAME = os.getenv("PG_USERNAME")
PG_PASSWORD = os.getenv("PG_PASSWORD")

# check for missing environment variables
if PG_HOST == None or PG_PORT == None or PG_DB_NAME == None or PG_USERNAME == None or PG_PASSWORD == None:
    print(f'{PRINT_RED}[ ERROR ] Environment variables PG_HOST, PG_PORT, PG_DB_NAME, PG_USERNAME, or PG_PASSWORD not found.')

# hard-code table name
PG_TABLE = "bars_minute_eastern"

# connect to db and open a cursor to perform database operations
conn_string = f"postgresql://{PG_USERNAME}:{PG_PASSWORD}@{PG_HOST}:{PG_PORT}/{PG_DB_NAME}"
db = create_engine(conn_string)
conn = db.connect()

In [4]:
#------------------------------------------------------------------------------
# Pull data from db and wrangle into correct shape
# (minute bars for NVDA from 09:00-14:00 from 2022/06/01-2022/07/01)
#------------------------------------------------------------------------------

# create empty dataframe to hold bar data
days_data_df = pd.DataFrame(columns=["date", "minute_bars"])

# create empty dataframe to hold timestamp data
days_timestamps_df = pd.DataFrame(columns=["date", "timestamp"])

# NOTE: db contains minute bars for NVDA,INTC from 2022/06/01 to 2022/07/01 (inclusive)
start_date = datetime.date(2022, 6, 1)
end_date = datetime.date(2022, 7, 1)

# loop over each date and get minute_bars from db
current_date = start_date
day_delta = datetime.timedelta(days=1)
while current_date <= end_date:
    # pull minute_bars for current_date
    day_minute_bars_df = pd.read_sql_query(
        sql=sql.text(f"select * from { PG_TABLE } where SYMBOL='NVDA' and date(TIMESTAMP)='{ current_date }'"),
        con=conn,
    )

    # add minute_bars to days_df if there is bar data
    if not day_minute_bars_df.empty:
        # drop symbol column
        day_minute_bars_df.drop("symbol", axis=1, inplace=True)

        #----------------------------------------------------------------------
        # Convert timestamps to day/week/year sin and cosine signals
        #----------------------------------------------------------------------

        # extract timestamps
        timestamps = pd.to_datetime(day_minute_bars_df.pop("timestamp"))

        timestamp_s = timestamps.map(pd.Timestamp.timestamp)

        seconds_per_day = 24 * 60 * 60
        seconds_per_week = seconds_per_day * 7
        seconds_per_year = seconds_per_week * 52.1429

        day_minute_bars_df["day-sin"] = np.sin(timestamp_s * (2 * np.pi / seconds_per_day))
        day_minute_bars_df["day-cos"] = np.cos(timestamp_s * (2 * np.pi / seconds_per_day))

        day_minute_bars_df["week-sin"] = np.sin(timestamp_s * (2 * np.pi / seconds_per_week))
        day_minute_bars_df["week-cos"] = np.cos(timestamp_s * (2 * np.pi / seconds_per_week))

        day_minute_bars_df["year-sin"] = np.sin(timestamp_s * (2 * np.pi / seconds_per_year))
        day_minute_bars_df["year-cos"] = np.cos(timestamp_s * (2 * np.pi / seconds_per_year))

        # insert day minute bars and timestamps to their respecive dataframes
        days_data_df.loc[len(days_data_df), days_data_df.columns] = current_date, day_minute_bars_df
        days_timestamps_df.loc[len(days_timestamps_df), days_timestamps_df.columns] = current_date, timestamps

    # go to next day
    current_date += day_delta

In [5]:
# -----------------------------------------------------------------------------
# Split into training/validation (70%), validation (20%), and test (10%) sets
# -----------------------------------------------------------------------------

n = len(days_data_df)  # total number of days

days_train_df = days_data_df[0 : int(n * 0.7)]
days_test_timestamps_df = days_timestamps_df[0 : int(n * 0.7)]

days_val_df = days_data_df[int(n * 0.7) : int(n * 0.9)]
days_test_timestamps_df = days_timestamps_df[int(n * 0.7) : int(n * 0.9)]

days_test_df = days_data_df[int(n * 0.9) :]
days_test_timestamps_df = days_timestamps_df[int(n * 0.9) :]

In [37]:
#-----------------------------------------------------------------------------
# Create and train a convolutional model
#-----------------------------------------------------------------------------

SEQUENCE_LENGTH = 30
SEQUENCE_STRIDE = 1
BATCH_SIZE = 1
CONV_WIDTH = 3
MAX_EPOCHS = 20

conv_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=32, kernel_size=CONV_WIDTH, activation='relu'),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])

conv_model.compile(
    loss=tf.keras.losses.MeanSquaredError(), # Computes the mean of squares of errors between labels and predictions.
    optimizer=tf.keras.optimizers.Adam(), # Adam optimization is a stochastic gradient descent method that is based on adaptive estimation of first-order and second-order moments.
    metrics=[tf.keras.metrics.MeanAbsoluteError()], # Computes the mean of absolute difference between labels and predictions.
)

# loop over training days to fit the model
for _, day_data in days_train_df.iterrows():
    # Extract the minute bars for this day
    minute_bars = day_data["minute_bars"].values

    # Extract the target variable (which we want to predict) from the minute bars
    targets = day_data["minute_bars"]['close'].values[SEQUENCE_LENGTH:]

    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=minute_bars[:-1], # use all but the last minute as input
        targets=targets,
        sequence_length=SEQUENCE_LENGTH,
        sequence_stride=SEQUENCE_STRIDE,
        batch_size=BATCH_SIZE,
    )
    
    # Fit the model to the training data
    conv_model.fit(
        x=ds,
        epochs=MAX_EPOCHS,
        verbose=1,
    )

Epoch 1/20


2023-04-28 14:48:47.157085: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

In [38]:
#------------------------------------------------------------------------------
#
#-----------------------

for index, day_data in days_test_df.iterrows():
    # Extract the minute bars for this day
    minute_bars = day_data["minute_bars"].values

    # Extract the target variable (which we want to predict) from the minute bars
    targets = day_data["minute_bars"]['close'].values[SEQUENCE_LENGTH:]

    dataset = tf.keras.utils.timeseries_dataset_from_array(
        data=minute_bars[:-1], # use all but the last minute as input
        targets=targets,
        sequence_length=SEQUENCE_LENGTH,
        sequence_stride=SEQUENCE_STRIDE,
        batch_size=BATCH_SIZE,
    )

    for batch in dataset:
        batch_inputs, batch_targets = batch
    
    # Evaluate the model on the test data
    loss, mae = conv_model.evaluate(x=dataset, verbose=0)
    print(f"Test loss: {loss}, Test MAE: {mae}")
    
    predictions = conv_model.predict(x=dataset, verbose=0, steps=None)

    print(predictions.shape) # (391, 1)

    # plot_predictions(labels=targets, predictions=predictions, timestamps=days_test_timestamps_df[index])

2023-04-28 15:06:02.155953: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Test loss: 4.537816047668457, Test MAE: 1.1700814962387085


2023-04-28 15:06:04.886531: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


(391, 28, 1)
Test loss: 5.749110221862793, Test MAE: 1.525199055671692
(391, 28, 1)
Test loss: 3.4051945209503174, Test MAE: 1.1835238933563232
(391, 28, 1)
