In [1]:
import tensorflow as tf
import polars as pl
from datetime import date, timedelta
import numpy as np

2025-09-12 15:11:12.299249: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-12 15:11:12.534846: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-12 15:11:13.388412: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
SILVER_DATA = "data/silver/"

In [184]:
CONTEXT_WINDOW = 90
VALIDATE_FILTER = pl.col("date").gt(date(2025, 7, 1))
FORWARD_DAYS_PREDICTION = 1

In [71]:
df_raw = pl.read_parquet(SILVER_DATA + "stock_*")

In [72]:
list_tickers = (
    df_raw.with_columns(tmp=pl.col("volume") * pl.col("close"))
    .sort("date", descending=True)
    .unique(subset="ticker")
    .sort("tmp", descending=True)
    .select("ticker")
    .limit(1)
    .to_numpy()[:, 0]
)
list_tickers

array(['TSLA'], dtype=object)

In [73]:
def fn_win_collect(x: str) -> pl.Expr:
    return (
        pl.col(x)
        .over(
            partition_by="ticker",
            order_by="date",
            mapping_strategy="join",
            descending=False,
        )
        .list.slice(
            pl.col("index") - CONTEXT_WINDOW,
            CONTEXT_WINDOW,
        )
        .alias("context_" + x)
    )

In [200]:
df = df_raw.filter(pl.col("ticker").is_in(list_tickers))
df = df.sort("ticker", "date")
df = df.with_columns(
    pl.row_index().over(partition_by="ticker", order_by="date").alias("index"),
    # pl.col("date")+timedelta(days=1)
)
df = df.with_columns(
    pl.col("high")
    .rolling_max(
        window_size=FORWARD_DAYS_PREDICTION + 1,
        weights=[1] * (FORWARD_DAYS_PREDICTION) + [0],
    )
    .over(partition_by="ticker", order_by="date", descending=True)
    .alias(f"comming_{FORWARD_DAYS_PREDICTION}_day_max"),
    pl.col("low")
    .rolling_min(
        window_size=FORWARD_DAYS_PREDICTION + 1,
        weights=[1] * (FORWARD_DAYS_PREDICTION) + [np.inf],
    )
    .over(partition_by="ticker", order_by="date", descending=True)
    .alias(f"comming_{FORWARD_DAYS_PREDICTION}_day_min"),
    fn_win_collect("open"),
    fn_win_collect("low"),
    fn_win_collect("high"),
    fn_win_collect("close"),
)


df = df.filter(pl.col("index").gt(CONTEXT_WINDOW) & pl.col(f"comming_{FORWARD_DAYS_PREDICTION}_day_max").is_not_null())
df

ticker,date,high,low,open,close,volume,index,comming_1_day_max,comming_1_day_min,context_open,context_low,context_high,context_close
str,date,f64,f64,f64,f64,f64,u32,f64,f64,list[f64],list[f64],list[f64],list[f64]
"""TSLA""",2010-11-05,1.664667,1.581333,1.658,1.629333,1.5165e7,91,1.666667,1.602,"[1.719333, 1.666667, … 1.506667]","[1.553333, 1.351333, … 1.476667]","[2.028, 1.728, … 1.688667]","[1.588667, 1.464, … 1.66]"
"""TSLA""",2010-11-08,1.666667,1.602,1.633333,1.665333,7.6425e6,92,1.712667,1.603333,"[1.666667, 1.533333, … 1.658]","[1.351333, 1.247333, … 1.581333]","[1.728, 1.54, … 1.664667]","[1.464, 1.28, … 1.629333]"
"""TSLA""",2010-11-09,1.712667,1.603333,1.666667,1.642,1.4346e7,93,1.998,1.603333,"[1.533333, 1.333333, … 1.633333]","[1.247333, 1.055333, … 1.602]","[1.54, 1.333333, … 1.666667]","[1.28, 1.074, … 1.665333]"
"""TSLA""",2010-11-10,1.998,1.603333,1.632,1.957333,4.59075e7,94,1.94,1.822,"[1.333333, 1.093333, … 1.666667]","[1.055333, 0.998667, … 1.603333]","[1.333333, 1.108667, … 1.712667]","[1.074, 1.053333, … 1.642]"
"""TSLA""",2010-11-11,1.94,1.822,1.906667,1.869333,2.91795e7,95,2.033333,1.871333,"[1.093333, 1.076, … 1.632]","[0.998667, 1.038, … 1.603333]","[1.108667, 1.168, … 1.998]","[1.053333, 1.164, … 1.957333]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""TSLA""",2025-08-28,353.549988,340.26001,350.910004,345.980011,6.79032e7,3815,348.75,331.700012,"[230.259995, 230.960007, … 351.940002]","[222.789993, 229.850006, … 349.160004]","[232.210007, 242.789993, … 355.390015]","[227.5, 237.970001, … 349.600006]"
"""TSLA""",2025-08-29,348.75,331.700012,347.230011,333.869995,8.11457e7,3816,333.329987,325.600006,"[230.960007, 254.860001, … 350.910004]","[229.850006, 244.429993, … 340.26001]","[242.789993, 259.450012, … 353.549988]","[237.970001, 250.740005, … 345.980011]"
"""TSLA""",2025-09-02,333.329987,325.600006,328.230011,329.359985,5.8392e7,3817,343.329987,328.51001,"[254.860001, 250.5, … 347.230011]","[244.429993, 249.199997, … 331.700012]","[259.450012, 259.540009, … 348.75]","[250.740005, 259.51001, … 333.869995]"
"""TSLA""",2025-09-03,343.329987,328.51001,335.200012,334.089996,8.87333e7,3818,338.890015,331.480011,"[250.5, 261.690002, … 328.230011]","[249.199997, 259.630005, … 325.600006]","[259.540009, 286.850006, … 333.329987]","[259.51001, 284.950012, … 329.359985]"


In [190]:
def convert_to_train_data(df_local):
    train_x = df_local.select(
        "context_open",
        "context_low",
        "context_high",
        "context_close",
    ).to_numpy()
    train_x = np.apply_along_axis(lambda x: np.vstack(x).T, 1, train_x)
    train_min = train_x.min(axis=(1, 2), keepdims=True)
    train_max = train_x.max(axis=(1, 2), keepdims=True)
    train_x_norm = (train_x - train_min) / (train_max - train_min)
    train_x_norm = train_x_norm.reshape(*train_x_norm.shape, 1)
    print(f"{train_x_norm.shape=}")

    train_y = df_local.select(
        f"comming_{FORWARD_DAYS_PREDICTION}_day_min",
        f"comming_{FORWARD_DAYS_PREDICTION}_day_max",
    ).to_numpy()
    train_y = (
        np.apply_along_axis(lambda x: np.vstack(x).T, 1, train_y)
    )
    train_y_norm = (train_y - train_min) / (train_max - train_min)
    train_y_norm = train_y_norm.reshape(train_y_norm.shape[0], train_y_norm.shape[2],1)
    print(f"{train_y_norm.shape=}")
    train_idx = df_local.select("date")
    return (
        train_idx,
        train_min,
        train_max,
        train_x_norm,
        train_y_norm,
    )

train_idx, train_min, train_max, train_x, train_y = convert_to_train_data(df.filter(~VALIDATE_FILTER))

train_x_norm.shape=(3684, 90, 4, 1)
train_y_norm.shape=(3684, 2, 1)


In [210]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, InputLayer


input_shape = train_x.shape[1:]
print(input_shape)
model = Sequential()
model.add(InputLayer(input_shape))
model.add(Conv2D(32, (30, 2), strides=1, activation="relu", padding="same"))
model.add(Conv2D(32, (30, 2), strides=1, activation="relu", padding="same"))
model.add(Conv2D(32, (30, 2), strides=1, activation="relu", padding="same"))
model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dense(2))


model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])
model.summary()


(90, 4, 1)


In [211]:
print("\n--- Training the model ---")
model.fit(train_x, train_y, epochs=100, batch_size=16, verbose=1)


--- Training the model ---
Epoch 1/100


[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 0.0424 - mae: 0.1379
Epoch 2/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0137 - mae: 0.0868
Epoch 3/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0133 - mae: 0.0852
Epoch 4/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0121 - mae: 0.0804
Epoch 5/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0113 - mae: 0.0778
Epoch 6/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0116 - mae: 0.0786
Epoch 7/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0104 - mae: 0.0740
Epoch 8/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0105 - mae: 0.0755
Epoch 9/100
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/st

<keras.src.callbacks.history.History at 0x7a3bce332550>

In [217]:
valid_idx, valid_min, valid_max, valid_x, valid_y = convert_to_train_data(
    df.filter(VALIDATE_FILTER).filter(
        pl.col("close")
        < pl.col(
            f"comming_{FORWARD_DAYS_PREDICTION}_day_max",
        )
    )
)
print("\n--- Making a prediction ---")

loss, mae = model.evaluate(valid_x, valid_y)
print(f"{loss=} , {mae=}")

train_x_norm.shape=(42, 90, 4, 1)
train_y_norm.shape=(42, 2, 1)

--- Making a prediction ---
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0066 - mae: 0.0661
loss=0.006565130781382322 , mae=0.06608875095844269


In [None]:
loss=0.006565130781382322 , mae=0.06608875095844269
loss=0.0072648185305297375 , mae=0.07034825533628464
loss=0.017060449346899986 , mae=0.1299811601638794


In [213]:
prediction = model.predict(valid_x)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [214]:
last_signal = valid_x[:, -1]
last_signal = last_signal * (valid_max - valid_min) + valid_min
last_signal = last_signal.squeeze()
prediction_denomalized = (
    prediction.reshape(*prediction.shape, 1) * (valid_max - valid_min) + valid_min
)
prediction_denomalized = prediction_denomalized.squeeze()

In [208]:
valid_idx

date
date
2025-07-02
2025-07-03
2025-07-07
2025-07-08
2025-07-09
…
2025-08-28
2025-08-29
2025-09-02
2025-09-03


In [215]:
df_prediction = pl.DataFrame(
    {
        "date": valid_idx,
        "max": last_signal[:, 2],
        "min": last_signal[:, 1],
        "open": last_signal[:, 0],
        "closed": last_signal[:, 3],
        "prediction_min": prediction_denomalized[:,0],
        "prediction_max": prediction_denomalized[:,1],
    }
).sort("date", descending=True)
df_prediction

date,max,min,open,closed,prediction_min,prediction_max
date,f64,f64,f64,f64,f64,f64
2025-09-04,343.329987,328.51001,335.200012,334.089996,339.644176,350.084406
2025-09-03,333.329987,325.600006,328.230011,329.359985,326.867044,335.65712
2025-09-02,348.75,331.700012,347.230011,333.869995,339.768521,340.965087
2025-08-29,353.549988,340.26001,350.910004,345.980011,342.15752,350.520247
2025-08-28,355.390015,349.160004,351.940002,349.600006,339.829529,353.419778
…,…,…,…,…,…,…
2025-07-09,304.049988,294.350006,297.0,297.809998,298.18065,314.606877
2025-07-08,296.149994,288.769989,291.369995,293.940002,307.411629,312.860482
2025-07-07,318.450012,312.76001,317.98999,315.350006,314.740358,327.168098
2025-07-03,316.829987,303.820007,312.630005,315.649994,307.508276,314.453639


In [None]:
"TSLA"	2025-09-04	338.890015	331.480011	336.149994	338.529999