In [3]:
import polars as pls
import numpy as np 
import random
from datetime import datetime, timedelta

import torch
import torch.nn as nn
import torch.optim as optim
import research
research.set_seed(42)

import altair as alt

import binance

In [4]:
symbol = "BTCUSDT"
time_interval = "1h"
history_period = 7 * 4 * 1
max_lags = 4
forecast_horizon = 1
anualized_rate = research.sharpe_annualization_factor(time_interval, 365, 24)

### Data Gathering ###
- downloading tick data from binance
- converting tick data into time series data
- visualizing time series data

In [5]:
binance.download_trades(symbol, history_period)

Downloading BTCUSDT: 100%|██████████| 28/28 [03:03<00:00,  6.54s/it]


In [6]:
ts = research.load_ohlc_timeseries(symbol, time_interval)
ts

Loading BTCUSDT: 100%|██████████| 174/174 [01:26<00:00,  2.01file/s]


datetime,open,high,low,close
datetime[μs],f64,f64,f64,f64
2025-05-18 00:00:00,103079.5,103380.8,103052.2,103329.6
2025-05-18 01:00:00,103329.7,103359.7,103088.1,103208.7
2025-05-18 02:00:00,103208.8,103317.0,103193.4,103292.7
2025-05-18 03:00:00,103292.8,103348.1,103089.0,103244.2
2025-05-18 04:00:00,103244.3,103399.0,103244.3,103314.7
…,…,…,…,…
2025-11-07 19:00:00,102551.3,103350.7,102429.4,103350.0
2025-11-07 20:00:00,103350.0,103849.9,102889.6,103718.4
2025-11-07 21:00:00,103718.4,103844.9,103295.6,103844.9
2025-11-07 22:00:00,103844.8,104070.7,103553.1,103586.6


In [7]:
alt.data_transformers.enable('vegafusion')
research.plot_dyn_timeseries(ts, symbol, 'close', time_interval)

### Feature Engineering ###

In [8]:
ts = ts.with_columns((pls.col('close')/pls.col('close').shift(forecast_horizon)).log().alias('log_return'))
ts

datetime,open,high,low,close,log_return
datetime[μs],f64,f64,f64,f64,f64
2025-05-18 00:00:00,103079.5,103380.8,103052.2,103329.6,
2025-05-18 01:00:00,103329.7,103359.7,103088.1,103208.7,-0.001171
2025-05-18 02:00:00,103208.8,103317.0,103193.4,103292.7,0.000814
2025-05-18 03:00:00,103292.8,103348.1,103089.0,103244.2,-0.00047
2025-05-18 04:00:00,103244.3,103399.0,103244.3,103314.7,0.000683
…,…,…,…,…,…
2025-11-07 19:00:00,102551.3,103350.7,102429.4,103350.0,0.007758
2025-11-07 20:00:00,103350.0,103849.9,102889.6,103718.4,0.003558
2025-11-07 21:00:00,103718.4,103844.9,103295.6,103844.9,0.001219
2025-11-07 22:00:00,103844.8,104070.7,103553.1,103586.6,-0.00249


In [9]:
target = 'log_return'
log_return = pls.col(target)
max_lags = 5
ts = ts.with_columns(
    [log_return.shift(i).alias(f'{target}_lag {i}') for i in range(1, max_lags + 1)]
)

In [10]:
ts = ts.drop_nulls()
ts.head(5)

datetime,open,high,low,close,log_return,log_return_lag 1,log_return_lag 2,log_return_lag 3,log_return_lag 4,log_return_lag 5
datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2025-05-18 06:00:00,103597.8,103636.4,103300.2,103414.4,-0.001772,0.002736,0.000683,-0.00047,0.000814,-0.001171
2025-05-18 07:00:00,103414.4,103949.6,103383.8,103893.0,0.004617,-0.001772,0.002736,0.000683,-0.00047,0.000814
2025-05-18 08:00:00,103893.0,104066.1,103781.3,103869.2,-0.000229,0.004617,-0.001772,0.002736,0.000683,-0.00047
2025-05-18 09:00:00,103869.3,103988.0,103800.1,103834.9,-0.00033,-0.000229,0.004617,-0.001772,0.002736,0.000683
2025-05-18 10:00:00,103835.0,103898.7,103759.0,103818.6,-0.000157,-0.00033,-0.000229,0.004617,-0.001772,0.002736


In [11]:
research.plot_distribution(ts, target, no_bins=200)

### Model Building ###

In [12]:
class LinearModel(nn.Module):
    def __init__(self, input_features):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_features, 1)
    
    def forward(self, x):
        return self.linear(x)

In [13]:
# split data manually by time

features = [f'{target}_lag {1}']
test_size = 0.25


In [14]:
split_idx = int(len(ts) * (1 - test_size))
split_idx

3127

In [15]:
ts_train, ts_test = ts[:split_idx], ts[split_idx:]

In [16]:
X_train = torch.tensor(ts_train[features].to_numpy(), dtype=torch.float32)
X_test = torch.tensor(ts_test[features].to_numpy(), dtype=torch.float32)

Y_train = torch.tensor(ts_train[[target]].to_numpy(), dtype=torch.float32)
Y_test = torch.tensor(ts_test[[target]].to_numpy(), dtype=torch.float32)

In [17]:
n_epochs = 5000
learning_rate = 0.01

model = LinearModel(len(features))
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print("Starting training...")

for epoch in range(n_epochs):
    # forward pass
    y = model(X_train)
    loss = criterion(y, Y_train)

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # check for improvement
    train_loss = loss.item()

    # logging
    if(epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {train_loss:.10f}')
    

print('\n Learned Params')
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}\n {param.data.numpy()}")

# Evaluation
model.eval()
with torch.no_grad():
    y = model(X_test)
    test_loss = criterion(y, Y_test)
    
    print(f"\nTest Loss: {test_loss.item():.6f}, Train Loss: {train_loss:.6f}")

Starting training...
Epoch [100/5000], Loss: 0.0132317320
Epoch [200/5000], Loss: 0.0000140722
Epoch [300/5000], Loss: 0.0000120116
Epoch [400/5000], Loss: 0.0000120113
Epoch [500/5000], Loss: 0.0000120113
Epoch [600/5000], Loss: 0.0000120113
Epoch [700/5000], Loss: 0.0000120113
Epoch [800/5000], Loss: 0.0000120113
Epoch [900/5000], Loss: 0.0000120113
Epoch [1000/5000], Loss: 0.0000120113
Epoch [1100/5000], Loss: 0.0000120113
Epoch [1200/5000], Loss: 0.0000120113
Epoch [1300/5000], Loss: 0.0000120113
Epoch [1400/5000], Loss: 0.0000120113
Epoch [1500/5000], Loss: 0.0000120113
Epoch [1600/5000], Loss: 0.0000120113
Epoch [1700/5000], Loss: 0.0000120113
Epoch [1800/5000], Loss: 0.0000120113
Epoch [1900/5000], Loss: 0.0000120113
Epoch [2000/5000], Loss: 0.0000120113
Epoch [2100/5000], Loss: 0.0000120113
Epoch [2200/5000], Loss: 0.0000120113
Epoch [2300/5000], Loss: 0.0000120113
Epoch [2400/5000], Loss: 0.0000120113
Epoch [2500/5000], Loss: 0.0000120113
Epoch [2600/5000], Loss: 0.0000120113


### Test Predictions ###

In [20]:
trade_results = pls.DataFrame({
    'y_pred': y.squeeze(),
    'y': Y_test.squeeze()
}).with_columns(
    (pls.col('y_pred').sign() == pls.col('y').sign()).alias('won'),
    (pls.col('y_pred').sign().alias('signal'))
).with_columns(
    (pls.col('signal') * pls.col('y')).alias('trade_log_return'),
).with_columns(
    (pls.col('trade_log_return').cum_sum()).alias('equity_curve')
)

trade_results

y_pred,y,won,signal,trade_log_return,equity_curve
f32,f32,bool,f32,f32,f32
0.000028,-0.001094,false,1.0,-0.001094,-0.001094
0.000025,0.004716,true,1.0,0.004716,0.003621
0.000009,0.000497,true,1.0,0.000497,0.004118
0.000021,-0.005669,false,1.0,-0.005669,-0.001551
0.000039,-0.019245,false,1.0,-0.019245,-0.020795
…,…,…,…,…,…
0.000017,0.007758,true,1.0,0.007758,-0.060717
2.0855e-7,0.003558,true,1.0,0.003558,-0.057158
0.000012,0.001219,true,1.0,0.001219,-0.055939
0.000019,-0.00249,false,1.0,-0.00249,-0.05843


In [21]:
research.plot_column(trade_results, 'equity_curve', title='Equity Curve')

In [22]:
trade_results = trade_results.with_columns(
    (pls.col('equity_curve') - pls.col('equity_curve').cum_max()).alias('drawdown')
)

trade_results

y_pred,y,won,signal,trade_log_return,equity_curve,drawdown
f32,f32,bool,f32,f32,f32,f32
0.000028,-0.001094,false,1.0,-0.001094,-0.001094,0.0
0.000025,0.004716,true,1.0,0.004716,0.003621,0.0
0.000009,0.000497,true,1.0,0.000497,0.004118,0.0
0.000021,-0.005669,false,1.0,-0.005669,-0.001551,-0.005669
0.000039,-0.019245,false,1.0,-0.019245,-0.020795,-0.024914
…,…,…,…,…,…,…
0.000017,0.007758,true,1.0,0.007758,-0.060717,-0.174015
2.0855e-7,0.003558,true,1.0,0.003558,-0.057158,-0.170457
0.000012,0.001219,true,1.0,0.001219,-0.055939,-0.169238
0.000019,-0.00249,false,1.0,-0.00249,-0.05843,-0.171729


In [27]:
max_drawdown = trade_results['drawdown'].min()
max_drawdown = np.exp(max_drawdown) - 1
max_drawdown

np.float64(-0.18701720465647587)

In [28]:
equity_peak = 1000
equity_peak * max_drawdown

np.float64(-187.01720465647585)

In [29]:
win_rate = trade_results['won'].mean()
win_rate

0.49856184084372

In [32]:
avg_win = trade_results.filter(pls.col('won') == True)['trade_log_return'].mean()
avg_loss = trade_results.filter(pls.col('won') == False)['trade_log_return'].mean()
ev = win_rate * avg_win + (1 - win_rate) * avg_loss
print(avg_win, avg_loss, ev)

0.003065293523641143 -0.0031646323437480514 -5.862903498258555e-05
