In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
LOOKBACK = 63 # на сколько трейдинговых дней смотрим назад (для определения средней волатильности)
DOLLAR_VOL_THRESHOLD = 5_000_000 # порог с которого перестаем считать компанию микро стоком
WINDOW_SIZE = 30 # то, какое окно используем в лстм для предсказания следующего значения
HIDDEN_SIZE = 64

def prepare_dataset(df, company):
    global LOOKBACK, DOLLAR_VOL_THRESHOLD
    
    ds = pd.DataFrame()
    recent_data = df.tail(LOOKBACK)
    avg_dollar_vol = (recent_data['Adj Close'] * recent_data['Volume']).mean()
    if avg_dollar_vol < DOLLAR_VOL_THRESHOLD: #micro stock
        print(f'{company} is considered a micro stock: {avg_dollar_vol}$', end='\r')
        return ds

    df['Date'] = pd.to_datetime(df['Date'], yearfirst=True)
    ds['date'] = df['Date']
    ds['price'] = df['Close']
    
    ds['rolling_mean'] = df['Close'].rolling(window=7).sum()/7/df['Close']
    rolling_sum_sq = (df['Close']**2).rolling(window=7).sum()
    term = rolling_sum_sq / (7 * df['Close']**2)
    variance = term - ds['rolling_mean']**2
    variance_clipped = variance.clip(lower=0)
    ds['rolling_std'] = np.sqrt(variance_clipped)

    ds['weekday'] = df['Date'].dt.dayofweek
    ds['month'] = df['Date'].dt.month
    ds['quarter'] = df['Date'].dt.quarter

    ds['long_skewness'] = df['Close'].rolling(window=30).skew()
    ds['short_skewness'] = df['Close'].rolling(window=7).skew()
    
    ds['long_kurtosis'] = df['Close'].rolling(window=30).kurt()
    ds['short_kurtosis'] = df['Close'].rolling(window=7).kurt()

    for day in (1, 2, 7):
        ds[f'lag_{day}'] = df['Close'].shift(day)/df['Close']

    ds['change'] = df['Close'].pct_change(fill_method=None).shift(-1)

    ds = ds.replace([np.inf, -np.inf], np.nan)
    ds = ds.dropna()
    ds = ds[~(ds.drop(columns=['weekday']) == 0).any(axis=1)]

    ds['grew'] = ds['change'].apply(lambda x: 0 if x < 0 else 1) # таргет - следующий день, поэтому выше никаких ликов не случилось
    
    ds = ds.reset_index(drop=True)
    return ds
    

In [3]:
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import Dataset, DataLoader
import torch

class TrendDataset(Dataset):
    def __init__(self, chunks):
        self.inputs = torch.tensor(
            np.array([chunk.drop(columns=["grew", "change"]).values for chunk in chunks]),
            dtype=torch.float32
        )
        
        self.target = torch.tensor(
            [chunk["grew"].iloc[-1] for chunk in chunks],
            dtype=torch.float32
        ).unsqueeze(-1)

        self.returns = torch.tensor(
            [chunk["change"].iloc[-1] for chunk in chunks],
            dtype=torch.float32
        ).unsqueeze(-1)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return (self.inputs[idx], self.target[idx], self.returns[idx])

files_dir = 'dataset/stocks'
files = os.listdir(files_dir)
random.Random(97).shuffle(files) # сид для воспроизводимости

trader_agent_dir = 'trader-agent'
train_files, test_files = train_test_split(files, test_size=0.1)

train_dataloaders = [] # т.к. батчи не должны пересекаться между компаниями
test_dataloaders = []

if not os.path.exists(trader_agent_dir):
    for split in ['train', 'test']:
        os.makedirs(os.path.join(trader_agent_dir, split))

def process_company_ds(ds, batch_size=64):
    global WINDOW_SIZE
    chunks = [ds[i:i + WINDOW_SIZE].copy() for i in range(0, len(ds)-WINDOW_SIZE)]
    if len(chunks[-1]) != WINDOW_SIZE:
        chunks = chunks[:-1]
    return DataLoader(TrendDataset(chunks), batch_size=batch_size)
    
for file in files:
    df = pd.read_csv(os.path.join(files_dir, file))
    company = file[:file.find('.csv')]
    company_df = prepare_dataset(df, company)

    if len(company_df)<WINDOW_SIZE: 
        continue
    elif file in train_files:
        company_df.drop(columns=["change", "grew"]).to_csv(os.path.join(trader_agent_dir, 'train', file), index=False)
        company_df = company_df.drop(columns=["date", "price"])
        train_dataloaders.append(process_company_ds(company_df))
        
    elif file in test_files:
        company_df.drop(columns=["change", "grew"]).to_csv(os.path.join(trader_agent_dir, 'test', file), index=False)
        company_df = company_df.drop(columns=["date", "price"])
        test_dataloaders.append(process_company_ds(company_df, len(company_df)))

DTW is considered a micro stock: 635161.0294039288$9$$

In [4]:
import torch.nn as nn
import torch
class TrendPredictor(nn.Module):
    def __init__(self, num_features, hidden_size=HIDDEN_SIZE):
        super(TrendPredictor, self).__init__()
        self.fc = nn.Linear(hidden_size, 1)
        self.do = nn.Dropout(0.1)
        self.LSTM = nn.LSTM(num_features, hidden_size, batch_first=True)

    def forward(self, x):
        out, _ = self.LSTM(x)  # out: (batch, seq_len, hidden_size)
        out = self.do(out)
        return nn.functional.sigmoid(self.fc(out[:, -1, :]))  # только последний таймстеп интересует

model = TrendPredictor(num_features=12)
loss_fn = nn.BCELoss()
lr = 1e-4
optim = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay = 0.001)

In [5]:
from sklearn.metrics import f1_score

num_epochs = 100
patience = 3
best_score = 0
bad_epochs = 0
torch.set_num_threads(14) # cpu goes brrr
for epoch in range(num_epochs):
    model.train()
    train_running_loss = 0.0
    total_samples = 0
    
    for train_dl in train_dataloaders:
        for batch in train_dl:
            inputs, targets, returns = batch
            optim.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            
            loss.backward()
            optim.step()
            
            train_running_loss += loss.item() * inputs.size(0)
            total_samples += inputs.size(0)
    
    model.eval()
    all_targets = []
    all_preds = []
    
    for test_dl in test_dataloaders:
        for batch in test_dl:
            inputs, targets, returns = batch
            with torch.no_grad():
                outputs = model(inputs)
                preds = torch.round(outputs)
            
            all_targets.append(targets)
            all_preds.append(preds)
    
    global_targets = torch.cat(all_targets)
    global_preds = torch.cat(all_preds)
    f1 = f1_score(global_targets, global_preds, pos_label=1)

    avg_loss = train_running_loss / total_samples
    
    print(f"Train BCE: {avg_loss:.4f}, Test F1: {f1:.4f}")
    
    if f1 > best_score:
        torch.save(model.state_dict(), 'lstm_predictor_best.pt')
        best_score = f1
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print("Early stopping")
            break

Train BCE: 0.6926, Test F1: 0.6192
Train BCE: 0.6914, Test F1: 0.6053
Train BCE: 0.6902, Test F1: 0.5940
Train BCE: 0.6884, Test F1: 0.5906
Early stopping


In [6]:
model = TrendPredictor(num_features=12)
model.load_state_dict(torch.load('new_predictor_best.pt'))
model.eval()

TrendPredictor(
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (do): Dropout(p=0.1, inplace=False)
  (LSTM): LSTM(12, 64, batch_first=True)
)

In [7]:
import matplotlib.pyplot as plt

sharpe_values = []
instances_count = 0

for test_dl in test_dataloaders[:50]:
    instances_count += 1
    for batch in test_dl:
        inputs, targets, returns = batch
        with torch.no_grad():
            outputs = model(inputs)
            preds = torch.round(outputs)

        profits = []
        for preds, rets in zip(preds, returns):
            profit = preds * rets  # если не покупали — 0, если покупали — доходность
            profits.append(profit)
        
        profits = torch.cat(profits).numpy()
        cum_profit = np.cumsum(profits)

        # Sharpe Ratio
        mean_return = np.mean(profits)
        std_return = np.std(profits)
        sharpe = (mean_return / std_return) * np.sqrt(252) if std_return != 0 else 0.0
        sharpe_values.append(sharpe)
        print(f"Sharpe Ratio: {sharpe:.3f}")

        # Plot Cumulative Profit
        plt.figure(figsize=(12, 6))
        plt.plot(np.arange(len(cum_profit)), cum_profit)
        plt.xlabel("Day")
        plt.ylabel("Cumulative Profit")
        plt.title(f"Cumulative Profit Curve — Sharpe: {sharpe:.3f}")
        plt.savefig(f'predictor_eval/{instances_count}.png', bbox_inches='tight')
        plt.close()

# Print average Sharpe ratio
if sharpe_values:
    avg_sharpe = sum(sharpe_values) / len(sharpe_values)
    print(f"\nAverage Sharpe Ratio: {avg_sharpe:.3f}")


Sharpe Ratio: 0.508
Sharpe Ratio: 0.537
Sharpe Ratio: 0.370
Sharpe Ratio: 0.335
Sharpe Ratio: 0.106
Sharpe Ratio: 0.638
Sharpe Ratio: 0.632
Sharpe Ratio: 0.208
Sharpe Ratio: 0.506
Sharpe Ratio: 0.437
Sharpe Ratio: 0.541
Sharpe Ratio: 0.058
Sharpe Ratio: -0.120
Sharpe Ratio: 0.330
Sharpe Ratio: 1.476
Sharpe Ratio: 0.807
Sharpe Ratio: 0.151
Sharpe Ratio: 0.880
Sharpe Ratio: -0.019
Sharpe Ratio: -0.946
Sharpe Ratio: 0.385
Sharpe Ratio: 0.118
Sharpe Ratio: 0.716
Sharpe Ratio: 0.512
Sharpe Ratio: 0.891
Sharpe Ratio: 0.332
Sharpe Ratio: -0.001
Sharpe Ratio: 0.717
Sharpe Ratio: -0.551
Sharpe Ratio: -1.108
Sharpe Ratio: 0.200
Sharpe Ratio: 0.528
Sharpe Ratio: 0.744
Sharpe Ratio: 0.680
Sharpe Ratio: 0.730
Sharpe Ratio: 1.066
Sharpe Ratio: 0.509
Sharpe Ratio: 0.526
Sharpe Ratio: 0.278
Sharpe Ratio: -1.256
Sharpe Ratio: 0.742
Sharpe Ratio: 0.395
Sharpe Ratio: 0.629
Sharpe Ratio: 0.398
Sharpe Ratio: 0.582
Sharpe Ratio: 0.123
Sharpe Ratio: 0.546
Sharpe Ratio: 0.341
Sharpe Ratio: 0.095
Sharpe Ratio: