In [1]:
# =====================================================
# Cell 1: Imports & Device
# =====================================================
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


In [2]:
# =====================================================
# Cell 2: Load Train Data (ONCE)
# =====================================================

df = pd.read_csv(
    "train.csv",
    dtype={
        'Open': 'float32',
        'High': 'float32',
        'Low': 'float32',
        'Close': 'float32',
        'Volume': 'float32',
        'Ticker': 'category'
    }
)

df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['Ticker', 'Date'])

print("Original rows:", len(df))

# ---- Reduce to 1.8M rows (time-safe) ----
MAX_ROWS = 600_000
rows_per_ticker = MAX_ROWS // df['Ticker'].nunique()

df = (
    df.groupby('Ticker', observed=False)
      .head(rows_per_ticker)
      .reset_index(drop=True)
)

print("Reduced rows:", len(df))


Original rows: 21033522
Reduced rows: 600000


In [3]:
# =====================================================
# Cell 3: Features & Target
# =====================================================
SEQ_LEN = 30
PRED_DAYS = 30

df['Month'] = df['Date'].dt.month / 12.0
df['DayOfWeek'] = df['Date'].dt.dayofweek / 7.0

FEATURES = ['Open', 'High', 'Low', 'Close', 'Volume', 'Month', 'DayOfWeek']

df['Target'] = (
    df.groupby('Ticker', observed=False)['Close']
      .shift(-PRED_DAYS) > df['Close']
).astype('float32')

df.dropna(inplace=True)

scaler = StandardScaler()
df[FEATURES] = scaler.fit_transform(df[FEATURES])

ticker2idx = {t: i for i, t in enumerate(df['Ticker'].cat.categories)}
df['Ticker_idx'] = df['Ticker'].map(ticker2idx)

num_tickers = len(ticker2idx)


In [4]:
# =====================================================
# Class Distribution
# =====================================================
target_counts = df['Target'].value_counts()
target_ratio = df['Target'].value_counts(normalize=True) * 100

print("Counts:")
print(target_counts)

print("\nPercentages:")
print(target_ratio)


Counts:
Target
0.0    371896
1.0    228104
Name: count, dtype: int64

Percentages:
Target
0.0    61.982667
1.0    38.017333
Name: proportion, dtype: float64


In [5]:
# =====================================================
# Cell 4: Dataset & Loader
# =====================================================
class StockDataset(Dataset):
    def __init__(self, df, seq_len):
        self.X = df[FEATURES].values
        self.y = df['Target'].values
        self.ticker = df['Ticker_idx'].values
        self.seq_len = seq_len

    def __len__(self):
        return len(self.X) - self.seq_len

    def __getitem__(self, idx):
        return (
            torch.tensor(self.X[idx:idx+self.seq_len], dtype=torch.float32),
            torch.tensor(self.ticker[idx+self.seq_len], dtype=torch.long),
            torch.tensor(self.y[idx+self.seq_len], dtype=torch.float32)
        )

dataset = StockDataset(df, SEQ_LEN)

loader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    num_workers=4,   # fixed
    pin_memory=True,
    drop_last=True
)


In [6]:
# =====================================================
# Cell 5: Model
# =====================================================
class HybridModel(nn.Module):
    def __init__(
        self,
        input_dim,
        num_tickers,
        embed_dim=16,
        cnn_channels=64,
        lstm_hidden=128,
        dropout=0.3
    ):
        super().__init__()

        self.ticker_emb = nn.Embedding(num_tickers, embed_dim)

        self.cnn = nn.Sequential(
            nn.Conv1d(input_dim, cnn_channels, 3, padding=1),
            nn.ReLU(),
            nn.Conv1d(cnn_channels, cnn_channels, 3, padding=1),
            nn.ReLU()
        )

        self.lstm = nn.LSTM(
            cnn_channels + embed_dim,
            lstm_hidden,
            batch_first=True,
            bidirectional=True
        )

        self.norm = nn.LayerNorm(lstm_hidden * 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(lstm_hidden * 2, 1)

    def forward(self, x, ticker_id):
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        x = x.permute(0, 2, 1)

        emb = self.ticker_emb(ticker_id).unsqueeze(1).expand(-1, x.size(1), -1)
        x = torch.cat([x, emb], dim=-1)

        _, (h, _) = self.lstm(x)
        h = torch.cat([h[-2], h[-1]], dim=1)

        h = self.norm(h)
        return self.fc(self.dropout(h)).squeeze(1)

model = HybridModel(len(FEATURES), num_tickers).to(device)


In [None]:
# =====================================================
# Cell 6: Training
# =====================================================
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

scaler = torch.amp.GradScaler('cuda')

EPOCHS = 20

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0

    for xb, tid, yb in loader:
        xb = xb.to(device, non_blocking=True)
        tid = tid.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            logits = model(xb, tid)
            loss = criterion(logits, yb)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    scheduler.step()
    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")


In [None]:
# =====================================================
# Cell 7: Save Model
# =====================================================
torch.save(model.state_dict(), "model_1p8M_bilstm.bin")
print("Model saved.")


In [None]:
# =====================================================
# Cell 8: Load Test Data (UPDATED)
# =====================================================
test_df = pd.read_csv("test.csv")

# Keep the original ID for submission
test_df['ID_orig'] = test_df['ID']  # Save original ID

# Rename ID -> Ticker to match training schema
test_df = test_df.rename(columns={'ID': 'Ticker'})

# Parse date
test_df['Date'] = pd.to_datetime(test_df['Date'])

# Sort for time-series safety
test_df = test_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)

print(test_df.head())
print(test_df.columns)


In [None]:
# =====================================================
# Cell 9: Combine for History
# =====================================================
full_df = pd.concat([df, test_df], axis=0, ignore_index=True)
full_df = full_df.sort_values(['Ticker', 'Date'])


In [None]:
# =====================================================
# Cell 10: Build Test Sequences (FAST)
# =====================================================
from collections import defaultdict

# Pre-group full_df by ticker ONCE
ticker_groups = {
    t: g.sort_values('Date').reset_index(drop=True)
    for t, g in full_df.groupby('Ticker')
}

test_sequences = []
test_indices = []

for i, row in test_df.iterrows():
    g = ticker_groups.get(row['Ticker'])
    if g is None:
        continue

    # Find position of test date
    idx = g['Date'].searchsorted(row['Date'])

    if idx >= SEQ_LEN:
        hist = g.iloc[idx-SEQ_LEN:idx]
        test_sequences.append(hist[FEATURES].values)
        test_indices.append(i)


In [None]:
# =====================================================
# Cell 11: Prepare Test Tensors
# =====================================================
# Convert sequences to tensor
X_test = torch.tensor(test_sequences, dtype=torch.float32).to(device)

# Map tickers in test_indices to IDs
ticker_ids = torch.tensor(
    test_df.loc[test_indices, 'Ticker'].map(ticker2idx).values,
    dtype=torch.long
).to(device)

print("X_test shape:", X_test.shape)
print("ticker_ids shape:", ticker_ids.shape)


In [None]:
# =====================================================
# Cell 12: Inference
# =====================================================
batch_size = 256  # same as training batch
model.eval()
preds = []

with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        xb = X_test[i:i+batch_size]
        tb = ticker_ids[i:i+batch_size]

        logits = model(xb, tb)
        probs = torch.sigmoid(logits)
        preds.extend(probs.cpu().numpy())


In [None]:
# =====================================================
# Cell 13: Submission (binary 0/1)
# =====================================================
import numpy as np

submission = test_df.copy()

# Default predictions for rows we couldn't predict
submission['Pred'] = 0  # default 0 for unknown

# Convert preds to NumPy array
preds_arr = np.array(preds)

# Fill only the rows we have model predictions
submission.loc[test_indices, 'Pred'] = (preds_arr >= 0.5).astype(int)

# Keep only the columns required by competition
submission = submission[['ID_orig', 'Pred']].rename(columns={'ID_orig': 'ID'})

# Save CSV
submission.to_csv("submission.csv", index=False)
print("submission.csv saved")

# Print first 5 rows
print(submission.head())
