## Step 1: Load Data + Basic Checks

In [None]:
# --- 0) Imports + path ---
import pandas as pd
import numpy as np

# Input parquet file (relative path) that the read cell passes to pd.read_parquet.
DATA_PATH = "model_input_features_12f_2001_2024.parquet" 

In [2]:
# --- 1) Read data ---
# Load the whole feature panel from DATA_PATH into memory.
df = pd.read_parquet(DATA_PATH)

# Quick look at the dimensions and column names before any processing.
print("shape:", df.shape)
print("columns:", df.columns.tolist())
df.head()  # last expression of the cell, so the notebook renders the first rows

shape: (1932300, 16)
columns: ['permno', 'month', 'ret_fwd', 'split', 'mktcap_z', 'log_mktcap_z', 'book_to_market_z', 'momentum_z', 'rev_1m_z', 'volatility_z', 'beta_z', 'roa_z', 'ni_over_at_z', 'investment_z', 'asset_growth_z', 'leverage_z']


Unnamed: 0,permno,month,ret_fwd,split,mktcap_z,log_mktcap_z,book_to_market_z,momentum_z,rev_1m_z,volatility_z,beta_z,roa_z,ni_over_at_z,investment_z,asset_growth_z,leverage_z
0,10001,2001-01-31,-0.012658,train,-0.157869,0.013525,-0.027821,0.439072,0.0,-1.023105,-0.797419,0.0,0.129901,0.0,0.0,-0.008146
1,10001,2001-02-28,0.038462,train,-0.15687,0.014942,-0.027516,0.551295,-0.468295,-1.069425,-0.743954,0.0,0.130449,0.0,0.0,-0.007248
2,10001,2001-03-31,-0.025,train,-0.157546,0.016448,-0.029609,0.519111,0.276944,-1.089087,-0.731609,0.0,0.127774,0.0,0.0,-0.009131
3,10001,2001-04-30,0.097436,train,-0.15516,0.01049,-0.035844,0.636299,0.535419,-1.07785,-0.81165,0.0,0.125295,0.0,0.0,-0.00879
4,10001,2001-05-31,0.114953,train,-0.156406,0.012822,-0.031098,0.660769,-0.447603,-1.055489,-0.824151,0.0,0.12402,0.0,0.0,-0.005912


In [None]:
# --- 2) Basic checks: types, key columns, feature columns ---
# Column dtypes, row count and memory footprint as loaded.
df.info()


# Make sure `month` is a datetime column. This overwrites the column in `df`.
df["month"] = pd.to_datetime(df["month"])

key_cols = ["permno", "month"]   # used below as the uniqueness key for the duplicate check
target_col = "ret_fwd"
# Feature set = every column whose name ends in "_z".
feature_cols = [c for c in df.columns if c.endswith("_z")]

print("\n#keys:", key_cols)
print("#features:", len(feature_cols))
print("feature cols:", feature_cols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1932300 entries, 0 to 1932299
Data columns (total 16 columns):
 #   Column            Dtype         
---  ------            -----         
 0   permno            int64         
 1   month             datetime64[ns]
 2   ret_fwd           float64       
 3   split             object        
 4   mktcap_z          float32       
 5   log_mktcap_z      float32       
 6   book_to_market_z  float32       
 7   momentum_z        float32       
 8   rev_1m_z          float32       
 9   volatility_z      float32       
 10  beta_z            float32       
 11  roa_z             float32       
 12  ni_over_at_z      float32       
 13  investment_z      float32       
 14  asset_growth_z    float32       
 15  leverage_z        float32       
dtypes: datetime64[ns](1), float32(12), float64(1), int64(1), object(1)
memory usage: 147.4+ MB

#keys: ['permno', 'month']
#features: 12
feature cols: ['mktcap_z', 'log_mktcap_z', 'book_to_market_z', 'm

In [None]:
# --- 3) Integrity checks: duplicates, missingness ---
# Count rows that repeat an earlier (permno, month) key pair.
dup_n = df.duplicated(key_cols).sum()
print("duplicate (permno, month) rows:", dup_n)

# Fraction of missing values per column, highest first.
na_rate = df.isna().mean().sort_values(ascending=False)
print("\nTop NA rates:")
print(na_rate.head(20))

# Distribution of the target, including tail percentiles.
print("\nTarget ret_fwd summary:")
print(df[target_col].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

duplicate (permno, month) rows: 0

Top NA rates:
permno              0.0
month               0.0
ret_fwd             0.0
split               0.0
mktcap_z            0.0
log_mktcap_z        0.0
book_to_market_z    0.0
momentum_z          0.0
rev_1m_z            0.0
volatility_z        0.0
beta_z              0.0
roa_z               0.0
ni_over_at_z        0.0
investment_z        0.0
asset_growth_z      0.0
leverage_z          0.0
dtype: float64

Target ret_fwd summary:
count    1.932300e+06
mean     8.315928e-03
std      1.714482e-01
min     -9.936000e-01
1%      -3.851090e-01
5%      -2.041291e-01
50%      4.419000e-03
95%      2.163530e-01
99%      5.037592e-01
max      3.900000e+01
Name: ret_fwd, dtype: float64


In [None]:
# --- 4) Time coverage + split checks ---
print("\nOverall month range:", df["month"].min(), "to", df["month"].max())
print("Unique permno:", df["permno"].nunique())

# The split diagnostics only run when the frame carries a "split" column.
if "split" in df.columns:
    # Per-split row count, distinct permnos, first/last month and number of distinct months.
    split_summary = df.groupby("split").agg(
        n_rows=("permno", "size"),
        n_permno=("permno", "nunique"),
        min_month=("month", "min"),
        max_month=("month", "max"),
        n_months=("month", "nunique"),
    )
    print("\nSplit summary:")
    display(split_summary)


    # Chronology check: only possible when all three split labels are present.
    if set(["train", "val", "test"]).issubset(set(df["split"].unique())):
        train_max = split_summary.loc["train", "max_month"]
        val_min = split_summary.loc["val", "min_month"]
        val_max = split_summary.loc["val", "max_month"]
        test_min = split_summary.loc["test", "min_month"]
        # Both comparisons should print True if the splits do not overlap in time
        # and are ordered train -> val -> test.
        print("\nSplit time order checks:")
        print("train_max < val_min:", train_max < val_min)
        print("val_max < test_min:", val_max < test_min)


Overall month range: 2001-01-31 00:00:00 to 2024-11-30 00:00:00
Unique permno: 20759

Split summary:


Unnamed: 0_level_0,n_rows,n_permno,min_month,max_month,n_months
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
test,381679,11047,2021-01-31,2024-11-30,47
train,1223153,14662,2001-01-31,2016-12-31,192
val,327468,8877,2017-01-31,2020-12-31,48



Split time order checks:
train_max < val_min: True
val_max < test_min: True


In [6]:
# --- 5) How many zeros in each feature? (possible imputation artifact) ---
# Share of rows that are exactly 0 for each feature column, largest first.
zero_rate = (df[feature_cols] == 0).mean().sort_values(ascending=False)
print("Zero rate by feature:")
print(zero_rate)

Zero rate by feature:
asset_growth_z      0.295309
roa_z               0.148212
investment_z        0.074031
rev_1m_z            0.003720
mktcap_z            0.000000
log_mktcap_z        0.000000
book_to_market_z    0.000000
momentum_z          0.000000
volatility_z        0.000000
beta_z              0.000000
ni_over_at_z        0.000000
leverage_z          0.000000
dtype: float64


In [7]:
# --- 6) ret_fwd outlier rates (overall + by split) ---
def outlier_rates(s):
    """Share of observations in `s` beyond fixed return thresholds.

    Returns a Series with, in order, the fraction of values strictly above
    1, 2 and 5 (i.e. monthly returns above 100%, 200%, 500%) followed by the
    fraction strictly below -0.5.
    """
    upper_tail = {f"pct_ret_gt_{cut}": (s > cut).mean() for cut in (1, 2, 5)}
    lower_tail = {"pct_ret_lt_-0.5": (s < -0.5).mean()}
    return pd.Series({**upper_tail, **lower_tail})

# Tail frequencies of the target over the whole panel.
print("Overall outlier rates:")
print(outlier_rates(df["ret_fwd"]))

# Same statistics per split; unstack() turns the (split, statistic) result into
# one row per split and one column per statistic.
print("\nOutlier rates by split:")
print(df.groupby("split")["ret_fwd"].apply(outlier_rates).unstack())

Overall outlier rates:
pct_ret_gt_1       0.002118
pct_ret_gt_2       0.000408
pct_ret_gt_5       0.000040
pct_ret_lt_-0.5    0.003803
dtype: float64

Outlier rates by split:
       pct_ret_gt_1  pct_ret_gt_2  pct_ret_gt_5  pct_ret_lt_-0.5
split                                                           
test       0.002570      0.000642      0.000084         0.004802
train      0.001917      0.000300      0.000017         0.003446
val        0.002339      0.000541      0.000076         0.003970


## Step 2: Construct Sequence Samples with K=6 Sliding Window

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
K = 6  # window length: number of consecutive rows fed to the model per sample

# Sort so that each permno's rows are contiguous and ordered by month; the fresh
# RangeIndex makes row labels coincide with row positions.
df = df.sort_values(["permno", "month"]).reset_index(drop=True)

# Flat arrays aligned row-for-row with the sorted frame.
X_all = df[feature_cols].to_numpy(dtype=np.float32)   # 2-D: one row per observation, one column per feature
y_all = df["ret_fwd"].to_numpy(dtype=np.float32)      # target, one value per row
permno_all = df["permno"].to_numpy(dtype=np.int64)
month_all = df["month"].to_numpy()                    # month values in the column's own dtype
split_all = df["split"].to_numpy()

In [None]:
def make_sequence_centers(permno_arr: np.ndarray, K: int) -> np.ndarray:
    """
    Return indices i such that rows [i-K+1, ..., i] belong to same permno.

    Despite the name, each returned index is the LAST row of its K-row window.
    Assumes data sorted by permno then month: a "run" is a maximal stretch of
    consecutive rows with equal permno. Only permno equality is checked, so
    gaps in the month sequence inside a run are not detected here.

    Parameters
    ----------
    permno_arr : 1-D array-like of identifiers, one per row.
    K : window length; must be >= 1.

    Returns
    -------
    np.ndarray of dtype int64, ascending, possibly empty.

    Raises
    ------
    ValueError if K < 1 (a non-positive window would yield out-of-range indices).
    """
    if K < 1:
        raise ValueError(f"K must be a positive integer, got {K}")

    permno_arr = np.asarray(permno_arr)
    n = permno_arr.shape[0]
    if n == 0:
        return np.empty(0, dtype=np.int64)

    # Row positions where a new permno run begins (row 0 always starts one).
    boundaries = np.flatnonzero(permno_arr[1:] != permno_arr[:-1]) + 1
    run_starts = np.concatenate(([0], boundaries))
    run_lengths = np.diff(np.append(run_starts, n))

    # Offset of every row inside its own run (0 for the first row of a permno).
    offset_in_run = np.arange(n, dtype=np.int64) - np.repeat(run_starts, run_lengths)

    # A row can close a full window once K-1 rows of the same run precede it.
    return np.flatnonzero(offset_in_run >= K - 1).astype(np.int64)

# Row indices that can serve as the last row of a K-row, single-permno window.
centers = make_sequence_centers(permno_all, K=K)
print("Total centers:", len(centers))
print("Example centers:", centers[:5])

Total centers: 1831112
Example centers: [5 6 7 8 9]


In [11]:
# A window is assigned to the split of its LAST row (the row whose target is predicted).
train_centers = centers[split_all[centers] == "train"]
val_centers   = centers[split_all[centers] == "val"]
test_centers  = centers[split_all[centers] == "test"]

print("train:", len(train_centers), "val:", len(val_centers), "test:", len(test_centers))

# `centers` holds row POSITIONS, so index positionally (.iloc) rather than by
# label (.loc); the two only coincide while df has a default RangeIndex.
month_col = df["month"]
for split_name, split_idx in (("train", train_centers), ("val", val_centers), ("test", test_centers)):
    split_months = month_col.iloc[split_idx]
    print(f"{split_name} month range:", split_months.min(), "to", split_months.max())

train: 1151578 val: 316780 test: 362754
train month range: 2001-06-30 00:00:00 to 2016-12-31 00:00:00
val month range: 2017-01-31 00:00:00 to 2020-12-31 00:00:00
test month range: 2021-01-31 00:00:00 to 2024-11-30 00:00:00


## Step 3: Temporal Transformer Model

In [None]:
import torch
from torch.utils.data import Dataset


# torch.from_numpy wraps the existing NumPy buffers (no copy is made).
X_all_t = torch.from_numpy(X_all)            
y_all_t = torch.from_numpy(y_all)            

class SeqDatasetXYFast(Dataset):
    """Map-style dataset yielding (window, target) pairs.

    Sample j is built from row index i = centers_idx[j]:
      * window: rows i-K+1 .. i (inclusive) of X_all_t
      * target: y_all_t[i]
    """

    def __init__(self, centers_idx, X_all_t, y_all_t, K: int):
        self.K = K
        self.X_all_t = X_all_t
        self.y_all_t = y_all_t
        # Stored as an int64 tensor so __len__ can use numel().
        self.centers = torch.as_tensor(centers_idx, dtype=torch.long)

    def __len__(self):
        return self.centers.numel()

    def __getitem__(self, j):
        stop = int(self.centers[j]) + 1   # exclusive end of the window
        start = stop - self.K
        return self.X_all_t[start:stop], self.y_all_t[stop - 1]

# One dataset per split; all three share the same underlying feature/target tensors.
train_ds_xy = SeqDatasetXYFast(train_centers, X_all_t, y_all_t, K)
val_ds_xy   = SeqDatasetXYFast(val_centers,   X_all_t, y_all_t, K)
test_ds_xy  = SeqDatasetXYFast(test_centers,  X_all_t, y_all_t, K)

print(len(train_ds_xy), len(val_ds_xy), len(test_ds_xy))

1151578 316780 362754


In [None]:
# Sanity check on the first training sample: window shape, target, and the
# first and last rows of the window.
x_seq, y = train_ds_xy[0]
print("x_seq shape:", x_seq.shape)   # expected (K, number of features)
print("y:", float(y))
print("first row feats:", x_seq[0])
print("last row feats:", x_seq[-1])

x_seq shape: torch.Size([6, 12])
y: 0.02542399987578392
first row feats: tensor([-0.1579,  0.0135, -0.0278,  0.4391,  0.0000, -1.0231, -0.7974,  0.0000,
         0.1299,  0.0000,  0.0000, -0.0081])
last row feats: tensor([-0.1549,  0.0073, -0.0343,  0.4853,  0.1869, -1.0103, -0.8459,  0.0000,
         0.1219,  0.0000,  0.0000, -0.0046])


In [16]:
# --- 0) Torch setup: device + seed ---
import os
import random
import torch
import torch.nn as nn

def set_seed(seed: int = 42, deterministic: bool = False):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN.

    Parameters
    ----------
    seed : value passed to every RNG.
    deterministic : when False (default, the previous behaviour) cuDNN
        autotuning is enabled and cuDNN determinism is disabled, which is
        faster but means GPU runs are not guaranteed to be bit-reproducible
        even with a fixed seed. When True the two flags are flipped.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = deterministic
    # Benchmark mode selects kernels by timing them, so it is tied to the inverse
    # of the deterministic switch.
    torch.backends.cudnn.benchmark = not deterministic

set_seed(42)

# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# --- 1) Wrap dataset to only return (X, y) for faster training ---
from torch.utils.data import Dataset, DataLoader

class XYWrapper(Dataset):
    """View over another dataset that keeps only the first two fields of each sample.

    Any extra items returned by the wrapped dataset after (x, y) are dropped.
    """

    def __init__(self, base_ds):
        self.base = base_ds

    def __len__(self):
        return len(self.base)

    def __getitem__(self, idx):
        features, target, *_extras = self.base[idx]
        return features, target



# NOTE(review): these two tensors are also created, from the same arrays, by an earlier cell.
X_all_t = torch.from_numpy(X_all)          
y_all_t = torch.from_numpy(y_all)          

class SeqDatasetXYFast(Dataset):
    """Map-style dataset yielding (window, target) pairs.

    Sample j is built from row index i = centers_idx[j]:
      * window: rows i-K+1 .. i (inclusive) of X_all_t
      * target: y_all_t[i]

    NOTE(review): a class with this name and the same behaviour is also defined
    in an earlier cell; this definition shadows it.
    """

    def __init__(self, centers_idx, X_all_t, y_all_t, K: int):
        self.K = K
        self.X_all_t = X_all_t
        self.y_all_t = y_all_t
        # Stored as an int64 tensor so __len__ can use numel().
        self.centers = torch.as_tensor(centers_idx, dtype=torch.long)

    def __len__(self):
        return self.centers.numel()

    def __getitem__(self, j):
        stop = int(self.centers[j]) + 1   # exclusive end of the window
        start = stop - self.K
        return self.X_all_t[start:stop], self.y_all_t[stop - 1]

# Train/val datasets built from the class defined in this cell.
train_ds_xy = SeqDatasetXYFast(train_centers, X_all_t, y_all_t, K)
val_ds_xy   = SeqDatasetXYFast(val_centers,   X_all_t, y_all_t, K)

BATCH_SIZE = 1024
NUM_WORKERS = 0  # 0 => batches are assembled in the main process (no worker subprocesses)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
# Pinned host memory is requested only when the target device is CUDA.
train_loader = DataLoader(
    train_ds_xy, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=NUM_WORKERS, pin_memory=(device.type=="cuda")
)
val_loader = DataLoader(
    val_ds_xy, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=(device.type=="cuda")
)

BATCH_SIZE, NUM_WORKERS

BATCH_SIZE, NUM_WORKERS

(1024, 0)

In [None]:
# --- 2) Positional encoding (sin/cos) ---
import math

class PositionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    Builds a (1, max_len, d_model) table once and registers it as the buffer
    `pe`; `forward` adds the first K rows of that table to the input, where K
    is the size of the input's dimension 1.

    Parameters
    ----------
    d_model : size of the last (embedding) dimension; may be even or odd.
    max_len : largest sequence length the table supports.
    """

    def __init__(self, d_model: int, max_len: int):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns, so
        # trim the frequencies to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return x + positional table; x must have sequence on dim 1 and d_model on the last dim.

        Raises ValueError when the sequence is longer than `max_len`.
        """
        K = x.size(1)
        if K > self.pe.size(1):
            raise ValueError(
                f"sequence length {K} exceeds positional table length {self.pe.size(1)}"
            )
        return x + self.pe[:, :K, :]

In [None]:
# --- 3) Temporal Transformer model ---
class TemporalTransformer(nn.Module):
    """Transformer encoder over a short feature sequence with a scalar regression head.

    Pipeline: linear projection of each time step to d_model -> additive
    positional encoding -> stack of pre-norm GELU encoder layers -> pooling
    over time ("last" step or "mean" of all steps) -> MLP head -> one value
    per sample.
    """

    def __init__(
        self,
        n_features: int,
        seq_len: int,
        d_model: int = 64,
        n_heads: int = 4,
        n_layers: int = 2,
        dim_ff: int = 256,
        dropout: float = 0.1,
        pooling: str = "last",
    ):
        super().__init__()
        assert pooling in ("last", "mean")

        self.seq_len = seq_len
        self.pooling = pooling

        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation stays the same from run to run.
        self.input_proj = nn.Linear(n_features, d_model)
        self.pos_enc = PositionalEncoding(d_model=d_model, max_len=seq_len)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=dim_ff,
                dropout=dropout,
                batch_first=True,
                activation="gelu",
                norm_first=True,
            ),
            num_layers=n_layers,
        )

        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 64),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Map x of shape (B, K, n_features) to predictions of shape (B,)."""
        encoded = self.encoder(self.pos_enc(self.input_proj(x)))   # (B, K, d_model)

        # Collapse the time axis: final step only, or the average over all steps.
        pooled = encoded[:, -1, :] if self.pooling == "last" else encoded.mean(dim=1)

        return self.head(pooled).squeeze(-1)

In [20]:
# --- 4) Train / Eval utilities  ---
from torch.cuda.amp import autocast, GradScaler

def run_eval(model, loader, loss_fn, device=None):
    """Return the sample-weighted mean of `loss_fn` over every batch in `loader`.

    The model is put in eval mode and gradients are disabled. Each batch loss
    is weighted by its batch size, so a smaller final batch does not bias the
    average (this assumes `loss_fn` returns a per-batch mean).

    Parameters
    ----------
    model : torch.nn.Module to evaluate.
    loader : iterable of (x, y) batches.
    loss_fn : callable (preds, y) -> scalar tensor.
    device : where batches are moved before the forward pass. Defaults to the
        device of the model's first parameter (CPU if the model has none), so
        the function no longer depends on a notebook-level global.

    Raises
    ------
    ValueError if the loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total_loss = 0.0
    n = 0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            preds = model(x)
            loss = loss_fn(preds, y)
            bs = x.size(0)
            total_loss += loss.item() * bs
            n += bs

    if n == 0:
        raise ValueError("run_eval received a loader that produced no samples")
    return total_loss / n

def train_one_epoch(model, loader, optimizer, loss_fn, scaler=None, device=None):
    """Run one optimisation pass over `loader` and return the sample-weighted mean loss.

    Parameters
    ----------
    model : torch.nn.Module to train (switched to train mode here).
    loader : iterable of (x, y) batches.
    optimizer : torch optimizer stepping `model`'s parameters.
    loss_fn : callable (preds, y) -> scalar tensor.
    scaler : optional gradient scaler. When given, the forward pass and loss
        run under CUDA autocast and the backward/step go through the scaler;
        when None, plain full-precision training is used.
    device : where batches are moved before the forward pass. Defaults to the
        device of the model's first parameter (CPU if the model has none).

    Raises
    ------
    ValueError if the loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total_loss = 0.0
    n = 0
    use_amp = (scaler is not None)

    for x, y in loader:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        if use_amp:
            # torch.autocast("cuda") is the non-deprecated spelling of
            # torch.cuda.amp.autocast().
            with torch.autocast(device_type="cuda"):
                preds = model(x)
                loss = loss_fn(preds, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            preds = model(x)
            loss = loss_fn(preds, y)
            loss.backward()
            optimizer.step()

        # Weight by batch size so a short final batch does not skew the epoch mean.
        bs = x.size(0)
        total_loss += loss.item() * bs
        n += bs

    if n == 0:
        raise ValueError("train_one_epoch received a loader that produced no samples")
    return total_loss / n

In [None]:
# --- 5) Smoke test: one forward pass + 2-3 epochs baseline training ---
K = 6  # same value as set when the windows were built; passed to the model as seq_len
model = TemporalTransformer(
    n_features=len(feature_cols),
    seq_len=K,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dim_ff=256,
    dropout=0.1,
    pooling="last",
).to(device)

# Mean-squared-error objective optimised with AdamW.
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)


# Pull a single training batch and run it through the untrained model to
# confirm the output has one prediction per sample.
x0, y0 = next(iter(train_loader))
with torch.no_grad():
    p0 = model(x0.to(device))
print("batch preds shape:", p0.shape)
print(f"\nTotal model params: {sum(p.numel() for p in model.parameters()):,}")



batch preds shape: torch.Size([1024])

Total model params: 105,153


In [None]:
# (mini-train): run 200 batches to confirm training is moving
# NOTE(review): these steps update the same `model` and `optimizer` objects that
# the full training loop uses later, so that loop does not start from freshly
# initialised weights unless the model is rebuilt in between.

model.train()
max_batches = 200

# enumerate(..., start=1) makes `b` a 1-based batch counter.
for b, (x, y) in enumerate(train_loader, start=1):
    x = x.to(device, non_blocking=True)
    y = y.to(device, non_blocking=True)

    optimizer.zero_grad(set_to_none=True)
    pred = model(x)
    loss = loss_fn(pred, y)
    loss.backward()
    optimizer.step()

    # Report the current batch loss every 50 batches.
    if b % 50 == 0:
        print(f"batch {b}/{max_batches}  loss={loss.item():.6f}")

    if b >= max_batches:
        break

batch 50/200  loss=0.023007
batch 100/200  loss=0.028059
batch 150/200  loss=0.029491
batch 200/200  loss=0.023292


## Step 4: Full Training with Validation + Early Stopping (K=6 Baseline)

In [None]:


from torch.cuda.amp import GradScaler

# Gradient scaling (mixed precision) only on CUDA; None selects the plain
# full-precision branch inside train_one_epoch.
scaler = GradScaler() if device.type == "cuda" else None
EPOCHS = 20    # upper bound on the number of epochs
PATIENCE = 3   # stop after this many consecutive epochs without improvement

best_val = float("inf")
bad_epochs = 0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, scaler=scaler)
    val_loss = run_eval(model, val_loader, loss_fn)

    print(f"epoch {epoch:02d}: train_loss={train_loss:.6f}  val_loss={val_loss:.6f}")

    # An epoch counts as an improvement only if it beats the best validation
    # loss by more than 1e-6; the weights of each new best epoch overwrite the
    # checkpoint file.
    if val_loss < best_val - 1e-6:
        best_val = val_loss
        bad_epochs = 0
        torch.save(model.state_dict(), f"temporal_transformer_K{K}.pt")
    else:
        bad_epochs += 1
        if bad_epochs >= PATIENCE:
            print("early stop")
            break

# After the loop `model` holds the LAST epoch's weights; the best ones are in the checkpoint file.
print("best val:", best_val)

  scaler = GradScaler() if device.type == "cuda" else None
  with autocast():


epoch 01: train_loss=0.025029  val_loss=0.031643
epoch 02: train_loss=0.024758  val_loss=0.031615
epoch 03: train_loss=0.024704  val_loss=0.031624
epoch 04: train_loss=0.024673  val_loss=0.031606
epoch 05: train_loss=0.024643  val_loss=0.031605
epoch 06: train_loss=0.024617  val_loss=0.031647
epoch 07: train_loss=0.024591  val_loss=0.031622
early stop
best val: 0.03160587973733695


## Step 5: Build Test Loader with Metadata, Then Predict

In [None]:


# 1) build meta test dataset/loader
# permno as an int64 tensor; the month array is passed through unchanged under a second name.
permno_all_t = torch.from_numpy(permno_all.astype(np.int64))
month_all_np = month_all

class SeqDatasetWithMetaFast(Dataset):
    """Map-style dataset yielding (window, target, permno, yyyymm) per sample.

    Sample j is built from row index i = centers_idx[j]:
      * window : rows i-K+1 .. i (inclusive) of X_all_t
      * target : y_all_t[i]
      * permno : permno_all_t[i]
      * yyyymm : month of row i encoded as the Python int YYYYMM

    When `month_all_np` is a datetime64 array without NaT, the YYYYMM codes are
    computed once, vectorised, at construction time. Otherwise each item falls
    back to parsing the first seven characters ("YYYY-MM") of str(month).
    """

    def __init__(self, centers_idx, X_all_t, y_all_t, permno_all_t, month_all_np, K: int):
        self.centers = np.asarray(centers_idx, dtype=np.int64)
        self.X_all_t = X_all_t
        self.y_all_t = y_all_t
        self.permno_all_t = permno_all_t
        self.month_all_np = month_all_np
        self.K = K
        # None means "use the per-item string fallback".
        self._yyyymm = self._precompute_yyyymm(month_all_np)

    @staticmethod
    def _precompute_yyyymm(month_all_np):
        """Return an int64 array of YYYYMM codes, or None if the input is not clean datetime64."""
        months = np.asarray(month_all_np)
        if not np.issubdtype(months.dtype, np.datetime64):
            return None
        if np.isnat(months).any():
            # Keep the original behaviour: the failure surfaces on the offending item only.
            return None
        # Whole months elapsed since 1970-01; floor division/modulo keep
        # pre-1970 dates correct.
        months_since_epoch = months.astype("datetime64[M]").astype(np.int64)
        return (1970 + months_since_epoch // 12) * 100 + (months_since_epoch % 12 + 1)

    def __len__(self):
        return self.centers.shape[0]

    def __getitem__(self, j):
        i = int(self.centers[j])
        x_seq = self.X_all_t[i - self.K + 1 : i + 1]
        y = self.y_all_t[i]
        permno = self.permno_all_t[i]

        if self._yyyymm is not None:
            month_int = int(self._yyyymm[i])
        else:
            # "2021-01-31..." -> "2021-01" -> 202101
            month_int = int(str(self.month_all_np[i])[:7].replace("-", ""))

        return x_seq, y, permno, month_int

test_ds_meta = SeqDatasetWithMetaFast(test_centers, X_all_t, y_all_t, permno_all_t, month_all_np, K)
test_loader = DataLoader(test_ds_meta, batch_size=4096, shuffle=False, num_workers=0,
                         pin_memory=(device.type=="cuda"))


# Restore the best-validation checkpoint written by the training loop.
# weights_only=True restricts unpickling to tensors/plain containers, which is
# all a state_dict contains.
model.load_state_dict(
    torch.load(f"temporal_transformer_K{K}.pt", map_location=device, weights_only=True)
)
model.eval()


preds_list, ys_list, permnos, months = [], [], [], []

with torch.no_grad():
    for x, y, permno, month in test_loader:
        x = x.to(device, non_blocking=True)
        pred = model(x).detach().cpu().numpy()
        preds_list.append(pred)
        ys_list.append(y.numpy())
        permnos.append(permno.numpy())
        # The default collate turns the per-sample ints into a tensor; convert
        # it like the other columns so every list holds NumPy arrays.
        months.append(month.numpy())

pred_test = pd.DataFrame({
    "permno": np.concatenate(permnos),
    "month_yyyymm": np.concatenate(months),
    "y_true": np.concatenate(ys_list),
    "y_pred": np.concatenate(preds_list),
})

# YYYYMM -> first day of that month -> rolled forward to the month end.
pred_test["month"] = pd.to_datetime(pred_test["month_yyyymm"].astype(str) + "01") + pd.offsets.MonthEnd(0)
pred_test = pred_test.drop(columns=["month_yyyymm"])

print(pred_test.shape)
pred_test.head()

(362754, 4)


Unnamed: 0,permno,y_true,y_pred,month
0,10026,0.039958,0.014404,2021-01-31
1,10026,-0.007275,0.014551,2021-02-28
2,10026,0.048271,0.014604,2021-03-31
3,10026,0.066642,0.014501,2021-04-30
4,10026,-0.003058,0.01458,2021-05-31


## Step 6: Monthly decile portfolios (equal-weighted) + long-short + Sharpe

In [None]:


# Work on a copy so that adding the decile column does not modify pred_test.
tmp = pred_test.copy()

# 1) assign deciles within each month by y_pred
def assign_decile(s, n_bins: int = 10):
    """Label each value of `s` with its quantile bucket, numbered 1..n_bins.

    Values are first converted to ranks with ties broken by order of
    appearance (method="first"), so the ranks are unique before they are cut
    into `n_bins` equal-frequency buckets. Bucket 1 holds the smallest values
    and bucket `n_bins` the largest.

    Parameters
    ----------
    s : pandas Series of scores (one cross-section).
    n_bins : number of buckets; the default of 10 gives deciles.
    """
    r = s.rank(method="first")
    # labels=False returns 0-based bucket codes; shift to 1-based.
    return pd.qcut(r, n_bins, labels=False) + 1

# Deciles are assigned separately within each month's cross-section of predictions.
tmp["decile"] = tmp.groupby("month")["y_pred"].transform(assign_decile)

# 2) compute equal-weighted realized returns by month x decile
dec_ret = (
    tmp.groupby(["month", "decile"])["y_true"]
    .mean()
    .reset_index(name="ret")
)

# 3) long-short (top - bottom) each month
# One row per month, one column per decile.
wide = dec_ret.pivot(index="month", columns="decile", values="ret").sort_index()
wide["long_short"] = wide[10] - wide[1]
wide["long"] = wide[10]
wide["short"] = wide[1]

# 4) summary stats
ls = wide["long_short"].dropna()
mean_ls = ls.mean()
std_ls = ls.std(ddof=1)
# Monthly mean/std ratio scaled by sqrt(12); no risk-free rate is subtracted.
sharpe_ann = (mean_ls / std_ls) * np.sqrt(12)

# Plain mean / standard-error ratio (no autocorrelation adjustment).
tstat = mean_ls / (std_ls / np.sqrt(ls.shape[0]))

print("months:", ls.shape[0])
print("mean long-short:", mean_ls)
print("t-stat (approx):", tstat)
print("annualized Sharpe:", sharpe_ann)

wide.head()

months: 47
mean long-short: 0.019358955
t-stat (approx): 3.121737114384867
annualized Sharpe: 1.5773861437476815


decile,1,2,3,4,5,6,7,8,9,10,long_short,long,short
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-31,0.040144,0.040651,0.020047,0.035777,0.04556,0.068785,0.081641,0.098142,0.11748,0.156352,0.116208,0.156352,0.040144
2021-02-28,-0.049125,0.003958,0.010909,0.030135,0.032239,0.035247,0.038711,0.035201,0.045195,0.034574,0.083699,0.034574,-0.049125
2021-03-31,-0.02375,0.001764,0.019775,0.025411,0.03376,0.034955,0.031407,0.031447,0.032469,0.014222,0.037972,0.014222,-0.02375
2021-04-30,-0.02997,-0.002281,0.011086,0.014452,0.019631,0.02153,0.031821,0.022705,0.03726,0.061772,0.091742,0.061772,-0.02997
2021-05-31,0.072508,0.035793,0.009192,0.015938,0.008194,0.000644,0.00327,0.003085,0.007037,0.040388,-0.03212,0.040388,0.072508


## Step 7: OOS R^2 on test

In [None]:


# Promote to float64 before accumulating: summing hundreds of thousands of
# float32 squares loses precision, and R^2 here is a small difference of two
# nearly equal sums.
y = pred_test["y_true"].to_numpy(dtype=np.float64)
yhat = pred_test["y_pred"].to_numpy(dtype=np.float64)

# R^2 measured against the test-sample mean of y as the benchmark forecast.
ss_res = np.sum((y - yhat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
oos_r2 = 1.0 - ss_res / ss_tot

mse = np.mean((y - yhat) ** 2)

print("test MSE:", mse)
print("test OOS R^2:", oos_r2)

test MSE: 0.0409826
test OOS R^2: -0.0023403167724609375


## Step 8: Summarize Results + Export for Group

In [None]:


# One-row summary table; values are cast to built-in float/int so the frame
# holds plain Python scalars rather than NumPy scalar types.
results = pd.DataFrame([{
    "model": f"TemporalTransformer_K{K}",
    "test_mse": float(mse),
    "test_oos_r2": float(oos_r2),
    "ls_mean": float(mean_ls),
    "ls_tstat": float(tstat),
    "ls_sharpe_ann": float(sharpe_ann),
    "n_test_months": int(ls.shape[0]),
    "n_test_obs": int(pred_test.shape[0]),
}])

results

Unnamed: 0,model,test_mse,test_oos_r2,ls_mean,ls_tstat,ls_sharpe_ann,n_test_months,n_test_obs
0,TemporalTransformer_K6,0.040983,-0.00234,0.019359,3.121737,1.577386,47,362754


In [None]:

# Write the three artefacts to the working directory; file names carry the window length K.
pred_test.to_csv(f"pred_test_transformer_K{K}.csv", index=False)
# reset_index() turns the month index into a regular column so it is written to the CSV.
wide.reset_index().to_csv(f"ls_portfolio_monthly_transformer_K{K}.csv", index=False)
results.to_csv(f"summary_transformer_K{K}.csv", index=False)

print("saved:",
      f"pred_test_transformer_K{K}.csv",
      f"ls_portfolio_monthly_transformer_K{K}.csv",
      f"summary_transformer_K{K}.csv")

saved: pred_test_transformer_K6.csv ls_portfolio_monthly_transformer_K6.csv summary_transformer_K6.csv
