In [1]:
# FT-Transformer for tabular/time-series (hand-crafted features)
# --------------------------------------------------------------
# - Works with continuous + categorical columns
# - Multiclass classification ready
# - Class weights, early stopping, and simple scheduler included

import math
import numpy as np
import pandas as pd
from typing import List, Dict, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report

# ---------------------------
# Config (tune these later)
# ---------------------------
CFG = dict(
    d_token=192,             # token (embedding) dimension
    n_heads=8,               # attention heads (d_token % n_heads == 0)
    n_layers=4,              # transformer encoder layers
    attn_dropout=0.1,
    ff_dropout=0.2,          # feed-forward (MLP in the transformer) dropout
    token_dropout=0.1,       # Drop tokens (feature dropout) during training
    mlp_hidden=[256, 128],   # head MLP hidden dims
    mlp_dropout=0.2,
    lr=1e-3,
    weight_decay=1e-4,
    batch_size=256,
    epochs=40,
    early_stopping_patience=8,
    seed=42,
    num_workers=0,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# ---------------------------
# Utilities
# ---------------------------
def set_seed(seed: int):
    import random, os
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

class TabularDataset(Dataset):
    """
    Expects:
      - df: pandas DataFrame with both train/val features already present
      - cont_cols: list of continuous feature col names
      - cat_cols: list of categorical feature col names (already integer-encoded 0..card-1)
      - label_col: column name with integer class ids [0..num_classes-1]
      - scaler: fitted StandardScaler for continuous features (optional during training; required for val/test)
    """
    def __init__(self, df: pd.DataFrame, cont_cols: List[str], cat_cols: List[str],
                 label_col: Optional[str] = None, scaler: Optional[StandardScaler] = None):
        self.df = df.reset_index(drop=True)
        self.cont_cols = cont_cols
        self.cat_cols = cat_cols
        self.label_col = label_col

        cont = df[cont_cols].astype(float).values if cont_cols else np.zeros((len(df), 0), dtype=np.float32)
        if scaler is not None and cont.shape[1] > 0:
            cont = scaler.transform(cont)
        self.cont = cont.astype(np.float32)

        if cat_cols:
            cats = []
            for c in cat_cols:
                # ensure integer type
                cats.append(df[c].astype(int).values)
            self.cats = np.stack(cats, axis=1).astype(np.int64)  # [N, n_cat]
        else:
            self.cats = np.zeros((len(df), 0), dtype=np.int64)

        if label_col is not None:
            self.y = df[label_col].astype(int).values
        else:
            self.y = None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        x_cont = torch.from_numpy(self.cont[idx]) if self.cont.shape[1] > 0 else torch.zeros(0)
        x_cat = torch.from_numpy(self.cats[idx]) if self.cats.shape[1] > 0 else torch.zeros(0, dtype=torch.long)
        if self.y is not None:
            y = torch.tensor(self.y[idx], dtype=torch.long)
            return x_cont, x_cat, y
        return x_cont, x_cat

# ---------------------------
# FT-Transformer Components
# ---------------------------
class FeatureTokenizer(nn.Module):
    """
    Tokenizes features into a sequence of tokens:
      [CLS] + [cat_1, ..., cat_K] + [cont_1, ..., cont_M]
    - Each cat feature gets its own Embedding -> d_token
    - Each cont feature gets a linear projection -> d_token (with LayerNorm)
    """
    def __init__(self, n_cont: int, cat_cardinalities: List[int], d_token: int):
        super().__init__()
        self.n_cont = n_cont
        self.n_cat = len(cat_cardinalities)
        self.d_token = d_token

        # CLS token (learned)
        self.cls = nn.Parameter(torch.zeros(1, 1, d_token))
        nn.init.trunc_normal_(self.cls, std=0.02)

        # Cat embeddings
        self.cat_embeds = nn.ModuleList(
            [nn.Embedding(card, d_token) for card in cat_cardinalities]
        )
        for emb in self.cat_embeds:
            nn.init.trunc_normal_(emb.weight, std=0.02)

        # Continuous projection: one linear per feature (more expressive than single shared)
        self.cont_linears = nn.ModuleList([nn.Linear(1, d_token) for _ in range(n_cont)])
        self.cont_norm = nn.LayerNorm(d_token)

    def forward(self, x_cont: torch.Tensor, x_cat: torch.Tensor) -> torch.Tensor:
        """
        x_cont: [B, n_cont] float
        x_cat : [B, n_cat] long
        returns: tokens [B, 1 + n_cat + n_cont, d_token]
        """
        B = x_cont.size(0) if x_cont.ndim == 2 else x_cat.size(0)
        tokens = []

        # CLS
        cls_tok = self.cls.expand(B, -1, -1)  # [B, 1, d]
        tokens.append(cls_tok)

        # Cat tokens
        if self.n_cat > 0:
            cat_tokens = []
            for i, emb in enumerate(self.cat_embeds):
                cat_tokens.append(emb(x_cat[:, i]))  # [B, d]
            cat_tokens = torch.stack(cat_tokens, dim=1)  # [B, n_cat, d]
            tokens.append(cat_tokens)

        # Cont tokens
        if self.n_cont > 0:
            cont_tokens = []
            for i, lin in enumerate(self.cont_linears):
                v = lin(x_cont[:, i:i+1])  # [B, d]
                v = self.cont_norm(v)
                cont_tokens.append(v)
            cont_tokens = torch.stack(cont_tokens, dim=1)  # [B, n_cont, d]
            tokens.append(cont_tokens)

        return torch.cat(tokens, dim=1)  # [B, L, d]

class TokenDropout(nn.Module):
    """Feature (token) dropout: randomly drop non-CLS tokens during training."""
    def __init__(self, p: float):
        super().__init__()
        self.p = p

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.training or self.p <= 0.0:
            return x
        B, L, D = x.shape
        # keep CLS (index 0)
        mask = torch.ones(B, L, device=x.device, dtype=torch.bool)
        drop = torch.rand(B, L-1, device=x.device) < self.p
        mask[:, 1:] = ~drop
        # To keep sequence length, zero-out dropped tokens (could also replace with learnable padding)
        x = x * mask.unsqueeze(-1)
        return x

class FTTransformer(nn.Module):
    def __init__(
        self,
        n_cont: int,
        cat_cardinalities: List[int],
        d_token: int,
        n_heads: int,
        n_layers: int,
        attn_dropout: float,
        ff_dropout: float,
        token_dropout: float,
        num_classes: int,
        mlp_hidden: List[int],
        mlp_dropout: float,
    ):
        super().__init__()
        self.tokenizer = FeatureTokenizer(n_cont, cat_cardinalities, d_token)
        self.token_dropout = TokenDropout(token_dropout)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token * 4,
            dropout=ff_dropout,
            batch_first=True,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # Head on CLS token
        head_layers = []
        in_dim = d_token
        for h in mlp_hidden:
            head_layers += [nn.Linear(in_dim, h), nn.ReLU(), nn.Dropout(mlp_dropout)]
            in_dim = h
        head_layers += [nn.Linear(in_dim, num_classes)]
        self.head = nn.Sequential(*head_layers)

        # Xavier init for head
        for m in self.head:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x_cont: torch.Tensor, x_cat: torch.Tensor) -> torch.Tensor:
        # Tokens: [B, L, d]
        x = self.tokenizer(x_cont, x_cat)
        x = self.token_dropout(x)
        x = self.encoder(x)               # [B, L, d]
        cls = x[:, 0, :]                  # [B, d]
        logits = self.head(cls)           # [B, C]
        return logits

# ---------------------------
# Training / Evaluation
# ---------------------------
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    all_pred, all_true = [], []
    for x_cont, x_cat, y in loader:
        x_cont = x_cont.to(device)
        x_cat = x_cat.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        logits = model(x_cont, x_cat)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * y.size(0)
        all_pred.append(torch.argmax(logits, dim=1).detach().cpu())
        all_true.append(y.detach().cpu())

    y_true = torch.cat(all_true).numpy()
    y_pred = torch.cat(all_pred).numpy()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return running_loss / len(loader.dataset), macro_f1

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_pred, all_true = [], []
    for x_cont, x_cat, y in loader:
        x_cont = x_cont.to(device)
        x_cat = x_cat.to(device)
        y = y.to(device)

        logits = model(x_cont, x_cat)
        loss = criterion(logits, y)
        running_loss += loss.item() * y.size(0)

        all_pred.append(torch.argmax(logits, dim=1).cpu())
        all_true.append(y.cpu())

    y_true = torch.cat(all_true).numpy()
    y_pred = torch.cat(all_pred).numpy()
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return running_loss / len(loader.dataset), macro_f1, (y_true, y_pred)

def fit(
    model,
    train_loader,
    val_loader,
    optimizer,
    scheduler,
    criterion,
    device,
    epochs,
    patience
):
    best_val_f1 = -1
    best_state = None
    patience_ctr = 0

    for epoch in range(1, epochs + 1):
        tr_loss, tr_f1 = train_one_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, val_f1, _ = evaluate(model, val_loader, criterion, device)
        if scheduler is not None:
            scheduler.step()

        print(f"Epoch {epoch:03d} | train_loss={tr_loss:.4f} f1={tr_f1:.4f} "
              f"| val_loss={val_loss:.4f} f1={val_f1:.4f}")

        # Early stopping on F1
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_ctr = 0
        else:
            patience_ctr += 1
            if patience_ctr >= patience:
                print("Early stopping.")
                break

    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k, v in best_state.items()})
    return model, best_val_f1

# ---------------------------
# Example: wiring it together
# ---------------------------
def build_loaders(
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
    cont_cols: List[str],
    cat_cols: List[str],
    label_col: str,
    batch_size: int,
    num_workers: int
):
    # Standardize continuous on train only
    scaler = StandardScaler()
    if cont_cols:
        scaler.fit(df_train[cont_cols].astype(float).values)

    ds_train = TabularDataset(df_train, cont_cols, cat_cols, label_col, scaler)
    ds_val   = TabularDataset(df_val,   cont_cols, cat_cols, label_col, scaler)

    train_loader = DataLoader(ds_train, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=True)
    val_loader   = DataLoader(ds_val, batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, pin_memory=True)
    return train_loader, val_loader, scaler

def infer_cardinalities(df: pd.DataFrame, cat_cols: List[str]) -> List[int]:
    cards = []
    for c in cat_cols:
        cards.append(int(df[c].max()) + 1)  # assumes 0..max encoding
    return cards

def run_ft_transformer(
    df_train: pd.DataFrame,
    df_val: pd.DataFrame,
    cont_cols: List[str],
    cat_cols: List[str],
    label_col: str,
    num_classes: int,
    cfg: Dict = CFG
):
    set_seed(cfg["seed"])
    device = cfg["device"]

    # Dataloaders
    train_loader, val_loader, scaler = build_loaders(
        df_train, df_val, cont_cols, cat_cols, label_col,
        batch_size=cfg["batch_size"], num_workers=cfg["num_workers"]
    )

    # Cardinalities
    cat_cards = infer_cardinalities(df_train, cat_cols) if cat_cols else []

    # Model
    model = FTTransformer(
        n_cont=len(cont_cols),
        cat_cardinalities=cat_cards,
        d_token=cfg["d_token"],
        n_heads=cfg["n_heads"],
        n_layers=cfg["n_layers"],
        attn_dropout=cfg["attn_dropout"],
        ff_dropout=cfg["ff_dropout"],
        token_dropout=cfg["token_dropout"],
        num_classes=num_classes,
        mlp_hidden=cfg["mlp_hidden"],
        mlp_dropout=cfg["mlp_dropout"],
    ).to(device)

    # Class weights (optional but useful for imbalance)
    y_train = df_train[label_col].astype(int).values
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    # Map to full range [0..num_classes-1]
    cw_vec = np.ones(num_classes, dtype=np.float32)
    for i, c in enumerate(classes):
        cw_vec[c] = class_weights[i]
    criterion = nn.CrossEntropyLoss(weight=torch.tensor(cw_vec, dtype=torch.float32, device=device))

    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg["epochs"])

    model, best_val_f1 = fit(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        criterion=criterion,
        device=device,
        epochs=cfg["epochs"],
        patience=cfg["early_stopping_patience"]
    )

    # Final val report
    _, _, (y_true, y_pred) = evaluate(model, val_loader, criterion, device)
    print("\nValidation classification report:")
    print(classification_report(y_true, y_pred, digits=4))

    return model, scaler


  from pandas.core import (


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# 1) Load your dataset
# -----------------------------
df = pd.read_csv(Path("C:/Users/lamia/Downloads/augmented_dataset1_trainval.csv"))

# -----------------------------
# 2) Define label & drop columns
# -----------------------------
label_cols = ['oestrus', 'calving', 'lameness', 'mastitis', 'other_disease', 'OK']
drop_cols  = ['cow', 'duration_hours']   # will be removed from feature set

# Build a single multiclass target from the one-hot columns
# (argmax over the one-hot columns; ensure they are numeric 0/1)
labels_matrix = df[label_cols].astype(float).values
y_int = labels_matrix.argmax(axis=1)
df['target'] = y_int

# -----------------------------
# 3) Train/Val split (grouped by cow if available)
# -----------------------------
if 'cow' in df.columns:
    # Grouped split to prevent leakage across cows
    gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
    groups = df['cow']
    train_idx, val_idx = next(gss.split(df, y_int, groups))
else:
    # Stratified split on labels
    sss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
    train_idx, val_idx = next(sss.split(df, y_int))

df_train = df.iloc[train_idx].copy()
df_val   = df.iloc[val_idx].copy()

# -----------------------------
# 4) Build feature lists
# -----------------------------
# Remove labels & drop_cols from the feature space
excluded = set(label_cols + drop_cols + ['target'])
candidate_cols = [c for c in df.columns if c not in excluded]

# Continuous columns = numeric dtypes
cont_cols = [c for c in candidate_cols if np.issubdtype(df[c].dtype, np.number)]

# Categorical columns (integer-encoded 0..card-1).
# If you have encoded IDs like 'cow_id_enc', add them here.
cat_cols = []  # e.g., ['cow_id_enc', 'month_enc'] if present & integer-encoded

# Sanity check
assert len(cont_cols) + len(cat_cols) > 0, "No features found. Check your drop/feature lists."

# -----------------------------
# 5) Configure FT-Transformer
# -----------------------------
CFG.update({
    "d_token": 192,
    "n_heads": 8,                 # must divide d_token
    "n_layers": 4,
    "attn_dropout": 0.10,
    "ff_dropout": 0.20,
    "token_dropout": 0.10,
    "mlp_hidden": [512, 256, 128],
    "mlp_dropout": 0.30,

    "batch_size": 128,
    "lr": 5e-4,
    "weight_decay": 1e-4,
    "epochs": 200,
    "early_stopping_patience": 15,
})

num_classes = len(label_cols)

# -----------------------------
# 6) Train
# -----------------------------
model, scaler = run_ft_transformer(
    df_train=df_train,
    df_val=df_val,
    cont_cols=cont_cols,
    cat_cols=cat_cols,
    label_col='target',
    num_classes=num_classes,
    cfg=CFG
)

# -----------------------------
# 7) OPTIONAL: Temperature scaling for calibrated probs
#    (Good when you later want reliable confidence estimates)
# -----------------------------
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

class TemperatureScaler(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.T = nn.Parameter(torch.ones(1) * 1.0)

    def forward(self, x_cont, x_cat):
        logits = self.model(x_cont, x_cat)
        return logits / self.T.clamp(min=1e-6)

    def fit(self, loader, device):
        self.to(device)
        self.model.eval()
        nll = nn.CrossEntropyLoss()
        optimizer = torch.optim.LBFGS([self.T], lr=0.01, max_iter=50)

        logits_list, labels_list = [], []
        with torch.no_grad():
            for xc, xa, y in loader:
                xc, xa, y = xc.to(device), xa.to(device), y.to(device)
                logits_list.append(self.model(xc, xa))
                labels_list.append(y)
        logits = torch.cat(logits_list)
        labels = torch.cat(labels_list)

        def closure():
            optimizer.zero_grad()
            loss = nll(logits / self.T.clamp(min=1e-6), labels)
            loss.backward()
            return loss

        optimizer.step(closure)
        print(f"Fitted temperature: {self.T.item():.4f}")

# Build a val loader with the same scaler to fit T
val_ds = TabularDataset(df_val, cont_cols, cat_cols, label_col='target', scaler=scaler)
val_loader = DataLoader(val_ds, batch_size=CFG["batch_size"], shuffle=False)

temp_model = TemperatureScaler(model)
temp_model.fit(val_loader, device=CFG["device"])

# -----------------------------
# 8) Example: get calibrated predictions on validation
# -----------------------------
import numpy as np
from sklearn.metrics import classification_report

model.eval()
temp_model.eval()
all_probs, all_preds, all_true = [], [], []

with torch.no_grad():
    for xc, xa, y in val_loader:
        xc, xa = xc.to(CFG["device"]), xa.to(CFG["device"])
        logits = temp_model(xc, xa)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        all_probs.append(probs)
        all_preds.append(probs.argmax(axis=1))
        all_true.append(y.numpy())

all_probs = np.vstack(all_probs)
all_preds = np.concatenate(all_preds)
all_true  = np.concatenate(all_true)

print("\nValidation (calibrated) report:")
print(classification_report(all_true, all_preds, target_names=label_cols, digits=4))




Epoch 001 | train_loss=1.0023 f1=0.5884 | val_loss=1.9947 f1=0.3712




Epoch 002 | train_loss=0.5938 f1=0.7560 | val_loss=2.0737 f1=0.4111




Epoch 003 | train_loss=0.4806 f1=0.8061 | val_loss=2.4612 f1=0.4086


