In [8]:
# Hybrid BiLSTM (text) + MLP (tabular) for disaster tweet classification using your engineered features.
# Expects preprocessed DataFrames `train_fe` and `test_fe` in memory (as created by your build_features()).
# Will prefer the 'text_kw' column if present, else fallback to 'text_clean'.
from __future__ import annotations

import html
import re
import os
import math
import random
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [2]:
# -------------------------
# Reproducibility and device
# -------------------------
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs.

    Also switches cuDNN to deterministic mode and turns off its autotuner so
    repeated runs pick the same kernels.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once for the notebook session, then pick the compute device:
# CUDA when torch reports it as available, otherwise CPU.
set_seed(42)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
# -------------------------
# Tokenization (simple whitespace; text is already normalized)
# -------------------------
def tokenize(text: str) -> List[str]:
    """Split `text` on whitespace; anything that is not a `str` yields no tokens."""
    return text.strip().split() if isinstance(text, str) else []

PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

class Vocab:
    """Token <-> index lookup with the padding and unknown tokens at indices 0 and 1."""

    def __init__(self, counter: Dict[str, int], min_freq: int = 2, max_size: Optional[int] = 30000):
        """Build the vocabulary from token counts.

        Tokens are ranked by descending count (ties broken alphabetically), the
        ranking is cut to `max_size` entries when a cap is given, and only
        tokens seen at least `min_freq` times are kept.
        """
        specials = [PAD_TOKEN, UNK_TOKEN]
        self.itos = list(specials)
        self.stoi = {tok: idx for idx, tok in enumerate(specials)}

        ranked = sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))
        if max_size is not None:
            ranked = ranked[:max_size]

        for token, count in ranked:
            # Skip rare tokens and anything already registered (e.g. a special token found in the text).
            if count >= min_freq and token not in self.stoi:
                self.stoi[token] = len(self.itos)
                self.itos.append(token)

        self.pad_index = self.stoi[PAD_TOKEN]
        self.unk_index = self.stoi[UNK_TOKEN]

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.itos)

    def encode(self, tokens: List[str]) -> List[int]:
        """Map tokens to indices; out-of-vocabulary tokens map to the unknown index."""
        lookup, unk = self.stoi, self.unk_index
        return [lookup.get(token, unk) for token in tokens]

def build_vocab(texts: List[str], min_freq: int = 2, max_size: int = 30000) -> Vocab:
    """Count whitespace tokens over `texts` and build a `Vocab` from the counts."""
    from collections import Counter
    token_counts = Counter(tok for text in texts for tok in tokenize(text))
    return Vocab(token_counts, min_freq=min_freq, max_size=max_size)


In [4]:
# -------------------------
# Dataset / Collate
# -------------------------
class HybridDataset(Dataset):
    """Pairs each text with its tabular feature row (and optional label)."""

    def __init__(
        self,
        texts: List[str],
        tabular: np.ndarray,
        vocab: Vocab,
        labels: Optional[np.ndarray] = None,
        max_len: int = 60
    ):
        """Store the inputs; tabular features and labels are cast to float32."""
        self.texts = texts
        self.tabular = tabular.astype(np.float32)
        self.vocab = vocab
        self.labels = None if labels is None else labels.astype(np.float32)
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with `input_ids`, `length`, `tab` and, when labels exist, `label`."""
        tokens = tokenize(self.texts[idx])
        if self.max_len is not None:
            # Slicing is a no-op for sequences that are already short enough.
            tokens = tokens[: self.max_len]
        token_ids = self.vocab.encode(tokens)

        sample = {
            "input_ids": torch.tensor(token_ids, dtype=torch.long),
            "length": len(token_ids),
            "tab": torch.tensor(self.tabular[idx], dtype=torch.float),
        }
        if self.labels is not None:
            sample["label"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def collate_fn(batch, pad_idx: int):
    """Merge dataset items into one batch, right-padding token ids with `pad_idx`.

    Returns a dict with `input_ids` (B, longest length), `lengths` (B,),
    `tab` (B, n_features) and, when the items carry labels, `labels` (B,).
    """
    lengths = [item["length"] for item in batch]
    width = max(lengths) if lengths else 0

    input_ids = torch.full((len(batch), width), pad_idx, dtype=torch.long)
    for row, item in enumerate(batch):
        ids = item["input_ids"]
        input_ids[row, : ids.size(0)] = ids

    collated = {
        "input_ids": input_ids,
        "lengths": torch.tensor(lengths, dtype=torch.long),
        "tab": torch.stack([item["tab"] for item in batch], dim=0),
    }
    label_list = [item["label"] for item in batch if "label" in item]
    if label_list:
        collated["labels"] = torch.stack(label_list, dim=0)
    return collated

In [5]:
# -------------------------
# Model: BiLSTM encoder + Tabular MLP + Fusion head
# -------------------------
class TextEncoderBiLSTM(nn.Module):
    """Embeds token ids and encodes them with a single-layer bidirectional LSTM.

    The output is the concatenation of the final forward and backward hidden
    states, followed by dropout; its width is exposed as `out_dim`.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_size: int, pad_idx: int, dropout: float = 0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_size, batch_first=True, bidirectional=True, num_layers=1)
        self.dropout = nn.Dropout(dropout)
        self.out_dim = hidden_size * 2

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
        """Encode a padded batch.

        Args:
            input_ids: (B, T) token indices, right-padded.
            lengths: (B,) number of real tokens per row.

        Returns:
            (B, 2 * hidden_size) tensor.

        Rows whose length is 0 (empty text) are treated as a single padding
        token: pack_padded_sequence rejects non-positive lengths, so without
        this a single empty text would crash the whole batch.
        """
        if input_ids.size(1) == 0:
            # Every text in the batch is empty: give each row one padding token to pack.
            pad_idx = self.embedding.padding_idx
            input_ids = input_ids.new_full((input_ids.size(0), 1), 0 if pad_idx is None else pad_idx)
        emb = self.embedding(input_ids)
        # pack_padded_sequence expects CPU lengths that are all >= 1.
        safe_lengths = lengths.clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(emb, safe_lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)
        # h_n: (num_layers*2, B, hidden); the last two slices are the forward/backward directions.
        h_f = h_n[-2]  # (B, hidden)
        h_b = h_n[-1]  # (B, hidden)
        h = torch.cat([h_f, h_b], dim=1)  # (B, 2*hidden)
        return self.dropout(h)

class TabularMLP(nn.Module):
    """Encoder for tabular features: BatchNorm -> Linear -> ReLU -> Dropout -> Linear -> ReLU."""

    def __init__(self, in_dim: int, hidden: int = 64, dropout: float = 0.3):
        super().__init__()
        # Layer order is kept stable so state_dict keys stay `net.<index>.*`.
        layers = [
            nn.BatchNorm1d(in_dim),
            nn.Linear(in_dim, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*layers)
        self.out_dim = hidden

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map (B, in_dim) features to a (B, hidden) representation."""
        encoded = self.net(x)
        return encoded

class HybridClassifier(nn.Module):
    """BiLSTM text encoder, optionally fused with a tabular MLP, feeding a one-logit head."""

    def __init__(
        self,
        vocab_size: int,
        pad_idx: int,
        embed_dim: int = 128,
        hidden_size: int = 128,
        txt_dropout: float = 0.3,
        tab_in_dim: int = 0,
        tab_hidden: int = 64,
        tab_dropout: float = 0.3,
        fusion_dropout: float = 0.3,
    ):
        super().__init__()
        self.text_encoder = TextEncoderBiLSTM(vocab_size, embed_dim, hidden_size, pad_idx, dropout=txt_dropout)
        # The tabular branch only exists when there is at least one tabular feature.
        self.use_tab = tab_in_dim > 0
        self.tab_encoder = (
            TabularMLP(tab_in_dim, hidden=tab_hidden, dropout=tab_dropout) if self.use_tab else None
        )
        fusion_in = self.text_encoder.out_dim
        if self.use_tab:
            fusion_in += self.tab_encoder.out_dim
        self.fusion = nn.Sequential(
            nn.Dropout(fusion_dropout),
            nn.Linear(fusion_in, 1)
        )

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor, tab: Optional[torch.Tensor] = None):
        """Return one raw logit per sample, shape (B,)."""
        parts = [self.text_encoder(input_ids, lengths)]
        if self.use_tab and tab is not None:
            parts.append(self.tab_encoder(tab))
        fused = parts[0] if len(parts) == 1 else torch.cat(parts, dim=1)
        return self.fusion(fused).squeeze(1)

In [6]:
# -------------------------
# Training / Evaluation
# -------------------------
def batch_f1_from_logits(logits: torch.Tensor, labels: torch.Tensor, thr: float = 0.5) -> float:
    """F1 of one batch: sigmoid the logits, threshold at `thr`, compare with `labels`."""
    y_true = labels.detach().cpu().numpy().astype(np.int32)
    y_prob = torch.sigmoid(logits).detach().cpu().numpy()
    y_pred = (y_prob >= thr).astype(np.int32)
    return f1_score(y_true, y_pred)

def train_one_epoch(model, loader, optimizer, criterion, device, clip: float = 1.0, thr: float = 0.5):
    """Run one optimisation pass over `loader`.

    Gradients are clipped to a max norm of `clip`. Returns
    (mean batch loss, mean of the per-batch F1 scores at threshold `thr`).
    """
    model.train()
    batch_losses, batch_f1s = [], []
    for batch in loader:
        input_ids, lengths, tab, labels = (
            batch[key].to(device) for key in ("input_ids", "lengths", "tab", "labels")
        )

        optimizer.zero_grad(set_to_none=True)
        logits = model(input_ids, lengths, tab)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())
        batch_f1s.append(batch_f1_from_logits(logits, labels, thr=thr))

    mean_loss = float(np.mean(batch_losses))
    mean_f1 = float(np.mean(batch_f1s))
    return mean_loss, mean_f1

@torch.no_grad()
def eval_epoch(model, loader, criterion, device, thr: float = 0.5):
    """Evaluate `model` on `loader` without gradient tracking.

    Returns a 7-tuple: mean batch loss, mean per-batch F1, then precision,
    recall and F1 computed over the whole loader at threshold `thr`, followed
    by the arrays of predicted probabilities and integer labels.
    """
    model.eval()
    batch_losses, batch_f1s = [], []
    prob_list, true_list = [], []
    for batch in loader:
        input_ids, lengths, tab, labels = (
            batch[key].to(device) for key in ("input_ids", "lengths", "tab", "labels")
        )

        logits = model(input_ids, lengths, tab)
        batch_losses.append(criterion(logits, labels).item())

        probs = torch.sigmoid(logits).cpu().numpy()
        batch_true = labels.cpu().numpy().astype(np.int32)
        batch_f1s.append(f1_score(batch_true, (probs >= thr).astype(np.int32)))

        # Accumulate as Python scalars, exactly like the per-sample lists the metrics are built from.
        prob_list.extend(probs.tolist())
        true_list.extend(batch_true.tolist())

    probs_all = np.array(prob_list)
    pr, rc, f1, _ = precision_recall_fscore_support(
        true_list, (probs_all >= thr).astype(int), average="binary", zero_division=0
    )
    return float(np.mean(batch_losses)), float(np.mean(batch_f1s)), pr, rc, f1, probs_all, np.array(true_list)

def find_best_threshold(probs: np.ndarray, y_true: np.ndarray, grid=None) -> float:
    """Return the threshold from `grid` with the highest F1 (first one wins ties).

    The default grid is 0.20..0.80 in steps of 0.01; 0.5 is returned when the
    grid is empty.
    """
    candidates = np.linspace(0.2, 0.8, 61) if grid is None else grid
    winner, top_score = 0.5, -1.0
    for candidate in candidates:
        score = f1_score(y_true, (probs >= candidate).astype(int))
        if score > top_score:
            winner, top_score = float(candidate), score
    return winner

In [19]:
# -------------------------
# Glue: prepare data and run
# -------------------------
# Tabular feature columns used when run_training() is called without
# `feature_names`; only the ones present in the training frame are kept.
DEFAULT_TABULAR_FEATURES = [
    "text_len", "word_count",
    "url_count", "mention_count", "hashtag_count",
    "has_url", "has_mention", "has_hashtag",
    "has_location", "has_keyword",
    "keyword_te",
]

def select_text_column(df: pd.DataFrame, prefer: str = "text_kw", fallback: str = "text_clean") -> str:
    """Return `prefer` if it is a column of `df`, else `fallback`; raise KeyError if neither exists."""
    for candidate in (prefer, fallback):
        if candidate in df.columns:
            return candidate
    raise KeyError(f"Neither {prefer} nor {fallback} found in DataFrame.")

def build_tab_matrix(df: pd.DataFrame, feature_names: List[str], stats: Optional[Dict[str, Tuple[float,float]]] = None):
    """Build the float32 tabular feature matrix, standardising continuous columns.

    Args:
        df: frame holding the engineered feature columns.
        feature_names: columns to extract, in output order.
        stats: per-column (mean, std) fitted on the training split. When None,
            the statistics are fitted on `df` itself.

    Returns:
        (X, fitted_stats): X is a float32 array of shape (len(df), len(feature_names));
        fitted_stats maps each standardised column to its (mean, std).

    Raises:
        KeyError: if a requested feature is absent from `df`, or `stats` is
            given but lacks an entry for a continuous feature.
    """
    continuous = ["text_len", "word_count", "url_count", "mention_count", "hashtag_count", "keyword_te"]
    binary = ["has_url", "has_mention", "has_hashtag", "has_location", "has_keyword"]

    # Validate BEFORE selecting: df[feature_names] would otherwise raise pandas'
    # own KeyError first and this clearer message would never be reached.
    for col in feature_names:
        if col not in df.columns:
            raise KeyError(f"Required feature '{col}' is missing from input DataFrame.")
    X = df[feature_names].copy()

    # Standardise continuous columns with the provided stats, or fit them here.
    fitted_stats = {} if stats is None else dict(stats)
    for col in continuous:
        if col not in X.columns:
            continue
        if stats is None:
            mu = float(X[col].mean())
            sd = float(X[col].std(ddof=0))
            # A constant (or empty / all-NaN) column has no usable scale: leave it unscaled.
            if sd == 0 or not np.isfinite(sd):
                sd = 1.0
            fitted_stats[col] = (mu, sd)
        else:
            if col not in stats:
                raise KeyError(f"No standardization stats provided for feature '{col}'.")
            mu, sd = stats[col]
        X[col] = (X[col] - mu) / (sd if sd != 0 else 1.0)

    # Binary flags are passed through as floats.
    for col in binary:
        if col in X.columns:
            X[col] = X[col].astype(float)
    return X.values.astype(np.float32), fitted_stats

In [27]:
def run_training(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    target_col: str = "target",
    min_freq: int = 2,              # controls vocab pruning (used in build_vocab)
    max_vocab_size: int = 30000,    # caps vocab size (used in build_vocab)
    max_len: int = 60,              # max sequence length (used in HybridDataset)
    embedding_dim: int = 128,       # text embedding size (used in HybridClassifier)
    hidden_size: int = 128,         # BiLSTM hidden size per direction (used in HybridClassifier)
    txt_dropout: float = 0.3,       # dropout on text encoder output (used in HybridClassifier)
    tab_hidden: int = 64,           # hidden width for tabular MLP (used in HybridClassifier)
    tab_dropout: float = 0.3,       # dropout in tabular MLP (used in HybridClassifier)
    fusion_dropout: float = 0.3,    # dropout before final linear head (used in HybridClassifier)
    batch_size: int = 64,           # DataLoader batch size
    lr: float = 1e-3,               # optimizer learning rate
    epochs: int = 10,               # max training epochs
    early_stopping_patience: int = 3,  # early stopping patience on val F1
    use_pos_weight: bool = False,   # toggles BCEWithLogitsLoss(pos_weight=...)
    initial_threshold: float = 0.5, # starting decision threshold for F1 computation
    feature_names: Optional[List[str]] = None,  # which tabular features to use
    val_size: float = 0.15,         # validation split size
    seed: int = 42,                 # random seed (reproducibility and stratified split)
    weight_decay: float = 1e-4,     # Adam weight decay (L2 penalty)
    out_path: str = "submission_hybrid_lstm.csv",  # where the submission CSV is written
):
    """Train the hybrid text + tabular classifier, tune the threshold, write a submission.

    Steps: stratified train/val split -> vocabulary and tabular statistics
    fitted on the training split only -> training with per-epoch threshold
    tuning and early stopping on validation F1 -> restore the best weights ->
    final threshold tuning and validation report -> test predictions saved to
    `out_path`.

    Returns:
        (model, artifacts) where artifacts holds the vocab, feature names,
        standardisation stats, tuned threshold and best validation F1.
    """
    set_seed(seed)

    def _prf(probs, y_true, threshold):
        # Same computation eval_epoch performs, reused on probabilities we already
        # hold so the validation set does not need a second forward pass.
        p, r, f, _ = precision_recall_fscore_support(
            y_true, (probs >= threshold).astype(int), average="binary", zero_division=0
        )
        return p, r, f

    # Choose text column (prefers 'text_kw' if present)
    text_col = select_text_column(train_df)
    print(f"Using text column: {text_col}")

    # Define tabular features
    if feature_names is None:
        feature_names = [c for c in DEFAULT_TABULAR_FEATURES if c in train_df.columns]

    # Split train/val (stratified on the target)
    tr_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=seed, stratify=train_df[target_col]
    )
    print(f"Train size: {len(tr_df)} | Val size: {len(val_df)}")

    # Build vocab on training text only
    vocab = build_vocab(
        tr_df[text_col].tolist(), min_freq=min_freq, max_size=max_vocab_size
    )
    print(f"Vocab size: {len(vocab)} (min_freq={min_freq}, max_size={max_vocab_size})")

    # Prepare tabular matrices with standardization (fit on train, apply to val/test)
    Xtr_tab, tab_stats = build_tab_matrix(tr_df, feature_names, stats=None)
    Xval_tab, _ = build_tab_matrix(val_df, feature_names, stats=tab_stats)
    Xte_tab, _ = build_tab_matrix(test_df, feature_names, stats=tab_stats)

    # Prepare datasets/loaders
    tr_ds = HybridDataset(
        tr_df[text_col].tolist(), Xtr_tab, vocab, labels=tr_df[target_col].values, max_len=max_len
    )
    val_ds = HybridDataset(
        val_df[text_col].tolist(), Xval_tab, vocab, labels=val_df[target_col].values, max_len=max_len
    )
    te_ds = HybridDataset(
        test_df[text_col].tolist(), Xte_tab, vocab, labels=None, max_len=max_len
    )

    def _collate(batch):
        return collate_fn(batch, vocab.pad_index)

    # BatchNorm1d (tabular branch) cannot normalise a single-sample batch in
    # training mode; drop the trailing batch only when it would hold exactly one row.
    drop_lone_tail = Xtr_tab.shape[1] > 0 and len(tr_ds) > 1 and len(tr_ds) % batch_size == 1
    tr_loader = DataLoader(tr_ds, batch_size=batch_size, shuffle=True,
                           drop_last=drop_lone_tail, collate_fn=_collate)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, collate_fn=_collate)
    te_loader = DataLoader(te_ds, batch_size=batch_size, shuffle=False, collate_fn=_collate)

    # Model
    model = HybridClassifier(
        vocab_size=len(vocab),
        pad_idx=vocab.pad_index,
        embed_dim=embedding_dim,
        hidden_size=hidden_size,
        txt_dropout=txt_dropout,
        tab_in_dim=Xtr_tab.shape[1],
        tab_hidden=tab_hidden,
        tab_dropout=tab_dropout,
        fusion_dropout=fusion_dropout,
    ).to(DEVICE)

    # Loss (optionally re-weighting positives by the negative/positive ratio)
    if use_pos_weight:
        pos_ratio = tr_df[target_col].mean()
        pos_weight = torch.tensor([(1.0 - pos_ratio) / max(pos_ratio, 1e-6)], dtype=torch.float, device=DEVICE)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        print(f"Using pos_weight={pos_weight.item():.4f}")
    else:
        criterion = nn.BCEWithLogitsLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Halve the learning rate when validation loss stops improving.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=1
    )

    # Training loop with early stopping on val F1
    best_val_f1, best_state, patience = -1.0, None, 0
    thr = initial_threshold

    for epoch in range(1, epochs + 1):
        tr_loss, tr_f1 = train_one_epoch(model, tr_loader, optimizer, criterion, DEVICE, thr=thr)
        val_loss, _, _, _, _, val_probs, val_true = eval_epoch(model, val_loader, criterion, DEVICE, thr=thr)
        scheduler.step(val_loss)

        # Threshold tuning each epoch to maximize F1 on current model
        thr = find_best_threshold(val_probs, val_true)
        pr, rc, f1 = _prf(val_probs, val_true, thr)

        print(f"Epoch {epoch:02d}/{epochs} | "
              f"train_loss={tr_loss:.4f} train_f1~={tr_f1:.4f} | "
              f"val_loss={val_loss:.4f} val_f1={f1:.4f} (P={pr:.3f}, R={rc:.3f}) | thr={thr:.3f}")

        if f1 > best_val_f1:
            best_val_f1 = f1
            # copy=True is essential: on a CPU model `.cpu()` returns the live
            # tensors, so the "best" snapshot would be overwritten by later epochs.
            best_state = {k: v.detach().to("cpu", copy=True) for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}. Best val F1: {best_val_f1:.4f}")
                break

    # Load best model
    if best_state is not None:
        model.load_state_dict({k: v.to(DEVICE) for k, v in best_state.items()})

    # Final val metrics: re-score with the restored weights, then tune the threshold on them
    _, _, _, _, _, val_probs, val_true = eval_epoch(model, val_loader, criterion, DEVICE, thr=thr)
    thr = find_best_threshold(val_probs, val_true)  # final tune
    pr, rc, f1 = _prf(val_probs, val_true, thr)

    print("\nFinal Validation Metrics:")
    print(f"Threshold={thr:.3f} Precision={pr:.4f} Recall={rc:.4f} F1={f1:.4f}")

    # Detailed report
    y_pred = (val_probs >= thr).astype(int)
    print("\nClassification report (val):")
    print(classification_report(val_true, y_pred, digits=4))

    # Inference on test
    model.eval()
    test_probs = []
    with torch.no_grad():
        for batch in te_loader:
            logits = model(batch["input_ids"].to(DEVICE), batch["lengths"].to(DEVICE), batch["tab"].to(DEVICE))
            test_probs.extend(torch.sigmoid(logits).cpu().numpy().tolist())
    test_preds = [1 if p >= thr else 0 for p in test_probs]

    # Save submission
    if "id" in test_df.columns:
        submission = pd.DataFrame({"id": test_df["id"].values, "target": test_preds})
    else:
        submission = pd.DataFrame({"target": test_preds})
    submission.to_csv(out_path, index=False)
    print(f"\nSaved submission to: {out_path}")

    artifacts = {
        "vocab": vocab,
        "feature_names": feature_names,
        "tab_stats": tab_stats,
        "threshold": thr,
        "best_val_f1": best_val_f1,
    }
    return model, artifacts

In [24]:
# Precompiled regex patterns shared by the text-normalisation and counting helpers.
# URLs: anything starting with http://, https:// or www. up to the next whitespace.
_URL_RE = re.compile(r"(https?://\S+|www\.\S+)", flags=re.IGNORECASE)
# Mentions / hashtags: '@' or '#' followed by one or more word characters.
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_HASHTAG_TOKEN_RE = re.compile(r"#(\w+)")  # capture token without '#'

def _to_str(x: object) -> str:
    if pd.isna(x):
        return ""
    return str(x)


def normalize_text(s: str) -> str:
    """
    Normalize tweet text, in this order:
      - missing value -> empty string
      - HTML unescape
      - URLs -> 'URL', @mentions -> 'USER'
      - hashtags: drop the '#' but keep the word
      - lowercase (so the placeholders end up as 'url' / 'user')
      - collapse whitespace runs to single spaces
    """
    text = html.unescape(_to_str(s))

    # Applied in sequence: URLs first, then mentions, then hashtag markers.
    substitutions = (
        (_URL_RE, "URL"),
        (_MENTION_RE, "USER"),
        (_HASHTAG_TOKEN_RE, r"\1"),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    return " ".join(text.lower().split())

def normalize_keyword(s: str) -> str:
    """
    Normalize a keyword value:
      - missing / empty -> ''
      - '%20' and '-' become spaces
      - lowercase
      - collapse whitespace runs to single spaces
    """
    raw = _to_str(s)
    if not raw:
        return ""
    spaced = raw.replace("%20", " ").replace("-", " ")
    return " ".join(spaced.lower().split())

def normalize_location(s: str) -> str:
    """
    Light location normalization:
      - missing / empty -> ''
      - lowercase
      - collapse whitespace runs to single spaces
    """
    raw = _to_str(s)
    return " ".join(raw.lower().split()) if raw else ""

def extract_text_counters(raw_text: str) -> Dict[str, int]:
    """
    Count regex matches in the RAW (uncleaned) text and return them as
    {'url_count', 'mention_count', 'hashtag_count'}.
    """
    raw = _to_str(raw_text)
    patterns = {
        "url_count": _URL_RE,
        "mention_count": _MENTION_RE,
        "hashtag_count": _HASHTAG_RE,
    }
    return {name: len(pattern.findall(raw)) for name, pattern in patterns.items()}

def _cv_target_encode(
    cat_series: pd.Series,
    y: pd.Series,
    n_splits: int = 5,
    random_state: int = 42,
    smoothing_min_samples: int = 1,
) -> Tuple[pd.Series, Dict[str, float], float]:
    """
    Cross-validated target encoding for a single categorical column.

    Args:
        cat_series: categorical values; missing entries are treated as the
            empty-string category.
        y: binary target aligned with `cat_series`.
        n_splits, random_state: forwarded to StratifiedKFold (shuffled).
        smoothing_min_samples: accepted for interface compatibility but
            currently unused (the encoding is a plain, unsmoothed mean).

    Returns:
        oof_te: pd.Series (indexed like `cat_series`) of out-of-fold encoded
            values; categories unseen in a fold's training part get the global mean.
        full_mapping: dict mapping category -> mean(target) on all rows.
        global_mean: float, overall positive rate used as fallback.
    """
    # Fill missing values BEFORE the string cast: astype(str) alone would turn
    # NaN into the literal "nan" and a later fillna("") would never fire.
    cat_series = cat_series.astype(object).fillna("").astype(str)
    y = y.astype(int).values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros(len(cat_series), dtype=float)

    # Precompute global mean
    global_mean = float(np.mean(y))

    # Build OOF means: each row is encoded with statistics from the other folds only.
    for train_idx, valid_idx in skf.split(np.zeros(len(y)), y):
        fold_map = (
            pd.DataFrame({"cat": cat_series.iloc[train_idx].values, "y": y[train_idx]})
            .groupby("cat")["y"]
            .mean()
            .to_dict()
        )
        encoded = cat_series.iloc[valid_idx].map(fold_map).fillna(global_mean)
        oof[valid_idx] = encoded.to_numpy(dtype=float)

    # Full mapping for inference on test
    full_mapping = (
        pd.DataFrame({"cat": cat_series.values, "y": y})
        .groupby("cat")["y"]
        .mean()
        .to_dict()
    )

    return pd.Series(oof, index=cat_series.index), full_mapping, global_mean

def build_features(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    *,
    text_col: str = "text",
    keyword_col: str = "keyword",
    location_col: str = "location",
    target_col: str = "target",
    add_keyword_to_text: bool = True,
    n_splits: int = 5,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, float], float]:
    """
    Derive cleaned text columns and numeric features for the train and test frames.

    The inputs are copied, never mutated. Returns
      train_out, test_out, keyword_te_mapping, keyword_te_global_mean

    Columns of the outputs, in order:
      - id (if present), target (train only, if present)
      - text_clean, keyword_clean, location_clean
      - text_kw (only when add_keyword_to_text=True)
      - text_len, word_count
      - url_count, mention_count, hashtag_count
      - has_url, has_mention, has_hashtag, has_location, has_keyword
      - keyword_te
    """
    train, test = train_df.copy(), test_df.copy()
    frames = (train, test)

    # 1) Cleaned keyword / location
    for frame in frames:
        frame["keyword_clean"] = frame[keyword_col].map(normalize_keyword)
    for frame in frames:
        frame["location_clean"] = frame[location_col].map(normalize_location)

    # 2) Social-token counts must come from the RAW text, before URLs/mentions are replaced
    raw_counts = [frame[text_col].map(extract_text_counters).apply(pd.Series) for frame in frames]

    # 3) Cleaned text
    for frame in frames:
        frame["text_clean"] = frame[text_col].map(normalize_text)

    # 4) Optional keyword tag in front of the cleaned text
    if add_keyword_to_text:
        def _with_keyword_tag(row):
            keyword, body = row["keyword_clean"], row["text_clean"]
            return f"[kw={keyword}] {body}" if keyword else body

        for frame in frames:
            frame["text_kw"] = frame[["keyword_clean", "text_clean"]].apply(_with_keyword_tag, axis=1)

    # 5) Character and word counts of the cleaned text
    for frame in frames:
        frame["text_len"] = frame["text_clean"].str.len()
    for frame in frames:
        frame["word_count"] = frame["text_clean"].str.split().str.len()

    # 6) Social-token counts and their presence flags
    for count_col in ["url_count", "mention_count", "hashtag_count"]:
        for frame, counts in zip(frames, raw_counts):
            frame[count_col] = counts[count_col].astype(int)
    flag_sources = [("has_url", "url_count"), ("has_mention", "mention_count"), ("has_hashtag", "hashtag_count")]
    for flag_col, count_col in flag_sources:
        for frame in frames:
            frame[flag_col] = (frame[count_col] > 0).astype(int)

    # 7) Presence flags for keyword / location
    for flag_col, clean_col in [("has_keyword", "keyword_clean"), ("has_location", "location_clean")]:
        for frame in frames:
            frame[flag_col] = (frame[clean_col] != "").astype(int)

    # 8) Keyword target encoding: out-of-fold on train, full-train mapping on test
    has_target = target_col in train.columns
    if has_target:
        oof_te, mapping, global_mean = _cv_target_encode(
            train["keyword_clean"], train[target_col],
            n_splits=n_splits, random_state=random_state
        )
        train["keyword_te"] = oof_te.values
        test["keyword_te"] = test["keyword_clean"].map(mapping).fillna(global_mean).astype(float)
    else:
        # No target available: there is nothing to encode against.
        mapping, global_mean = {}, np.nan
        for frame in frames:
            frame["keyword_te"] = np.nan

    # 9) Assemble the output column order
    id_cols_train = [col for col in ["id"] if col in train.columns]
    id_cols_test = [col for col in ["id"] if col in test.columns]
    target_cols = [target_col] if has_target else []

    base_cols = ["text_clean", "keyword_clean", "location_clean"]
    if add_keyword_to_text:
        base_cols.append("text_kw")

    feat_cols = [
        "text_len", "word_count",
        "url_count", "mention_count", "hashtag_count",
        "has_url", "has_mention", "has_hashtag",
        "has_location", "has_keyword",
        "keyword_te",
    ]

    train_out = train[id_cols_train + target_cols + base_cols + feat_cols].copy()
    test_out = test[id_cols_test + base_cols + feat_cols].copy()

    return train_out, test_out, mapping, global_mean


In [13]:
from pathlib import Path
# Input files are expected in ./data relative to the notebook's working
# directory; fail fast with a clear message if any of them is missing.
DATA_DIR = Path("data")
assert (DATA_DIR / "train.csv").exists(), "Expected data/train.csv to exist"
assert (DATA_DIR / "test.csv").exists(), "Expected data/test.csv to exist"
assert (DATA_DIR / "sample_submission.csv").exists(), "Expected data/sample_submission.csv to exist"

In [14]:
import os, warnings, logging
# NOTE: this silences ALL Python warnings and sub-ERROR log records for the rest
# of the session, which keeps the training output readable but can also hide
# genuine problems (deprecations, undefined-metric warnings from scikit-learn).
os.environ["PYTHONWARNINGS"] = "ignore"      # env-level suppression
warnings.filterwarnings("ignore")            # blanket ignore
logging.getLogger().setLevel(logging.ERROR)  # silence most logged warnings

In [16]:
# Load data
# Read the raw CSVs, then derive the cleaned text and tabular feature frames.
# build_features() also returns the keyword target-encoding mapping and the
# global positive rate it uses as a fallback.
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

train_fe, test_fe, kw_mapping, kw_global = build_features(
    train, test,
    add_keyword_to_text=True,
    n_splits=5,
    random_state=42,
)

In [28]:
# Train the hybrid model with the tuned hyper-parameters; "was ..." notes record
# the previous value of each setting that was changed.
model, artifacts = run_training(
    train_df=train_fe,
    test_df=test_fe,
    target_col="target",
    # text capacity/noise
    min_freq=3,            # was 2
    max_vocab_size=30000,
    max_len=50,            # was 60
    embedding_dim=128,
    hidden_size=112,       # was 128
    # regularization
    txt_dropout=0.45,      # was 0.3
    tab_hidden=64,
    tab_dropout=0.4,       # was 0.3
    fusion_dropout=0.45,   # was 0.3
    # optimization
    batch_size=64,
    lr=8e-4,               # was 1e-3 (weight_decay=1e-4 is applied to Adam inside run_training)
    epochs=10,
    early_stopping_patience=2,  # was 3; your peak comes early
    use_pos_weight=False,
    initial_threshold=0.5,
    feature_names=None,
    val_size=0.15,
    seed=42,
)

Using text column: text_kw
Train size: 6471 | Val size: 1142
Vocab size: 4346 (min_freq=3, max_size=30000)
Epoch 01/10 | train_loss=0.6110 train_f1~=0.5448 | val_loss=0.5266 val_f1=0.7168 (P=0.736, R=0.699) | thr=0.500
Epoch 02/10 | train_loss=0.5096 train_f1~=0.7025 | val_loss=0.4948 val_f1=0.7378 (P=0.694, R=0.788) | thr=0.290
Epoch 03/10 | train_loss=0.4670 train_f1~=0.7254 | val_loss=0.4703 val_f1=0.7445 (P=0.730, R=0.760) | thr=0.400
Epoch 04/10 | train_loss=0.4343 train_f1~=0.7651 | val_loss=0.4682 val_f1=0.7522 (P=0.799, R=0.711) | thr=0.480
Epoch 05/10 | train_loss=0.4026 train_f1~=0.7788 | val_loss=0.4852 val_f1=0.7513 (P=0.768, R=0.735) | thr=0.430
Epoch 06/10 | train_loss=0.3610 train_f1~=0.8083 | val_loss=0.4848 val_f1=0.7393 (P=0.739, R=0.739) | thr=0.340
Early stopping at epoch 6. Best val F1: 0.7522

Final Validation Metrics:
Threshold=0.480 Precision=0.7986 Recall=0.7108 F1=0.7522

Classification report (val):
              precision    recall  f1-score   support

     