In [None]:
import torch
import torch.nn as nn
class MetaTower(nn.Module):
    """
    Small MLP that encodes a metadata feature vector into an embedding,
    with an extra linear head for metadata-only binary classification.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int, default 32
        Width of the hidden layer.
    out_dim : int, default 16
        Size of the embedding returned by ``forward``.
    dropout : float, default 0.1
        Dropout probability applied after the first ReLU. Exposed as a
        parameter so this definition accepts the same arguments as the
        later ``MetaTower`` definition in this notebook.
    """

    def __init__(self, in_dim, hidden_dim=32, out_dim=16, dropout=0.1):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, out_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(out_dim, 1)  # for metadata-only training

    def forward(self, x, return_logits=False):
        """
        Encode ``x``.

        Returns the embedding ``h`` alone, or ``(h, logits)`` when
        ``return_logits`` is True, where ``logits = classifier(h)``.
        """
        h = self.mlp(x)  # h_meta
        if not return_logits:
            # Skip the classifier head entirely when its output is not requested.
            return h
        return h, self.classifier(h)

In [2]:
import numpy as np
import pandas as pd
import re, ast, hashlib
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# ============================================================
# 1. Parsing + author_pseudo_id + basic metadata columns
# ============================================================

def parse_tweets_meta(path, expect_label=True):
    """
    Parse a raw tweets JSONL file into a flat per-tweet metadata DataFrame.

    Every line of the file is read as one JSON record and nested objects are
    flattened into dot-separated columns (e.g. ``user.statuses_count``).

    Parameters
    ----------
    path : str or Path
        Location of the JSONL file (one JSON object per line).
    expect_label : bool, default True
        When True, copy the ``label`` column into the result if the file has
        one; otherwise print a warning and return features only.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns:
        author_pseudo_id, full_text, challenge_id (only if present in the
        file), n_hashtags, n_mentions, n_urls, has_media,
        user.favourites_count, user.statuses_count, user.listed_count,
        user.default_profile, user.geo_enabled, is_reply, text_len,
        source_app, and label (see ``expect_label``).
    """
    path = Path(path)
    df = pd.read_json(path, lines=True)
    df = pd.json_normalize(df.to_dict(orient="records"), sep=".")

    # Guarantee every column used below exists, even if no record carried it.
    required_cols = [
        "text", "extended_tweet.full_text", "source",
        "entities.hashtags", "entities.user_mentions", "entities.urls",
        "extended_entities.media",
        "user.created_at", "user.description", "user.url", "user.location",
        "user.favourites_count", "user.statuses_count", "user.listed_count",
        "user.default_profile", "user.geo_enabled",
        "in_reply_to_status_id", "in_reply_to_user_id",
    ]
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    # -------- full text ----------
    # Prefer the extended text, fall back to the short text, then to "".
    df["full_text"] = df["extended_tweet.full_text"].fillna(df["text"]).fillna("")
    df["text_len"] = df["full_text"].str.len()

    # -------- counts / list lengths ----------
    def safe_len(x):
        """Length of a list, or of a stringified list/tuple; 0 when unusable."""
        if isinstance(x, list):
            return len(x)
        if isinstance(x, str):
            try:
                v = ast.literal_eval(x)
                # A string that parses to a non-sequence counts as one item.
                return len(v) if isinstance(v, (list, tuple)) else 1
            except Exception:
                return 0
        return 0

    df["n_hashtags"] = df["entities.hashtags"].apply(safe_len)
    df["n_mentions"] = df["entities.user_mentions"].apply(safe_len)
    df["n_urls"]     = df["entities.urls"].apply(safe_len)

    # media flag
    df["has_media"] = df["extended_entities.media"].apply(lambda x: safe_len(x) > 0)

    # -------- author_pseudo_id ----------
    # MD5 of four profile fields joined by "|". Missing values are
    # stringified as-is, so records lacking all four fields share one id.
    # Only the four key columns are iterated, instead of materialising a
    # Series for every full row via df.apply(axis=1).
    key_cols = ["user.created_at", "user.description", "user.url", "user.location"]
    df["author_pseudo_id"] = [
        hashlib.md5("|".join(map(str, parts)).encode("utf-8")).hexdigest()
        for parts in zip(*(df[c] for c in key_cols))
    ]

    # -------- structural flags ----------
    df["is_reply"] = (
        df["in_reply_to_status_id"].notna() |
        df["in_reply_to_user_id"].notna()
    )

    # -------- source_app (HTML → readable name) ----------
    def extract_source(x):
        """Return the text between the first '>' and '<'; non-strings map to 'Unknown'."""
        if not isinstance(x, str):
            return "Unknown"
        m = re.search(r'>([^<]+)<', x)
        return m.group(1) if m else x

    df["source_app"] = df["source"].apply(extract_source)

    # keep only metadata we care about:
    keep_cols = ["author_pseudo_id", "full_text"]
    if "challenge_id" in df.columns:
        keep_cols.append("challenge_id")
    keep_cols += [
        "n_hashtags", "n_mentions", "n_urls", "has_media",
        "user.favourites_count", "user.statuses_count", "user.listed_count",
        "user.default_profile", "user.geo_enabled",
        "is_reply", "text_len", "source_app",
    ]

    out = df[keep_cols].copy()

    # attach label if present
    if expect_label:
        if "label" in df.columns:
            out["label"] = df["label"]
        else:
            print("Warning: 'label' not found in this file; returning features only.")

    return out


# ============================================================
# 2. Feature engineering for metadata features
# ============================================================

def build_meta_features(df, fit_stats=None, src2idx=None, K=15, clip_quantile=0.995):
    """
    Build metadata features on a copy of ``df``:
      - log_status (from user.statuses_count)
      - log_listed (from user.listed_count)
      - log_fav    (from user.favourites_count)
      - n_mentions, n_hashtags as integers
      - booleans as 0/1
      - source_idx: bucketized from source_app with the top-K sources

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Missing input columns are filled
        with neutral defaults (0 / False / "Unknown").
    fit_stats : dict or None
        Upper clipping caps keyed ``"<column>_p99"``. When None they are
        computed from ``df`` (pass the TRAIN frame) and returned for reuse.
    src2idx : dict or None
        Mapping source_app -> index (1..K). When None it is built from the K
        most frequent sources in ``df``. Index 0 is reserved for "Other".
    K : int, default 15
        Number of sources that get their own index when ``src2idx`` is built.
    clip_quantile : float, default 0.995
        Quantile used for the caps when ``fit_stats`` is None.
        NOTE: the stats keys keep their historical ``_p99`` suffix although
        the default quantile is 0.995, not 0.99; the key names are preserved
        so previously saved ``fit_stats`` dicts remain usable.

    Returns
    -------
    (pandas.DataFrame, dict, dict)
        The feature frame, the ``fit_stats`` used, and the ``src2idx`` used.
    """
    df = df.copy()

    count_cols = ["user.statuses_count", "user.listed_count", "user.favourites_count"]

    # ensure numeric columns exist
    for col in count_cols:
        if col not in df.columns:
            df[col] = 0

    # clipping caps, learned only when the caller did not supply them
    if fit_stats is None:
        fit_stats = {}
        for col in count_cols:
            fit_stats[f"{col}_p99"] = float(df[col].quantile(clip_quantile))

    # log transforms with clipping to [0, cap]
    log_features = (
        ("log_status", "user.statuses_count"),
        ("log_listed", "user.listed_count"),
        ("log_fav", "user.favourites_count"),
    )
    for feat, col in log_features:
        capped = df[col].fillna(0).clip(lower=0, upper=fit_stats[f"{col}_p99"])
        df[feat] = np.log1p(capped)

    # counts (defaulted to 0 when absent, like every other input column)
    for ccol in ["n_mentions", "n_hashtags"]:
        if ccol not in df.columns:
            df[ccol] = 0
        df[ccol] = df[ccol].fillna(0).astype(int)

    # booleans → 0/1
    for bcol in ["has_media", "is_reply", "user.default_profile", "user.geo_enabled"]:
        if bcol not in df.columns:
            df[bcol] = False
        df[bcol] = df[bcol].fillna(False).astype(int)

    # ---- source_idx from source_app ----
    if "source_app" not in df.columns:
        df["source_app"] = "Unknown"

    if src2idx is None:
        top_src = df["source_app"].fillna("Unknown").value_counts().head(K).index.tolist()
        src2idx = {s: i + 1 for i, s in enumerate(top_src)}  # 0 reserved for "Other"

    # Sources outside the vocabulary map to NaN and then to 0 ("Other").
    df["source_idx"] = (
        df["source_app"]
        .fillna("Unknown")
        .map(src2idx)
        .fillna(0)
        .astype(int)
    )

    return df, fit_stats, src2idx


# ============================================================
# 3. Author-based split (no user overlap)
# ============================================================

def author_based_split(df, val_size=0.1, random_state=42):
    """
    Partition ``df`` into train and validation frames by author.

    The unique values of ``author_pseudo_id`` are split with
    ``train_test_split`` and every row follows its author, so no author
    appears in both frames. A short summary of the split is printed.

    Returns
    -------
    (train_df, val_df) : both with a fresh 0..n-1 index.
    """
    assert "author_pseudo_id" in df.columns, "author_pseudo_id column missing"

    authors = df["author_pseudo_id"].astype(str)

    # Split the author ids, not the rows.
    train_users, val_users = train_test_split(
        authors.unique(),
        test_size=val_size,
        random_state=random_state,
    )

    in_train = authors.isin(train_users)
    in_val = authors.isin(val_users)

    train_df = df[in_train].reset_index(drop=True)
    val_df = df[in_val].reset_index(drop=True)

    n_train_users = len(np.unique(authors[in_train]))
    n_val_users = len(np.unique(authors[in_val]))

    print("User-level split:")
    print("  #train tweets:", len(train_df))
    print("  #val tweets:  ", len(val_df))
    print("  #unique users train:", n_train_users)
    print("  #unique users val:  ", n_val_users)

    return train_df, val_df


# ============================================================
# 4. MetaTower (PyTorch MLP)
# ============================================================

class MetaTower(nn.Module):
    """
    Small MLP that encodes a metadata feature vector into an embedding,
    with an extra linear head for metadata-only binary classification.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int, default 32
        Width of the hidden layer.
    out_dim : int, default 16
        Size of the embedding returned by ``forward``.
    dropout : float, default 0.1
        Dropout probability applied after the first ReLU.
    """

    def __init__(self, in_dim, hidden_dim=32, out_dim=16, dropout=0.1):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, out_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(out_dim, 1)  # metadata-only training

    def forward(self, x, return_logits=False):
        """
        Encode ``x``.

        Returns the embedding ``h`` alone, or ``(h, logits)`` when
        ``return_logits`` is True, where ``logits = classifier(h)`` has one
        value per input row.
        """
        h = self.mlp(x)  # h_meta
        if not return_logits:
            # Skip the classifier head entirely when its output is not requested.
            return h
        return h, self.classifier(h)


# ============================================================
# 5. Training loop for MetaTower on best_with_seven
# ============================================================

def train_meta_tower(
    train_df,
    val_df,
    feature_cols,
    batch_size=512,
    lr=1e-3,
    n_epochs=10,
    dropout=0.1,
    hidden_dim=32,
    device=None,
    out_dim=16,
    seed=None,
):
    """
    Train a MetaTower on ``feature_cols`` and evaluate it on ``val_df``.

    Features are standardized with a StandardScaler fitted on the training
    frame only. The model is optimized with Adam on BCE-with-logits loss;
    train and validation loss are printed after every epoch, and accuracy
    (threshold 0.5) and ROC AUC on the validation frame are printed at the end.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame
        Must contain every column in ``feature_cols`` plus a ``label`` column.
    feature_cols : list of str
        Columns used as model inputs.
    batch_size, lr, n_epochs, dropout, hidden_dim
        Training and architecture hyper-parameters.
    device : str or None
        Torch device; defaults to "cuda" when available, else "cpu".
    out_dim : int, default 16
        Embedding size of the tower (previously fixed at 16).
    seed : int or None, default None
        When given, seeds torch's global RNG and the shuffling generator of
        the training loader before the model is built. None keeps the
        previous unseeded behaviour. Bit-exact repeatability on GPU is not
        guaranteed by seeding alone.

    Returns
    -------
    (model, scaler, metrics)
        The trained MetaTower, the fitted StandardScaler, and a dict with
        keys "accuracy" and "auc".
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # Optional seeding, done before any random draw (weight init, shuffling).
    shuffle_gen = None
    if seed is not None:
        torch.manual_seed(seed)
        shuffle_gen = torch.Generator().manual_seed(seed)

    # --- prepare numpy arrays ---
    X_train = train_df[feature_cols].values.astype(np.float32)
    X_val   = val_df[feature_cols].values.astype(np.float32)

    y_train = train_df["label"].astype(np.float32).values
    y_val   = val_df["label"].astype(np.float32).values

    # --- standardize features (fit on train only) ---
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_val_sc   = scaler.transform(X_val)

    # --- to tensors ---
    X_train_t = torch.from_numpy(X_train_sc)
    y_train_t = torch.from_numpy(y_train).unsqueeze(1)  # (N,1) to match the logits
    X_val_t   = torch.from_numpy(X_val_sc)
    y_val_t   = torch.from_numpy(y_val).unsqueeze(1)

    train_ds = TensorDataset(X_train_t, y_train_t)
    val_ds   = TensorDataset(X_val_t, y_val_t)

    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, generator=shuffle_gen
    )
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # --- model, loss, optimizer ---
    model = MetaTower(
        in_dim=len(feature_cols),
        hidden_dim=hidden_dim,
        out_dim=out_dim,
        dropout=dropout,
    ).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # --- training loop ---
    for epoch in range(1, n_epochs + 1):
        model.train()
        train_losses = []

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            _, logits = model(xb, return_logits=True)
            loss = criterion(logits, yb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        # quick validation loss each epoch (mean of per-batch means)
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                _, logits = model(xb, return_logits=True)
                loss = criterion(logits, yb)
                val_losses.append(loss.item())

        print(
            f"Epoch {epoch:02d} | "
            f"train_loss={np.mean(train_losses):.4f} | "
            f"val_loss={np.mean(val_losses):.4f}"
        )

    # --- final evaluation (accuracy + AUC on val) ---
    model.eval()
    with torch.no_grad():
        _, logits_val = model(X_val_t.to(device), return_logits=True)
        probs_val = torch.sigmoid(logits_val).cpu().numpy().ravel()
        preds_val = (probs_val >= 0.5).astype(int)
        y_true    = y_val.astype(int)

    acc = accuracy_score(y_true, preds_val)
    auc = roc_auc_score(y_true, probs_val)

    print("\n=== MetaTower (best_with_seven) performance on user-based val ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"AUC:      {auc:.4f}")

    return model, scaler, {"accuracy": acc, "auc": auc}


# ============================================================
# 6. Main: load data, build features, split, train MetaTower
# ============================================================

# ---- adjust path to your train.jsonl ----
train_path = "../../train.jsonl"   # <-- change if needed

# Step 1: flatten the raw JSONL into a metadata frame with author_pseudo_id.
raw_df = parse_tweets_meta(path=train_path, expect_label=True)

# Step 2: split by author FIRST so nothing fitted below sees validation users.
train_raw, val_raw = author_based_split(df=raw_df, val_size=0.1, random_state=42)

# Step 3: fit clipping stats and the source vocabulary on train, reuse them on val.
train_meta, stats, src2idx = build_meta_features(
    df=train_raw, fit_stats=None, src2idx=None, K=15
)
val_meta, _, _ = build_meta_features(
    df=val_raw, fit_stats=stats, src2idx=src2idx, K=15
)

# Step 4: class-balance sanity check on both splits.
print(
    "\nTrain label distribution:",
    train_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)
print(
    "\nVal label distribution:",
    val_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)

# Step 5: the seven-feature set used for the metadata tower.
best_with_seven = [
    "log_status", "log_listed", "user.default_profile", "is_reply",
    "n_mentions", "log_fav", "source_idx",
]

# Step 6: one training run, validated on unseen authors.
meta_model, meta_scaler, meta_metrics = train_meta_tower(
    train_df=train_meta,
    val_df=val_meta,
    feature_cols=best_with_seven,
    batch_size=128,
    lr=1e-3,
    n_epochs=10,
    dropout=0.1,
)

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64
Using device: cuda
Epoch 01 | train_loss=0.4495 | val_loss=0.4175
Epoch 02 | train_loss=0.4210 | val_loss=0.4148
Epoch 03 | train_loss=0.4180 | val_loss=0.4129
Epoch 04 | train_loss=0.4160 | val_loss=0.4117
Epoch 05 | train_loss=0.4154 | val_loss=0.4120
Epoch 06 | train_loss=0.4147 | val_loss=0.4114
Epoch 07 | train_loss=0.4141 | val_loss=0.4119
Epoch 08 | train_loss=0.4130 | val_loss=0.4099
Epoch 09 | train_loss=0.4129 | val_loss=0.4098
Epoch 10 | train_loss=0.4117 | val_loss=0.4115

=== MetaTower (best_with_seven) performance on user-based val ===
Accuracy: 0.8208
AUC:      0.8937


In [None]:
# Performing Grid Search -> hidden_dim=32

def grid_search_meta_tower(
    train_df,
    val_df,
    feature_cols,
    lr_list=(1e-2, 5e-3, 1e-3, 5e-4),
    n_epochs_list=(5, 10, 15),
    dropout_list=(0.0, 0.1, 0.2),
    batch_size=512,
    device=None,
):
    """
    Brute-force grid over (lr, n_epochs, dropout).

    Calls train_meta_tower once per combination and keeps the model and
    scaler of the run with the highest validation AUC (first one wins ties).

    Returns
    -------
    (best_model, best_scaler, best_result, results_df)
        ``best_result`` is the config/metrics dict of the winning run and
        ``results_df`` lists every run sorted by AUC, then accuracy,
        descending. With an empty grid the first three are None and
        ``results_df`` is an empty frame with the usual columns.
    """
    result_cols = ["lr", "n_epochs", "dropout", "accuracy", "auc"]
    results = []
    best_auc = -np.inf
    best_result = None
    best_model = None
    best_scaler = None

    for lr in lr_list:
        for n_epochs in n_epochs_list:
            for dp in dropout_list:
                print("\n" + "=" * 60)
                print(f"Trying config: lr={lr}, n_epochs={n_epochs}, dropout={dp}")
                print("=" * 60)

                model, scaler, metrics = train_meta_tower(
                    train_df=train_df,
                    val_df=val_df,
                    feature_cols=feature_cols,
                    batch_size=batch_size,
                    lr=lr,
                    n_epochs=n_epochs,
                    dropout=dp,
                    device=device,
                )

                res = {
                    "lr": lr,
                    "n_epochs": n_epochs,
                    "dropout": dp,
                    "accuracy": metrics["accuracy"],
                    "auc": metrics["auc"],
                }
                results.append(res)

                if res["auc"] > best_auc:
                    best_auc = res["auc"]
                    best_result = res
                    best_model = model
                    best_scaler = scaler

    # Explicit columns keep the sort keys present even when no run happened,
    # so an empty grid yields an empty summary instead of a KeyError.
    results_df = (
        pd.DataFrame(results, columns=result_cols)
        .sort_values(["auc", "accuracy"], ascending=False)
        .reset_index(drop=True)
    )

    print("\n=== Grid search summary (sorted by AUC, then accuracy) ===")
    print(results_df.head(10))
    print("\nBest config found:", best_result)

    return best_model, best_scaler, best_result, results_df


# ============================================================
# 6. Main: load data, build features, split, run grid search
# ============================================================

# ---- adjust path to your train.jsonl ----
train_path = "../../train.jsonl"   # <-- change if needed

# Parse the raw file into per-tweet metadata (includes author_pseudo_id).
raw_df = parse_tweets_meta(path=train_path, expect_label=True)

# Author-level split comes before any statistic is fitted (important!).
train_raw, val_raw = author_based_split(df=raw_df, val_size=0.1, random_state=42)

# Clipping stats and source vocabulary are learned on train and reused on val.
train_meta, stats, src2idx = build_meta_features(
    df=train_raw, fit_stats=None, src2idx=None, K=15
)
val_meta, _, _ = build_meta_features(
    df=val_raw, fit_stats=stats, src2idx=src2idx, K=15
)

# Sanity check: label proportions per split.
print(
    "\nTrain label distribution:",
    train_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)
print(
    "\nVal label distribution:",
    val_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)

# Feature set fed to the tower.
best_with_seven = [
    "log_status", "log_listed", "user.default_profile", "is_reply",
    "n_mentions", "log_fav", "source_idx",
]

# Grid search over lr / n_epochs / dropout instead of a single training run.
best_model, best_scaler, best_result, grid_df = grid_search_meta_tower(
    train_df=train_meta,
    val_df=val_meta,
    feature_cols=best_with_seven,
    lr_list=(1e-2, 5e-3, 1e-3, 5e-4),
    n_epochs_list=(5, 10, 15),
    dropout_list=(0.0, 0.1, 0.2),
    batch_size=256,
)

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64

Trying config: lr=0.01, n_epochs=5, dropout=0.0
Using device: cuda
Epoch 01 | train_loss=0.4218 | val_loss=0.4126
Epoch 02 | train_loss=0.4116 | val_loss=0.4092
Epoch 03 | train_loss=0.4090 | val_loss=0.4108
Epoch 04 | train_loss=0.4062 | val_loss=0.4115
Epoch 05 | train_loss=0.4054 | val_loss=0.4134

=== MetaTower (best_with_seven) performance on user-based val ===
Accuracy: 0.8205
AUC:      0.8940

Trying config: lr=0.01, n_epochs=5, dropout=0.1
Using device: cuda
Epoch 01 | train_loss=0.4266 | val_loss=0.4134
Epoch 02 | train_loss=0.4167 | val_loss=0.4150
Epoch 03 | train_loss=0.4150 | val_loss=0.4113
Epoch 04 | train_loss=0.4125 | val_loss=0.4139
Epoch 05 | train_loss=0.4121 | v

In [None]:
# Performing Grid Search

def grid_search_meta_tower(
    train_df,
    val_df,
    feature_cols,
    lr_list=(1e-2, 5e-3, 1e-3, 5e-4),
    n_epochs_list=(5, 10, 15),
    dropout_list=(0.0, 0.1, 0.2),
    hidden_dim_list=(16, 32, 64),
    batch_size=256,
    device=None,
):
    """
    Brute-force grid over (lr, hidden_dim, n_epochs, dropout).

    Calls train_meta_tower once per combination and keeps the model and
    scaler of the run with the highest validation AUC (first one wins ties).

    Returns
    -------
    (best_model, best_scaler, best_result, results_df)
        ``best_result`` is the config/metrics dict of the winning run and
        ``results_df`` lists every run sorted by AUC, then accuracy,
        descending. With an empty grid the first three are None and
        ``results_df`` is an empty frame with the usual columns.
    """
    result_cols = ["lr", "n_epochs", "dropout", "hidden_dim", "accuracy", "auc"]
    results = []
    best_auc = -np.inf
    best_result = None
    best_model = None
    best_scaler = None

    for lr in lr_list:
        for hidden_dim in hidden_dim_list:
            for n_epochs in n_epochs_list:
                for dp in dropout_list:
                    print("\n" + "=" * 60)
                    print(f"Trying config: lr={lr}, n_epochs={n_epochs}, dropout={dp}, hidden_dim={hidden_dim}")
                    print("=" * 60)

                    model, scaler, metrics = train_meta_tower(
                        train_df=train_df,
                        val_df=val_df,
                        feature_cols=feature_cols,
                        batch_size=batch_size,
                        lr=lr,
                        n_epochs=n_epochs,
                        dropout=dp,
                        hidden_dim=hidden_dim,
                        device=device,
                    )

                    res = {
                        "lr": lr,
                        "n_epochs": n_epochs,
                        "dropout": dp,
                        "hidden_dim": hidden_dim,
                        "accuracy": metrics["accuracy"],
                        "auc": metrics["auc"],
                    }
                    results.append(res)

                    if res["auc"] > best_auc:
                        best_auc = res["auc"]
                        best_result = res
                        best_model = model
                        best_scaler = scaler

    # Explicit columns keep the sort keys present even when no run happened,
    # so an empty grid yields an empty summary instead of a KeyError.
    results_df = (
        pd.DataFrame(results, columns=result_cols)
        .sort_values(["auc", "accuracy"], ascending=False)
        .reset_index(drop=True)
    )

    print("\n=== Grid search summary (sorted by AUC, then accuracy) ===")
    print(results_df.head(10))
    print("\nBest config found:", best_result)

    return best_model, best_scaler, best_result, results_df


# ============================================================
# 6. Main: load data, build features, split, run grid search
# ============================================================

# ---- adjust path to your train.jsonl ----
train_path = "../../train.jsonl"   # <-- change if needed

# Raw JSONL -> flat metadata frame with author_pseudo_id.
raw_df = parse_tweets_meta(path=train_path, expect_label=True)

# Split on authors before fitting anything (important!).
train_raw, val_raw = author_based_split(df=raw_df, val_size=0.1, random_state=42)

# Fit feature statistics on train only; apply the same ones to val.
train_meta, stats, src2idx = build_meta_features(
    df=train_raw, fit_stats=None, src2idx=None, K=15
)
val_meta, _, _ = build_meta_features(
    df=val_raw, fit_stats=stats, src2idx=src2idx, K=15
)

# Label balance of each split.
print(
    "\nTrain label distribution:",
    train_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)
print(
    "\nVal label distribution:",
    val_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)

# Seven-feature input set.
best_with_seven = [
    "log_status", "log_listed", "user.default_profile", "is_reply",
    "n_mentions", "log_fav", "source_idx",
]

# Reduced grid that also varies the hidden width.
best_model, best_scaler, best_result, grid_df = grid_search_meta_tower(
    train_df=train_meta,
    val_df=val_meta,
    feature_cols=best_with_seven,
    lr_list=(1e-2, 5e-3, 1e-3),
    n_epochs_list=(5, 10),
    dropout_list=(0.0, 0.1),
    hidden_dim_list=(16, 64),
    batch_size=256,
)

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64

Trying config: lr=0.01, n_epochs=5, dropout=0.0
Using device: cuda
Epoch 01 | train_loss=0.4238 | val_loss=0.4170
Epoch 02 | train_loss=0.4130 | val_loss=0.4151
Epoch 03 | train_loss=0.4104 | val_loss=0.4162
Epoch 04 | train_loss=0.4082 | val_loss=0.4117
Epoch 05 | train_loss=0.4074 | val_loss=0.4080

=== MetaTower (best_with_seven) performance on user-based val ===
Accuracy: 0.8211
AUC:      0.8950

Trying config: lr=0.01, n_epochs=5, dropout=0.1
Using device: cuda
Epoch 01 | train_loss=0.4339 | val_loss=0.4165
Epoch 02 | train_loss=0.4178 | val_loss=0.4173
Epoch 03 | train_loss=0.4162 | val_loss=0.4164
Epoch 04 | train_loss=0.4154 | val_loss=0.4142
Epoch 05 | train_loss=0.4140 | v

In [3]:
# Performing Grid Search

def grid_search_meta_tower(
    train_df,
    val_df,
    feature_cols,
    lr_list=(1e-2, 5e-3, 1e-3, 5e-4),
    n_epochs_list=(5, 10, 15),
    dropout_list=(0.0, 0.1, 0.2),
    hidden_dim_list=(16, 32, 64),
    batch_size=256,
    device=None,
):
    """
    Brute-force grid over (lr, hidden_dim, n_epochs, dropout).

    Reuses train_meta_tower for every combination and tracks the run with
    the highest validation AUC; on ties the earliest run is kept.

    Returns
    -------
    (best_model, best_scaler, best_result, results_df)
        Model, scaler and config/metrics dict of the best run, plus a frame
        of all runs sorted by AUC, then accuracy, descending. If any of the
        grid lists is empty, no run happens: the first three values are None
        and ``results_df`` is empty but still has its columns.
    """
    summary_cols = ["lr", "n_epochs", "dropout", "hidden_dim", "accuracy", "auc"]
    results = []
    best_auc = -np.inf
    best_result = None
    best_model = None
    best_scaler = None

    for lr in lr_list:
        for hidden_dim in hidden_dim_list:
            for n_epochs in n_epochs_list:
                for dp in dropout_list:
                    print("\n" + "=" * 60)
                    print(f"Trying config: lr={lr}, n_epochs={n_epochs}, dropout={dp}, hidden_dim={hidden_dim}")
                    print("=" * 60)

                    model, scaler, metrics = train_meta_tower(
                        train_df=train_df,
                        val_df=val_df,
                        feature_cols=feature_cols,
                        batch_size=batch_size,
                        lr=lr,
                        n_epochs=n_epochs,
                        dropout=dp,
                        hidden_dim=hidden_dim,
                        device=device,
                    )

                    acc = metrics["accuracy"]
                    auc = metrics["auc"]

                    res = {
                        "lr": lr,
                        "n_epochs": n_epochs,
                        "dropout": dp,
                        "hidden_dim": hidden_dim,
                        "accuracy": acc,
                        "auc": auc,
                    }
                    results.append(res)

                    if auc > best_auc:
                        best_auc = auc
                        best_result = res
                        best_model = model
                        best_scaler = scaler

    # Naming the columns up front means sort_values finds "auc"/"accuracy"
    # even when `results` is empty (previously a KeyError).
    results_df = pd.DataFrame(results, columns=summary_cols).sort_values(
        ["auc", "accuracy"], ascending=False
    ).reset_index(drop=True)

    print("\n=== Grid search summary (sorted by AUC, then accuracy) ===")
    print(results_df.head(10))
    print("\nBest config found:", best_result)

    return best_model, best_scaler, best_result, results_df


# ============================================================
# 6. Main: load data, build features, split, run grid search
# ============================================================

# ---- adjust path to your train.jsonl ----
train_path = "../../train.jsonl"   # <-- change if needed

# (1) Parse tweets into metadata rows carrying author_pseudo_id.
raw_df = parse_tweets_meta(path=train_path, expect_label=True)

# (2) Author-disjoint split, done before any statistic is computed (important!).
train_raw, val_raw = author_based_split(df=raw_df, val_size=0.1, random_state=42)

# (3) Feature engineering: stats fitted on train, then applied unchanged to val.
train_meta, stats, src2idx = build_meta_features(
    df=train_raw, fit_stats=None, src2idx=None, K=15
)
val_meta, _, _ = build_meta_features(
    df=val_raw, fit_stats=stats, src2idx=src2idx, K=15
)

# (4) Label proportions in each split.
print(
    "\nTrain label distribution:",
    train_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)
print(
    "\nVal label distribution:",
    val_meta["label"].value_counts(normalize=True).rename("proportion"),
    sep="\n",
)

# (5) Input features for the tower.
best_with_seven = [
    "log_status", "log_listed", "user.default_profile", "is_reply",
    "n_mentions", "log_fav", "source_idx",
]

# (6) Grid search over the two larger learning rates and hidden widths.
best_model, best_scaler, best_result, grid_df = grid_search_meta_tower(
    train_df=train_meta,
    val_df=val_meta,
    feature_cols=best_with_seven,
    lr_list=(1e-2, 5e-3),
    n_epochs_list=(5, 10, 15),
    dropout_list=(0.0, 0.1),
    hidden_dim_list=(32, 64),
    batch_size=256,
)

User-level split:
  #train tweets: 139426
  #val tweets:   15488
  #unique users train: 44158
  #unique users val:   4907

Train label distribution:
label
0    0.534556
1    0.465444
Name: proportion, dtype: float64

Val label distribution:
label
0    0.525762
1    0.474238
Name: proportion, dtype: float64

Trying config: lr=0.01, n_epochs=5, dropout=0.0, hidden_dim=32
Using device: cuda
Epoch 01 | train_loss=0.4236 | val_loss=0.4163
Epoch 02 | train_loss=0.4128 | val_loss=0.4183
Epoch 03 | train_loss=0.4093 | val_loss=0.4097
Epoch 04 | train_loss=0.4063 | val_loss=0.4073
Epoch 05 | train_loss=0.4045 | val_loss=0.4125

=== MetaTower (best_with_seven) performance on user-based val ===
Accuracy: 0.8235
AUC:      0.8964

Trying config: lr=0.01, n_epochs=5, dropout=0.1, hidden_dim=32
Using device: cuda
Epoch 01 | train_loss=0.4263 | val_loss=0.4191
Epoch 02 | train_loss=0.4154 | val_loss=0.4129
Epoch 03 | train_loss=0.4141 | val_loss=0.4138
Epoch 04 | train_loss=0.4134 | val_loss=0.4113
Ep

In [9]:
# Retrain one fixed configuration and show its validation metrics.
meta_model, meta_scaler, meta_metrics = train_meta_tower(
    train_df=train_meta,
    val_df=val_meta,
    feature_cols=best_with_seven,
    batch_size=128,
    lr=5e-3,
    n_epochs=10,
    dropout=0.0,
    hidden_dim=32,
)
print(meta_metrics)

Using device: cuda
Epoch 01 | train_loss=0.4228 | val_loss=0.4159
Epoch 02 | train_loss=0.4112 | val_loss=0.4129
Epoch 03 | train_loss=0.4088 | val_loss=0.4111
Epoch 04 | train_loss=0.4067 | val_loss=0.4059
Epoch 05 | train_loss=0.4044 | val_loss=0.4029
Epoch 06 | train_loss=0.4026 | val_loss=0.4060
Epoch 07 | train_loss=0.4017 | val_loss=0.4038
Epoch 08 | train_loss=0.4012 | val_loss=0.4029
Epoch 09 | train_loss=0.4003 | val_loss=0.4043
Epoch 10 | train_loss=0.3996 | val_loss=0.4042

=== MetaTower (best_with_seven) performance on user-based val ===
Accuracy: 0.8244
AUC:      0.8965
{'accuracy': 0.8244447314049587, 'auc': 0.8965374796847402}


In [10]:
def save_meta_checkpoint(path, model, scaler, feature_cols, stats, src2idx, config, metrics):
    """
    Bundle the metadata tower and its preprocessing artefacts into one file.

    The file written by ``torch.save`` is a dict with the keys
    model_state_dict, scaler, feature_cols, stats, src2idx, config and
    metrics. Only the model's ``state_dict()`` is stored, not the module
    itself; every other argument is stored as passed in.
    """
    payload = dict(
        model_state_dict=model.state_dict(),
        scaler=scaler,
        feature_cols=feature_cols,
        stats=stats,
        src2idx=src2idx,
        config=config,
        metrics=metrics,
    )
    torch.save(payload, path)
    print(f"Saved metadata tower checkpoint to: {path}")


# Hyper-parameters of the run that produced `meta_model` / `meta_metrics`
# (the single train_meta_tower call with lr=5e-3, hidden_dim=32, dropout=0.0).
# Config, weights, scaler and metrics must all describe the same run: the
# reload step rebuilds MetaTower from this config, so a mismatch with the
# saved weights would break load_state_dict or mislabel the checkpoint.
best_config = {
    "hidden_dim": 32,
    "out_dim": 16,   # train_meta_tower's default embedding size
    "lr": 5e-3,
    "n_epochs": 10,
    "dropout": 0.0,
}

save_meta_checkpoint(
    path="meta_tower_best.pt",
    model=meta_model,
    scaler=meta_scaler,
    feature_cols=best_with_seven,
    stats=stats,
    src2idx=src2idx,
    config=best_config,
    metrics=meta_metrics,
)

Saved metadata tower checkpoint to: meta_tower_best.pt


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# The checkpoint stores a pickled scikit-learn scaler next to the tensors, so
# it cannot be read with torch.load's restricted weights-only unpickler.
# weights_only=False performs a full unpickle, which can execute arbitrary
# code: only load checkpoint files you created yourself or otherwise trust.
ckpt = torch.load("meta_tower_best.pt", map_location=device, weights_only=False)

# Preprocessing artefacts needed to rebuild the exact training-time features.
feature_cols = ckpt["feature_cols"]
stats        = ckpt["stats"]
src2idx      = ckpt["src2idx"]
config       = ckpt["config"]
scaler       = ckpt["scaler"]

# Rebuild the architecture from the saved config before loading the weights.
model = MetaTower(
    in_dim=len(feature_cols),
    hidden_dim=config["hidden_dim"],
    out_dim=config["out_dim"],
    dropout=config["dropout"],
).to(device)

model.load_state_dict(ckpt["model_state_dict"])
model.eval()  # inference mode: disables dropout