In [2]:
# sweep_train_and_submit.py
# Hyperparameter sweep for your NER trainer + per-run submission files and a master CSV log.

import os, sys, ast, json, math, inspect, hashlib, random, time, gc
from datetime import datetime
from typing import List, Tuple, Dict, Any
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import GroupShuffleSplit
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForTokenClassification,
    DataCollatorForTokenClassification, TrainingArguments, Trainer,
    EarlyStoppingCallback, TrainerCallback
)
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score

# ======================== CONSTANTS (can be overridden per-run) ========================
DEFAULTS = dict(
    NOISY_PATH  = "train.csv",
    CLEAN_PATH  = "train-clean-volperc.csv",
    MODEL_NAME  = "bert-base-multilingual-cased",

    VAL_NOISY_FRAC = 0.12,
    VAL_CLEAN_FRAC = 0.05,
    ENFORCE_VAL_MIX = True,
    VAL_TARGET_NOISY_RATIO = 0.8,   # ≈80% noisy in validation

    ENTITY_TYPES = ["TYPE", "BRAND", "VOLUME", "PERCENT"],
    MAX_LEN   = 128,
    EPOCHS    = 6,
    BATCH     = 16,
    LR        = 1.5e-5,
    SEED      = 42,
    LABEL_SMOOTHING = 0.05,
    DROPOUT_H = 0.2,
    DROPOUT_A = 0.2,
    MAX_WEIGHT = 3.0,
    MIN_WEIGHT = 0.5,
    GRAD_NORM_CLIP = 1.0,

    OUTPUT_ROOT = "./runs",           # each run gets its own subdir here
    SUBMIT_DIR  = "./submissions-final",    # all submission files collected here
    SUBMISSION_PATH = "submission.csv",
    BATCH_INFER = 64,
)

BIO_LABELS_CACHE = {}

# ======================== UTILS ========================
def set_all_seeds(seed: int):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

def safe_mkdir(p: str):
    os.makedirs(p, exist_ok=True)

def now_tag():
    return datetime.now().strftime("%Y%m%d-%H%M%S")

def label_set(entity_types: List[str]) -> List[str]:
    # cache per entity_types signature
    key = tuple(entity_types)
    if key not in BIO_LABELS_CACHE:
        BIO_LABELS_CACHE[key] = ["O"] + [f"B-{e}" for e in entity_types] + [f"I-{e}" for e in entity_types]
    return BIO_LABELS_CACHE[key]

def read_span_csv(path: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(path, sep=';', engine="python", dtype=str, keep_default_na=False)
    except Exception:
        df = None
        for sep in [";", "|", "\t", ","]:
            try:
                df = pd.read_csv(path, sep=sep, engine="python", dtype=str, keep_default_na=False); break
            except Exception:
                pass
        if df is None:
            rows=[]
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    line=line.rstrip("\n")
                    for sep in ["\t",";","|",","]:
                        if sep in line:
                            a,b = line.split(sep,1); rows.append({"sample":a,"annotation":b}); break
                    else:
                        rows.append({"sample":line,"annotation":"[]"})
            df = pd.DataFrame(rows)
    cols = {c.lower(): c for c in df.columns}
    s_col = cols.get("sample", list(df.columns)[0])
    a_col = cols.get("annotation", list(df.columns)[1] if len(df.columns)>1 else None)
    if a_col is None: raise ValueError(f"{path}: need sample;annotation")
    df = df[[s_col,a_col]].rename(columns={s_col:"sample", a_col:"annotation"}).astype(str)
    def parse_ann(s):
        s=(s or "").strip()
        if not s or s.lower()=="nan": return []
        try:
            arr = ast.literal_eval(s.replace("’","'").replace("“",'"').replace("”",'"'))
            return [(int(a),int(b),str(t)) for (a,b,t) in arr]
        except Exception: return []
    df["annotation"]=df["annotation"].apply(parse_ann)
    return df

def split_with_offsets(text: str):
    words=[]; i=0; n=len(text)
    while i<n:
        if text[i].isspace(): i+=1; continue
        s=i
        while i<n and not text[i].isspace(): i+=1
        e=i; words.append((text[s:e], s, e))
    return words

def char_spans_to_bio_words(text: str, spans, ENTITY_TYPES, BIO_LABELS):
    words = split_with_offsets(text)
    labels = ["O"]*len(words)
    norm=[]
    for (s,e,tag) in spans:
        tag=str(tag)
        if tag=="O": continue
        if "-" in tag:
            bi,et = tag.split("-",1); et=et.upper(); bi="B" if bi not in ("B","I") else bi
            if et in ENTITY_TYPES: norm.append((s,e,bi,et))
        else:
            et=tag.upper()
            if et in ENTITY_TYPES: norm.append((s,e,"B",et))
    norm.sort(key=lambda x:(x[0],x[1]))
    for (s,e,bi,et) in norm:
        hits=[]
        for wi,(_,ws,we) in enumerate(words):
            if not (we<=s or ws>=e): hits.append(wi)
        if not hits: continue
        labels[hits[0]] = f"B-{et}"
        for wi in hits[1:]: labels[wi] = f"I-{et}"
    return [w for (w,_,__) in words], [l if l in BIO_LABELS else "O" for l in labels]

def to_sequences(df: pd.DataFrame, source: str, ENTITY_TYPES=None, BIO_LABELS=None):
    rows=[]
    for _,r in df.iterrows():
        toks, bio = char_spans_to_bio_words(r["sample"], r["annotation"], ENTITY_TYPES, BIO_LABELS)
        rows.append({"tokens":toks, "labels":bio, "sample":r["sample"], "source":source})
    return pd.DataFrame(rows)

def text_group(text: str) -> str:
    t=''.join(ch.lower() for ch in text if ch.isalpha() or ch.isspace())
    return hashlib.md5(t.encode()).hexdigest()[:8]

def group_split(df: pd.DataFrame, frac: float, seed: int):
    if len(df)==0 or frac<=0: return df.index.values, np.array([],dtype=int)
    gss = GroupShuffleSplit(n_splits=1, test_size=frac, random_state=seed)
    tr, va = next(gss.split(df, groups=df["group"]))
    return tr, va

def compute_token_freqs(train_enc, n_labels: int):
    cnt=Counter()
    for row in train_enc:
        for l in row["labels"]:
            if l != -100: cnt[int(l)] += 1
    total = sum(cnt.values()) if cnt else 1
    freqs = np.ones(n_labels, dtype=np.float64) / n_labels
    for i in range(n_labels):
        if i in cnt: freqs[i] = max(cnt[i]/total, 1e-8)
    return freqs

def bio_to_entities(seq):
    ents=[]; cur=None; start=None
    for i,tag in enumerate(seq+["O"]):
        if not isinstance(tag,str) or "-" not in tag:
            if cur is not None: ents.append((cur,start,i)); cur=None; start=None
            continue
        bi,et = tag.split("-",1)
        if bi=="B":
            if cur is not None: ents.append((cur,start,i))
            cur, start = et, i
        elif bi=="I":
            if cur!=et:
                if cur is not None: ents.append((cur,start,i))
                cur, start = et, i
        else:
            if cur is not None: ents.append((cur,start,i))
            cur=None; start=None
    return ents

def prf_by_type(true_bio, pred_bio, entity_types):
    tp=defaultdict(int); fp=defaultdict(int); fn=defaultdict(int)
    for y_t, y_p in zip(true_bio, y_pred := pred_bio):
        ts={(t,s,e) for (t,s,e) in bio_to_entities(y_t) if t in entity_types}
        ps={(t,s,e) for (t,s,e) in bio_to_entities(y_p) if t in entity_types}
        for et in entity_types:
            T={x for x in ts if x[0]==et}; P={x for x in ps if x[0]==et}
            tp[et]+=len(T & P); fp[et]+=len(P-T); fn[et]+=len(T-P)
    rows=[]
    for et in entity_types:
        T,Fp,Fn = tp[et], fp[et], fn[et]
        prec = T/(T+Fp) if T+Fp>0 else 0.0
        rec  = T/(T+Fn) if T+Fn>0 else 0.0
        f1   = 2*prec*rec/(prec+rec) if (prec+rec)>0 else 0.0
        rows.append((et,T,Fp,Fn,prec,rec,f1))
    macro_f1 = float(np.mean([r[6] for r in rows])) if rows else 0.0
    return rows, macro_f1

def make_compute_metrics(id2label: Dict[int,str], entity_types: List[str]):
    def compute_metrics(p):
        preds = p[0] if isinstance(p, tuple) else p.predictions
        labels= p[1] if isinstance(p, tuple) else p.label_ids
        preds = np.argmax(preds, axis=-1)
        y_true, y_pred = [], []
        for pr, lb in zip(preds, labels):
            t,p = [],[]
            for pi, li in zip(pr, lb):
                if li == -100: continue
                t.append(id2label[int(li)]); p.append(id2label[int(pi)])
            y_true.append(t); y_pred.append(p)
        res = {
            "precision_token": precision_score(y_true, y_pred),
            "recall_token":    recall_score(y_true, y_pred),
            "f1_token":        f1_score(y_true, y_pred),
            "accuracy_token":  accuracy_score(y_true, y_pred),
        }
        rows, macro_f1 = prf_by_type(y_true, y_pred, entity_types)
        res["macro_f1_entity"] = macro_f1
        header = "Entity-level (STRICT spans):\nType       TP   FP   FN   Prec     Rec      F1"
        lines=[header]+[f"{et:<10} {tp_:<4} {fp_:<4} {fn_:<4} {pr:>6.3f}  {rc:>6.3f}  {f1v:>6.3f}"
                        for (et,tp_,fp_,fn_,pr,rc,f1v) in rows]
        lines.append(f"\nMacro-F1 (over {', '.join(entity_types)}): {macro_f1:.4f}")
        res["entity_report_text"] = "\n".join(lines)
        return res
    return compute_metrics

def samplewise_label_counts(train_sequences_df: pd.DataFrame, label2id: Dict[str,int]) -> Counter:
    c = Counter()
    for _, row in train_sequences_df.iterrows():
        seen = set(l for l in row["labels"] if l in label2id)
        for lab in seen:
            c[label2id[lab]] += 1
    return c

def compute_label_weights_samplewise(train_sequences_df: pd.DataFrame, label2id: Dict[str,int],
                                     min_w: float, max_w: float) -> torch.Tensor:
    n_samples = max(len(train_sequences_df), 1)
    counts = samplewise_label_counts(train_sequences_df, label2id)
    w = torch.ones(len(label2id), dtype=torch.float32)
    for lab, idx in label2id.items():
        k = counts.get(idx, 0)
        if k == 0:
            w[idx] = max_w
        else:
            freq = k / n_samples
            inv  = 1.0 / max(freq, 1e-8)
            w[idx] = float(min(max_w, max(min_w, inv)))
    w = w / w.mean()
    return w

class WeightedSmoothedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, label_smoothing=0.0, **kwargs):
        self._class_weights = class_weights
        self._label_smoothing = float(label_smoothing or 0.0)
        super().__init__(*args, **kwargs)
        self._ce = None
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs); logits=outputs.logits
        if self._ce is None:
            cw = self._class_weights.to(logits.device) if self._class_weights is not None else None
            try:
                self._ce = nn.CrossEntropyLoss(weight=cw, ignore_index=-100, label_smoothing=self._label_smoothing)
            except TypeError:
                self._ce = nn.CrossEntropyLoss(weight=cw, ignore_index=-100)
        loss = self._ce(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def build_training_args(args_over: Dict[str,Any], output_dir: str, grad_norm_clip: float, epochs: int, lr: float, batch: int, seed: int):
    sig = inspect.signature(TrainingArguments.__init__).parameters
    def has(k): return k in sig
    args = dict(
        output_dir=output_dir,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=0.01,
        seed=seed,
        gradient_accumulation_steps=1,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
    )
    if has("per_device_train_batch_size"): args["per_device_train_batch_size"]=batch
    if has("per_device_eval_batch_size"):  args["per_device_eval_batch_size"]=batch
    if has("evaluation_strategy"): args["evaluation_strategy"] = "epoch"
    if has("eval_strategy"):       args["eval_strategy"]       = "epoch"
    if has("save_strategy"):       args["save_strategy"]       = "epoch"
    if has("logging_strategy"):    args["logging_strategy"]    = "steps"
    if has("save_total_limit"):         args["save_total_limit"]=2
    if has("load_best_model_at_end"):   args["load_best_model_at_end"]=True
    if has("metric_for_best_model"):    args["metric_for_best_model"]="macro_f1_entity"
    if has("greater_is_better"):        args["greater_is_better"]=True
    if has("warmup_ratio"):             args["warmup_ratio"]=0.1
    if has("report_to"):                args["report_to"]="none"
    if has("max_grad_norm"):            args["max_grad_norm"]=grad_norm_clip
    args.update(args_over or {})
    return TrainingArguments(**args)

def tokenize_and_align_factory(tokenizer, label2id, max_len: int):
    def tokenize_and_align(examples):
        enc = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True, max_length=max_len)
        new_labels=[]
        for i,labs in enumerate(examples["labels"]):
            word_ids = enc.word_ids(batch_index=i)
            prev=None; ids=[]
            for wi in word_ids:
                if wi is None:
                    ids.append(-100)
                else:
                    if wi!=prev:
                        lab = labs[wi] if wi < len(labs) else "O"
                        ids.append(label2id.get(lab, label2id["O"]))
                    else:
                        ids.append(-100)
                    prev=wi
            new_labels.append(ids)
        enc["labels"]=new_labels
        return enc
    return tokenize_and_align

def read_submission(path: str) -> pd.DataFrame:
    df = None
    for sep in [";", ",", "\t", "|"]:
        try:
            tmp = pd.read_csv(path, sep=sep, engine="python", dtype=str, keep_default_na=False)
            if len(tmp) and tmp.shape[1] >= 1:
                df = tmp
                break
        except Exception:
            pass
    if df is None:
        raise ValueError(f"Не удалось прочитать {path} стандартными разделителями.")
    lower = {c.lower(): c for c in df.columns}
    sample_col = lower.get("sample", df.columns[0])
    out = df[[sample_col]].rename(columns={sample_col: "sample"}).copy()
    out["sample"] = out["sample"].astype(str)
    return out

def predict_bio_for_texts(model, tokenizer, id2label: Dict[int,str], texts: List[str], max_len: int, batch_size: int, device: str):
    def split_with_offsets_local(text: str) -> List[Tuple[str, int, int]]:
        words, i, n = [], 0, len(text)
        while i < n:
            if text[i].isspace():
                i += 1
                continue
            s = i
            while i < n and not text[i].isspace():
                i += 1
            e = i
            words.append((text[s:e], s, e))
        return words

    all_labels, i = [], 0
    model.eval()
    while i < len(texts):
        batch_texts = texts[i:i+batch_size]
        batch_words = [[w for (w, _, __) in split_with_offsets_local(t)] for t in batch_texts]

        enc = tokenizer(
            batch_words,
            is_split_into_words=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
            padding=True
        )
        word_ids_batches = [enc.word_ids(batch_index=j) for j in range(len(batch_texts))]
        enc_inputs = {k: v.to(device) for k, v in enc.items()}

        with torch.no_grad():
            out = model(**enc_inputs)
            pred_ids = out.logits.argmax(-1).cpu().numpy()

        for j in range(len(batch_texts)):
            wi_list = word_ids_batches[j]
            first_label_by_word = {}
            for pos, wi in enumerate(wi_list):
                if wi is None: continue
                if wi not in first_label_by_word:
                    first_label_by_word[wi] = int(pred_ids[j, pos])
            num_words = len(batch_words[j])
            pred_labels = [id2label.get(first_label_by_word.get(widx, 0), "O") for widx in range(num_words)]
            all_labels.append(pred_labels)
        i += batch_size
    return all_labels

def bio_words_to_char_spans(text: str, bio_labels: List[str]) -> List[Tuple[int,int,str]]:
    words = split_with_offsets(text)
    spans = []
    for k, (_, s, e) in enumerate(words):
        lab = bio_labels[k] if k < len(bio_labels) else "O"
        spans.append((int(s), int(e), str(lab)))
    return spans

def fix_bi_chains(ann_str: str) -> str:
    B_PREFIX = "B-"; I_PREFIX = "I-"
    try:
        spans = ast.literal_eval(ann_str)
    except Exception:
        return ann_str
    if not isinstance(spans, list):
        return ann_str
    new_spans = []
    prev_type = None
    prev_is_entity = False
    for (s, e, lab) in spans:
        if not isinstance(lab, str):
            new_spans.append((int(s), int(e), lab))
            prev_type, prev_is_entity = None, False
            continue
        if lab == "O":
            new_spans.append((int(s), int(e), lab))
            prev_type, prev_is_entity = None, False
            continue
        if lab.startswith(B_PREFIX):
            cur_type = lab[len(B_PREFIX):]
            if prev_is_entity and prev_type == cur_type:
                lab = f"{I_PREFIX}{cur_type}"
            prev_type, prev_is_entity = cur_type, True
            new_spans.append((int(s), int(e), lab))
            continue
        if lab.startswith(I_PREFIX):
            cur_type = lab[len(I_PREFIX):]
            prev_type, prev_is_entity = cur_type, True
            new_spans.append((int(s), int(e), lab))
            continue
        new_spans.append((int(s), int(e), lab))
        prev_type, prev_is_entity = None, False
    return "[" + ", ".join(f"({s}, {e}, {repr(l)})" for (s, e, l) in new_spans) + "]"

# ======================== ONE RUN ========================
def run_single(hp: Dict[str,Any]) -> Dict[str,Any]:
    """Train, evaluate, save model, produce submission; return artifacts + metrics."""
    cfg = DEFAULTS.copy()
    cfg.update(hp or {})
    set_all_seeds(cfg["SEED"])

    ENTITY_TYPES = cfg["ENTITY_TYPES"]
    BIO_LABELS   = label_set(ENTITY_TYPES)

    # Paths and names
    safe_mkdir(cfg["OUTPUT_ROOT"])
    safe_mkdir(cfg["SUBMIT_DIR"])
    tag = (
        f"lr{cfg['LR']}_bs{cfg['BATCH']}_ep{cfg['EPOCHS']}"
        f"_ls{cfg['LABEL_SMOOTHING']}_do{cfg['DROPOUT_H']}-{cfg['DROPOUT_A']}"
        f"_ml{cfg['MAX_LEN']}_seed{cfg['SEED']}"
    ).replace(".", "p")
    run_dir = os.path.join(cfg["OUTPUT_ROOT"], f"{now_tag()}_{tag}")
    safe_mkdir(run_dir)

    # ----- Load data
    noisy_raw = read_span_csv(cfg["NOISY_PATH"])
    clean_raw = read_span_csv(cfg["CLEAN_PATH"])

    noisy_df  = to_sequences(noisy_raw, "noisy", ENTITY_TYPES, BIO_LABELS)
    clean_df  = to_sequences(clean_raw, "clean", ENTITY_TYPES, BIO_LABELS)
    noisy_df["group"]=noisy_raw["sample"].apply(text_group).values
    clean_df["group"]=clean_raw["sample"].apply(text_group).values

    n_tr, n_val = group_split(noisy_df, cfg["VAL_NOISY_FRAC"], cfg["SEED"])
    c_tr, c_val = group_split(clean_df, cfg["VAL_CLEAN_FRAC"], cfg["SEED"])

    if cfg["ENFORCE_VAL_MIX"] and len(n_val)>0:
        target_clean = int(round(len(n_val) * (1 - cfg["VAL_TARGET_NOISY_RATIO"]) / max(cfg["VAL_TARGET_NOISY_RATIO"],1e-9)))
        if target_clean == 0 and len(c_val)>0:
            target_clean = min(max(1, len(c_val)//10), len(c_val))
        if target_clean < len(c_val):
            rng = np.random.default_rng(cfg["SEED"])
            c_val = rng.choice(c_val, size=target_clean, replace=False)

    val_df   = pd.concat([noisy_df.iloc[n_val], clean_df.iloc[c_val]], ignore_index=True)
    train_df = pd.concat([noisy_df.iloc[n_tr], clean_df.iloc[c_tr]], ignore_index=True)

    # ----- Tokenize
    label2id = {l:i for i,l in enumerate(BIO_LABELS)}
    id2label = {i:l for i,l in enumerate(BIO_LABELS)}
    tokenizer = AutoTokenizer.from_pretrained(cfg["MODEL_NAME"], use_fast=True)
    tok_fn = tokenize_and_align_factory(tokenizer, label2id, cfg["MAX_LEN"])

    ds = DatasetDict({
        "train":      Dataset.from_pandas(train_df.drop(columns=["group"])),
        "validation": Dataset.from_pandas(val_df.drop(columns=["group"]))
    })
    tok = ds.map(tok_fn, batched=True, remove_columns=["tokens","labels","sample","source"])
    collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # ----- Model & config
    config = AutoConfig.from_pretrained(
        cfg["MODEL_NAME"],
        num_labels=len(BIO_LABELS),
        id2label=id2label, label2id=label2id,
        hidden_dropout_prob=cfg["DROPOUT_H"],
        attention_probs_dropout_prob=cfg["DROPOUT_A"]
    )
    model = AutoModelForTokenClassification.from_pretrained(cfg["MODEL_NAME"], config=config)

    # init classifier bias by train priors
    train_freq = compute_token_freqs(tok["train"], len(BIO_LABELS))
    with torch.no_grad():
        bias = torch.log(torch.tensor(train_freq, dtype=torch.float32))
        model.classifier.bias.copy_(bias)

    # ----- class weights
    class_weights = compute_label_weights_samplewise(train_df[["tokens","labels"]], label2id,
                                                     min_w=cfg["MIN_WEIGHT"], max_w=cfg["MAX_WEIGHT"])

    # ----- metrics
    compute_metrics = make_compute_metrics(id2label, ENTITY_TYPES)

    # ----- args + trainer
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
    training_args = build_training_args(
        args_over={},
        output_dir=run_dir,
        grad_norm_clip=cfg["GRAD_NORM_CLIP"],
        epochs=cfg["EPOCHS"],
        lr=cfg["LR"],
        batch=cfg["BATCH"],
        seed=cfg["SEED"],
    )
    trainer = WeightedSmoothedTrainer(
        model=model,
        args=training_args,
        train_dataset=tok["train"],
        eval_dataset=tok["validation"],
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights,
        label_smoothing=cfg["LABEL_SMOOTHING"],
        callbacks=callbacks
    )

    # ----- train + eval
    trainer.train()
    metrics = trainer.evaluate()
    pref = "eval_" if any(k.startswith("eval_") for k in metrics.keys()) else ""
    macro = float(metrics.get(pref+"macro_f1_entity", 0.0))
    f1tok = float(metrics.get(pref+"f1_token", 0.0))

    # ----- save model (+ id2label/label2id for local load)
    trainer.save_model(run_dir)
    with open(os.path.join(run_dir, "id2label.json"), "w", encoding="utf-8") as f:
        json.dump({int(k): v for k, v in id2label.items()}, f, ensure_ascii=False, indent=2)
    with open(os.path.join(run_dir, "label2id.json"), "w", encoding="utf-8") as f:
        json.dump({k: int(v) for k, v in label2id.items()}, f, ensure_ascii=False, indent=2)

    # ======================== Inference → submission ========================
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device); model.eval()

    # Guard MAX_LEN by model's positional limit
    cfg_max = getattr(config, "max_position_embeddings", None)
    if not isinstance(cfg_max, int) or cfg_max <= 0:
        MAX_LEN_EVAL = min(cfg["MAX_LEN"], 512)
    else:
        MAX_LEN_EVAL = min(cfg["MAX_LEN"], cfg_max, 512)

    sub_df = read_submission(cfg["SUBMISSION_PATH"])
    texts  = sub_df["sample"].tolist()
    pred_bio = predict_bio_for_texts(model, tokenizer, id2label, texts, MAX_LEN_EVAL, cfg["BATCH_INFER"], device)
    pred_spans = [bio_words_to_char_spans(t, labs) for t, labs in zip(texts, pred_bio)]

    out_df = pd.DataFrame({
        "sample": sub_df["sample"],
        "annotation": ["[" + ", ".join(f"({s}, {e}, {repr(lab)})" for (s, e, lab) in sp) + "]" for sp in pred_spans]
    })
    raw_path  = os.path.join(run_dir, "submission-bert.csv")
    post_path = os.path.join(run_dir, "submission-bert-post.csv")
    out_df.to_csv(raw_path, index=False, sep=";")

    # Postprocess B→I chains
    post_df = out_df.copy()
    post_df["annotation"] = post_df["annotation"].apply(fix_bi_chains)
    post_df.to_csv(post_path, index=False, sep=";")

    # Copy to submissions with informative filename
    short_macro = f"{macro:.4f}".replace(".", "p")
    short_f1tok = f"{f1tok:.4f}".replace(".", "p")
    submit_name = f"submission_{now_tag()}_{tag}_macro{short_macro}_f1tok{short_f1tok}.csv"
    final_submit_path = os.path.join(cfg["SUBMIT_DIR"], submit_name)
    post_df.to_csv(final_submit_path, index=False, sep=";")

    # Free GPU memory between runs
    del trainer, model
    torch.cuda.empty_cache()
    gc.collect()

    return dict(
        run_dir=run_dir,
        raw_path=raw_path,
        post_path=post_path,
        final_submit_path=final_submit_path,
        metrics=metrics,
        macro_f1_entity=macro,
        f1_token=f1tok,
        params=cfg,
        tag=tag,
    )

# ======================== SWEEP ========================
def main():
    safe_mkdir(DEFAULTS["OUTPUT_ROOT"])
    safe_mkdir(DEFAULTS["SUBMIT_DIR"])
    log_csv = os.path.join(DEFAULTS["SUBMIT_DIR"], "tune_log.csv")

    # Define your search space here (edit freely!)
    # Keep it modest if running on a single GPU overnight.
    SEARCH_SPACE = [
    dict(
        LR=1.65e-5,
        BATCH=48,
        EPOCHS=5,
        LABEL_SMOOTHING=0.03,
        DROPOUT_H=0.1,
        DROPOUT_A=0.1,
        MAX_LEN=128,
        GRAD_NORM_CLIP=0.8,
    )
]

    # Optionally, set different seeds per run:
    # for i, s in enumerate(SEARCH_SPACE): s["SEED"] = 42 + i

    # Prepare log
    if not os.path.isfile(log_csv):
        pd.DataFrame(columns=[
            "timestamp","tag","run_dir","submission_file",
            "macro_f1_entity","f1_token","params_json"
        ]).to_csv(log_csv, index=False)

    for i, hp in enumerate(SEARCH_SPACE, start=1):
        print(f"\n========== RUN {i}/{len(SEARCH_SPACE)}: {hp} ==========")
        try:
            res = run_single(hp)
            row = {
                "timestamp": datetime.now().isoformat(timespec="seconds"),
                "tag": res["tag"],
                "run_dir": res["run_dir"],
                "submission_file": res["final_submit_path"],
                "macro_f1_entity": res.get("macro_f1_entity", None),
                "f1_token": res.get("f1_token", None),
                "params_json": json.dumps(hp, ensure_ascii=False),
            }
            # append
            pd.DataFrame([row]).to_csv(log_csv, mode="a", header=False, index=False)
            print(f"✔ Saved submission → {res['final_submit_path']}")
            print(f"✔ Logged to        → {log_csv}")
        except Exception as e:
            print(f"✖ Run failed: {hp}\n{e}")
            # log failure too
            row = {
                "timestamp": datetime.now().isoformat(timespec="seconds"),
                "tag": f"FAILED_{now_tag()}",
                "run_dir": "",
                "submission_file": "",
                "macro_f1_entity": None,
                "f1_token": None,
                "params_json": json.dumps({**hp, "_error": str(e)}, ensure_ascii=False),
            }
            pd.DataFrame([row]).to_csv(log_csv, mode="a", header=False, index=False)
            # attempt to continue next run
            torch.cuda.empty_cache()
            gc.collect()

if __name__ == "__main__":
    main()


2025-10-02 15:17:34.989322: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-02 15:17:34.997462: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759411055.006926  697725 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759411055.010066  697725 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1759411055.018446  697725 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 




Map:   0%|          | 0/37063 [00:00<?, ? examples/s]

Map:   0%|          | 0/3328 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision Token,Recall Token,F1 Token,Accuracy Token,Macro F1 Entity,Entity Report Text
1,0.3542,0.481256,0.923027,0.940667,0.931763,0.925317,0.932993,"Entity-level (STRICT spans): Type TP FP FN Prec Rec F1 TYPE 3274 180 151 0.948 0.956 0.952 BRAND 838 168 110 0.833 0.884 0.858 VOLUME 76 5 4 0.938 0.950 0.944 PERCENT 45 0 2 1.000 0.957 0.978 Macro-F1 (over TYPE, BRAND, VOLUME, PERCENT): 0.9330"
2,0.3304,0.43567,0.947045,0.957778,0.952381,0.943468,0.9586,"Entity-level (STRICT spans): Type TP FP FN Prec Rec F1 TYPE 3316 136 109 0.961 0.968 0.964 BRAND 870 103 78 0.894 0.918 0.906 VOLUME 78 2 2 0.975 0.975 0.975 PERCENT 46 0 1 1.000 0.979 0.989 Macro-F1 (over TYPE, BRAND, VOLUME, PERCENT): 0.9586"
3,0.2927,0.454476,0.948348,0.962889,0.955563,0.947627,0.96032,"Entity-level (STRICT spans): Type TP FP FN Prec Rec F1 TYPE 3360 168 65 0.952 0.981 0.966 BRAND 848 64 100 0.930 0.895 0.912 VOLUME 78 4 2 0.951 0.975 0.963 PERCENT 47 0 0 1.000 1.000 1.000 Macro-F1 (over TYPE, BRAND, VOLUME, PERCENT): 0.9603"
4,0.2812,0.444901,0.949517,0.961333,0.955389,0.946871,0.963488,"Entity-level (STRICT spans): Type TP FP FN Prec Rec F1 TYPE 3323 130 102 0.962 0.970 0.966 BRAND 878 98 70 0.900 0.926 0.913 VOLUME 78 2 2 0.975 0.975 0.975 PERCENT 47 0 0 1.000 1.000 1.000 Macro-F1 (over TYPE, BRAND, VOLUME, PERCENT): 0.9635"
5,0.261,0.457028,0.951578,0.965111,0.958297,0.950085,0.964208,"Entity-level (STRICT spans): Type TP FP FN Prec Rec F1 TYPE 3345 140 80 0.960 0.977 0.968 BRAND 872 77 76 0.919 0.920 0.919 VOLUME 79 4 1 0.952 0.988 0.969 PERCENT 47 0 0 1.000 1.000 1.000 Macro-F1 (over TYPE, BRAND, VOLUME, PERCENT): 0.9642"


✔ Saved submission → ./submissions-final/submission_20251002-152119_lr1p65e-05_bs48_ep5_ls0p03_do0p1-0p1_ml128_seed42_macro0p9642_f1tok0p9583.csv
✔ Logged to        → ./submissions-final/tune_log.csv
