In [None]:
# 1. Fresh PyTorch with flex_attention support
# torch / torchvision / torchaudio are pinned to exact versions and installed from
# the PyTorch CUDA 12.1 wheel index.
!pip install -U "torch==2.5.1" "torchvision==0.20.1" "torchaudio==2.5.1" \
  --index-url https://download.pytorch.org/whl/cu121

# 2. Hugging Face stack plus the Mamba / causal-conv1d / flash-attn packages.
# Versions are either pinned exactly or capped with an upper bound.
# NOTE(review): the recorded output shows flash-attn 2.8.3 being built from the
# source tarball and replacing an existing 2.6.3 install — confirm this is intended.
!pip install -U "transformers==4.47.0" \
               "accelerate<1.0" \
               "huggingface_hub<0.26" \
               "mamba-ssm==2.2.2" \
               "causal-conv1d==1.4.0" \
               "flash-attn==2.8.3" \
               einops

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting flash-attn==2.8.3
  Using cached flash_attn-2.8.3.tar.gz (8.4 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=255984554 sha256=51f6422861ed951b968428adc9fa7406027f73f2145be5e163810df6f459abea
  Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash-attn
Installing collected packages: flash-attn
  Attempting uninstall: flash-attn
    Found existing installation: flash-attn 2.6.3
    Uninstalling flash-attn-2.6.3:
      Successfully uninstalled flash-attn-2.6.3
Successfully installed flash-attn-2.8.3


In [None]:
# Print the GPU model, driver version and current memory usage of the runtime.
!nvidia-smi

Sun Nov 16 01:33:11 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# --- Clone Ada-LEval ---
import os

# Work from /content and clone the benchmark repo only when the directory is absent,
# so re-running this cell does not fail on an existing checkout.
%cd /content
if not os.path.exists("Ada-LEval"):
    !git clone https://github.com/open-compass/Ada-LEval.git

# The relative paths below (fetch_data.sh) are resolved from the repo root.
%cd /content/Ada-LEval

# Download data (if you already did this before, this will just no-op / re-download)
# NOTE(review): the script runs whenever it exists; whether it skips files that are
# already present depends on fetch_data.sh itself — confirm.
if os.path.exists("fetch_data.sh"):
    !bash fetch_data.sh
else:
    print("WARNING: fetch_data.sh not found – make sure data/*.json exists.")

/content
/content/Ada-LEval
--2025-11-16 01:33:11--  http://opencompass.openxlab.space/utils/AdaLEval/stackselect_1k.json
Resolving opencompass.openxlab.space (opencompass.openxlab.space)... 47.102.9.81
Connecting to opencompass.openxlab.space (opencompass.openxlab.space)|47.102.9.81|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3170252 (3.0M) [application/json]
Saving to: ‘data/stackselect_1k.json’


2025-11-16 01:33:12 (5.67 MB/s) - ‘data/stackselect_1k.json’ saved [3170252/3170252]

--2025-11-16 01:33:12--  http://opencompass.openxlab.space/utils/AdaLEval/stackselect_2k.json
Resolving opencompass.openxlab.space (opencompass.openxlab.space)... 47.102.9.81
Connecting to opencompass.openxlab.space (opencompass.openxlab.space)|47.102.9.81|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7632671 (7.3M) [application/json]
Saving to: ‘data/stackselect_2k.json’


2025-11-16 01:33:13 (11.8 MB/s) - ‘data/stackselect_2k.json’ saved [7632671/76

In [None]:
import json
import re
import time
from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm

# Prefer the GPU when one is visible to torch; DEVICE is the default `device`
# argument of eval_model_on_adaleval_task.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Use TF32 for speed on A100
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# How many examples per task to run (change if needed)
MAX_SAMPLES_PER_TASK = 250

# Generation parameters
MAX_NEW_TOKENS = 32   # enough for "Answer: [4,1,3,2]" etc.
# NOTE(review): TEMPERATURE is not referenced by any visible cell; generation is
# called with do_sample=False and temperature=None, so this value has no effect.
TEMPERATURE = 0.0     # greedy (deterministic)

Device: cuda


In [None]:
import json
import itertools
from collections import Counter
import time
import torch


In [None]:
DATA_DIR = Path("/content/Ada-LEval/data")

def load_adaleval_task(task_name: str, max_samples: int = None):
    """
    Load the samples of one Ada-LEval task from ``DATA_DIR/<task_name>.json``.

    Args:
        task_name: File stem of the task, e.g. "textsort_1k".
        max_samples: If given, keep only the first ``max_samples`` samples;
            ``None`` keeps all of them.

    Returns:
        The list of samples: the top-level JSON value when it is a list,
        otherwise the value stored under its "data" key.

    Raises:
        FileNotFoundError: If the task file does not exist under DATA_DIR.
    """
    data_path = DATA_DIR / f"{task_name}.json"
    if not data_path.exists():
        raise FileNotFoundError(f"{data_path} not found. Check that data is downloaded.")

    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # In Ada-LEval, the JSON is a list of samples
    samples = data if isinstance(data, list) else data["data"]
    if max_samples is not None:
        samples = samples[:max_samples]
    return samples

def textsort_extract_prediction(pred: str):
    """
    Extract a TextSort ordering from raw model output.

    Intended to mirror ada_leval.TextSort.evaluate.extract:

    - If the text after 'Answer:' (or the whole text when no 'Answer:' is
      present) parses as a flat JSON list, return that list.
    - Otherwise search the whole prediction for a unique subsequence among
      all 4! permutations of 1..4 and return it as a list of ints.
    - If ambiguous or not found, return [0,0,0,0].

    The function always returns a list whose elements are hashable, so callers
    can safely pass the result to ``list(...)`` or ``collections.Counter``.

    Args:
        pred: Decoded model output.

    Returns:
        A list: the parsed JSON list, a permutation of [1, 2, 3, 4], or the
        fallback [0, 0, 0, 0].
    """
    raw = pred

    # Strip "Answer:" prefix if present
    if "Answer:" in raw:
        raw_tail = raw.split("Answer:", 1)[1].strip()
    else:
        raw_tail = raw.strip()

    # 1) Try JSON parse (e.g. "[4, 1, 3, 2]")
    try:
        parsed = json.loads(raw_tail)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly
        # nested input. Either way this is not a clean list.
        parsed = None

    # Accept only a flat list. json.loads also succeeds on bare scalars ("4",
    # "null", "true") and nested containers, which would otherwise be returned
    # as-is and crash callers that iterate or count the prediction.
    if isinstance(parsed, list) and not any(isinstance(x, (list, dict)) for x in parsed):
        return parsed

    # 2) Subsequence heuristic over all permutations 1..4
    def is_subseq(needle: str, haystack: str) -> bool:
        current_pos = 0
        for c in needle:
            idx = haystack.find(c, current_pos)
            if idx == -1:
                return False
            current_pos = idx + 1
        return True

    perms = list(itertools.permutations(range(1, 5)))
    perm_strs = [''.join(str(x) for x in p) for p in perms]
    # The heuristic deliberately scans the full prediction, not just the tail.
    matching = [p for p, s in zip(perms, perm_strs) if is_subseq(s, raw)]

    if len(matching) == 1:
        return list(matching[0])

    # 3) Fallback
    return [0, 0, 0, 0]


def stackselect_extract_prediction(pred: str, num_choice: int):
    """
    Pull the chosen answer designation out of a StackSelect model response.

    Intended to mirror ada_leval.StackSelect.evaluate.extract. Two passes are
    made over the candidates 1..num_choice:

    1. Look for the labelled forms "A1".."An" anywhere in ``pred``.
    2. If none of those occur, look for the bare numbers "1".."n".

    In each pass the HIGHEST-numbered candidate that occurs in the text wins
    (not the one appearing last in the text), which also means "A10" beats its
    own prefix "A1".

    Args:
        pred: Decoded model output.
        num_choice: Number of candidate answers in the sample.

    Returns:
        The designation as "A<k>", or "???" when no candidate is found.
    """
    # Pass 1 uses the "A" prefix, pass 2 the bare number; both report "A<k>".
    for prefix in ("A", ""):
        for number in range(num_choice, 0, -1):
            if f"{prefix}{number}" in pred:
                return f"A{number}"

    return "???"


def f1_lists(pred_list, gold_list):
    """
    Order-insensitive F1 between two label lists, using multiset overlap.

    This is a secondary metric and not part of Ada-LEval. Because element
    order is ignored, any reordering of the gold labels scores 1.0.

    Args:
        pred_list: Predicted labels, or None for "no prediction".
        gold_list: Reference labels.

    Returns:
        0.0 when ``pred_list`` is None or nothing overlaps, 1.0 when both
        lists are empty, otherwise the harmonic mean of precision and recall.
    """
    if pred_list is None:
        return 0.0

    reference = list(gold_list)
    predicted = list(pred_list)

    if not reference and not predicted:
        return 1.0

    # Counter intersection keeps, per label, the smaller of the two counts.
    overlap = Counter(reference) & Counter(predicted)
    n_shared = sum(overlap.values())

    # Returning early here also guards the divisions below against empty lists.
    if n_shared == 0:
        return 0.0

    precision = n_shared / len(predicted)
    recall = n_shared / len(reference)
    return 2 * precision * recall / (precision + recall)


In [None]:
def load_causal_model(model_id: str, device: str = None):
    """
    Load a causal LM in bfloat16 together with its tokenizer.

    Args:
        model_id: Hugging Face model identifier or local path.
        device: Device passed to ``device_map`` (e.g. "cuda", "cuda:0", "cpu").
            ``None`` (the default) selects "cuda" when torch can see a GPU and
            "cpu" otherwise, so the loader follows the same fallback rule as
            the rest of the notebook instead of hard-coding CUDA.

    Returns:
        ``(tokenizer, model)`` with the model switched to eval mode.
    """
    # SECURITY: trust_remote_code=True runs Python code shipped in the model
    # repository (needed for custom Mamba/Hymba code). Only use it with
    # repositories you trust.
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tok.pad_token is None:
        # Reuse EOS as padding when the tokenizer defines no pad token.
        tok.pad_token = tok.eos_token

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,   # keep using torch_dtype (dtype kw is not supported by some custom models)
        device_map=device,
        trust_remote_code=True,
    )
    model.eval()
    return tok, model

In [None]:
# Instruction header for StackSelect prompts (text reproduced verbatim; the "\n"
# escapes inside it are real newlines once the literal is evaluated).
_STACKSELECT_META_PROMPT = """
You are an AI assistant. Your job is to find out the most helpful answer to a given question.
Each time, you will be provided with a question and n answers to this question.
Each answer begins with an 'A' and a number(e.g. A4), which represents its designation.
You need to determine which answer is the most helpful one to the question.
The case sample is shown below and you should give me the answer in the format exactly the same as the sample. \n
However, you should NOT focus on the content of sample answer. \n
Sample Input (format only): \n
The question is given below.
XXX(The content of question)
Possible answers are given below.
A1:
XXX(The content of answer 1)
A2:
XXX(The content of answer 2)
.
.
.
An:
XXX(The content of answer n)
Now the answers are over, please decide which answer is the most helpful one to the question.
You must give me only the designation of the MOST helpful answer.
Sample Output (format only): \n
Answer: The designation of the most helpful answer.(e.g. A4 means answer 4 is the most helpful answer) \n\n
"""

# Closing instruction appended after the last candidate answer.
_STACKSELECT_PROMPT_TAIL = """
Now the answers are over, please decide which answer is the most helpful one to the question.
You must give me only the designation of the MOST helpful answer.
"""


def _build_stackselect_prompt(sample) -> str:
    """Assemble a StackSelect prompt: instructions, question, answers labelled A1..An, closing instruction."""
    question = sample["question"]
    all_answers = sample["all_answers"]

    pieces = [
        _STACKSELECT_META_PROMPT,
        "The question is given below.\n",
        question + "\n\n",
        "Possible answers are given below.\n",
    ]
    pieces.extend(f"A{j}:\n\n{ans}\n\n" for j, ans in enumerate(all_answers, start=1))
    pieces.append(_STACKSELECT_PROMPT_TAIL)
    return "".join(pieces)


@torch.inference_mode()
def eval_model_on_adaleval_task(
    model,
    tok,
    task_name: str,
    max_samples: int = MAX_SAMPLES_PER_TASK,
    max_new_tokens: int = MAX_NEW_TOKENS,
    device: str = DEVICE,
):
    """
    Run greedy generation over one Ada-LEval task and score the outputs.

    The task family is taken from the name prefix: "textsort*" samples carry a
    ready-made prompt and are scored by exact match plus an order-insensitive
    F1; "stackselect*" prompts are rebuilt from question + answers and scored
    by exact match on the designation (so per-example F1 equals correctness).

    Args:
        model: Causal LM exposing ``generate``.
        tok: Tokenizer matching ``model``.
        task_name: Task file stem, e.g. "textsort_1k" or "stackselect_4k".
        max_samples: Upper bound on the number of samples evaluated.
        max_new_tokens: Generation budget per sample.
        device: Device the tokenized inputs are moved to; peak-memory tracking
            is only performed when this is exactly "cuda".

    Returns:
        Dict with keys "task", "num_examples", "accuracy", "f1",
        "throughput_toks_per_sec" and "peak_mem_gb".

    Raises:
        ValueError: If the task name has an unknown prefix, or the task
            yields no samples (which would otherwise end in a division by zero).
    """
    samples = load_adaleval_task(task_name, max_samples=max_samples)
    n = len(samples)
    print(f"\n=== Task: {task_name} | num_samples = {n} ===")

    is_textsort = task_name.startswith("textsort")
    is_stackselect = task_name.startswith("stackselect")

    # Validate up front rather than per sample, so a bad task name is reported
    # even when the sample list is empty.
    if not (is_textsort or is_stackselect):
        raise ValueError(f"Unknown Ada-LEval task type: {task_name}")
    if n == 0:
        raise ValueError(f"No samples to evaluate for task {task_name!r} (max_samples={max_samples}).")

    total_tokens = 0
    num_correct = 0.0
    f1_sum = 0.0  # for TextSort; for StackSelect this will equal accuracy

    if device == "cuda":
        torch.cuda.reset_peak_memory_stats()

    # The clock covers prompt building, tokenization, generation and scoring.
    start_time = time.perf_counter()

    for i, sample in enumerate(tqdm(samples, desc=f"{task_name}", unit="ex")):
        # ----- Build prompt (mirror Ada-Leval) -----
        if is_textsort:
            # TextSort stores the full prompt string in the JSON already
            prompt = sample["prompt"]
        else:
            prompt = _build_stackselect_prompt(sample)

        # ----- Tokenize & generate -----
        # truncation=False: the full long-context prompt is fed to the model.
        inputs = tok(prompt, return_tensors="pt", truncation=False).to(device)
        input_len = inputs["input_ids"].shape[1]

        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,     # greedy decoding
            temperature=None,    # presumably unset to avoid a sampling-parameter warning under greedy decoding
            use_cache=True,
        )

        gen_ids = out[0][input_len:]  # generated tokens only
        total_tokens += gen_ids.numel()

        pred_text = tok.decode(gen_ids, skip_special_tokens=True)

        # ----- Gold + extraction -----
        if is_textsort:
            # Gold answer: list or JSON string
            gold = sample["answer"]
            if isinstance(gold, str):
                gold = json.loads(gold)

            pred_extracted = textsort_extract_prediction(pred_text)

            # EM
            correct = int(list(pred_extracted) == list(gold))
            num_correct += correct

            # F1 (order-insensitive) for extra signal
            f1_sum += f1_lists(pred_extracted, gold)
        else:
            # Gold is like "A4"
            gold = sample["answer"]
            num_choice = len(sample["all_answers"])

            pred_extracted = stackselect_extract_prediction(pred_text, num_choice)

            correct = int(pred_extracted == gold)
            num_correct += correct
            # For single-label classification, F1 per example is 1 if correct else 0,
            # so average F1 == accuracy.
            f1_sum += correct

        # Optional logging
        if (i + 1) % 50 == 0:
            print(
                f"[{i + 1}/{n}] "
                f"Acc: {num_correct / (i + 1):.4f} | "
                f"F1: {f1_sum / (i + 1):.4f}"
            )

    elapsed = time.perf_counter() - start_time
    elapsed = max(elapsed, 1e-8)  # keep the throughput division finite

    if device == "cuda":
        peak_mem_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
    else:
        peak_mem_gb = 0.0

    accuracy = num_correct / n
    mean_f1 = f1_sum / n
    throughput = total_tokens / elapsed

    print(f"Total accuracy: {accuracy:.4f}")
    print(f"Total F1:       {mean_f1:.4f}")
    print(f"Throughput:     {throughput:.2f} tokens/sec")
    print(f"Peak memory:    {peak_mem_gb:.2f} GB")

    return {
        "task": task_name,
        "num_examples": n,
        "accuracy": accuracy,          # matches Ada-Leval logic
        "f1": mean_f1,                 # extra metric (TextSort real F1, StackSelect == acc)
        "throughput_toks_per_sec": throughput,
        "peak_mem_gb": peak_mem_gb,
    }


In [None]:
# You can change these if you want other sizes
# Maps a short label (stored in the results as "model") to the model id to load.
MODEL_CONFIGS = {
    "hymba-1.5b": "nvidia/Hymba-1.5B-Base",
}

# 5 tasks as listed: TextSort at 1k/2k, StackSelect at 1k/4k/8k.
# NOTE(review): this comment previously read "6 tasks = 2 types × 3 context lengths",
# but the list is not a full 2 × 3 grid — confirm whether a task is missing.
TASK_NAMES = [
    "textsort_1k",
    "textsort_2k",
    "stackselect_1k",
    "stackselect_4k",
    "stackselect_8k",
]

print("Models:", MODEL_CONFIGS)
print("Tasks:", TASK_NAMES)

Models: {'hymba-1.5b': 'nvidia/Hymba-1.5B-Base'}
Tasks: ['textsort_1k', 'textsort_2k', 'stackselect_1k', 'stackselect_4k', 'stackselect_8k']


In [None]:
import os
import json
import pandas as pd

SAVE_PATH = "/content/adaleval_results.json"

# Column order of the final summary table.
SUMMARY_COLUMNS = [
    "model", "task", "num_examples", "accuracy", "f1",
    "throughput_toks_per_sec", "peak_mem_gb",
]

# Optionally resume from existing results
if os.path.exists(SAVE_PATH):
    with open(SAVE_PATH, "r") as f:
        all_results = json.load(f)
    print(f"Loaded {len(all_results)} existing results from {SAVE_PATH}")
else:
    all_results = []

def save_results_to_json():
    """Persist ``all_results`` to SAVE_PATH.

    The data is written to a sibling temp file and then moved into place with
    os.replace, so an interruption mid-write cannot leave a truncated results
    file that would break the resume logic above.
    """
    tmp_path = SAVE_PATH + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(all_results, f, indent=2)
    os.replace(tmp_path, SAVE_PATH)
    print(f"✔ Saved {len(all_results)} results to {SAVE_PATH}")

def _already_evaluated(model_label, task_name):
    """Return True when ``all_results`` already holds this (model, task) pair."""
    return any(
        r.get("model") == model_label and r.get("task") == task_name
        for r in all_results
    )

for model_label, model_id in MODEL_CONFIGS.items():
    # Don't load the model at all when every task is already done.
    if all(_already_evaluated(model_label, t) for t in TASK_NAMES):
        print(f"Skipping {model_label}: all {len(TASK_NAMES)} tasks already in results.")
        continue

    print("\n" + "#" * 80)
    print(f"Loading model: {model_label} -> {model_id}")
    print("#" * 80)

    tok, model = load_causal_model(model_id)

    # ensure pad token is set
    if getattr(model, "config", None) is not None:
        if model.config.pad_token_id is None and tok.pad_token_id is not None:
            model.config.pad_token_id = tok.pad_token_id

    # Evaluate across tasks
    for task_name in TASK_NAMES:
        # Skip if this (model, task) is already present (optional, but handy)
        if _already_evaluated(model_label, task_name):
            print(f"Skipping {model_label} on {task_name} (already in results).")
            continue

        res = eval_model_on_adaleval_task(
            model,
            tok,
            task_name=task_name,
            max_samples=MAX_SAMPLES_PER_TASK,
        )
        res["model"] = model_label

        # Make sure f1 exists even if eval function didn't return it for some reason
        res.setdefault("f1", 0.0)

        # Save after every task so a later failure loses at most one task.
        all_results.append(res)
        save_results_to_json()

    # cleanup for next model
    del model
    del tok
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

# Final summary as pandas DataFrame.
# reindex() both selects the expected columns and fills any missing one with NaN.
df = (
    pd.DataFrame(all_results)
    .reindex(columns=SUMMARY_COLUMNS)
    .sort_values(["model", "task"])
    .reset_index(drop=True)
)

df



################################################################################
Loading model: hymba-1.5b -> nvidia/Hymba-1.5B-Base
################################################################################


  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd
  @custom_fwd
  @custom_bwd


generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]


=== Task: textsort_1k | num_samples = 250 ===


textsort_1k:   0%|          | 0/250 [00:00<?, ?ex/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


[50/250] Acc: 0.0600 | F1: 0.7800
[100/250] Acc: 0.0600 | F1: 0.8200
[150/250] Acc: 0.0600 | F1: 0.8600
[200/250] Acc: 0.0550 | F1: 0.8650
[250/250] Acc: 0.0600 | F1: 0.8600
Total accuracy: 0.0600
Total F1:       0.8600
Throughput:     10.78 tokens/sec
Peak memory:    3.36 GB
✔ Saved 1 results to /content/adaleval_results.json

=== Task: textsort_2k | num_samples = 250 ===


textsort_2k:   0%|          | 0/250 [00:00<?, ?ex/s]

[50/250] Acc: 0.0000 | F1: 0.0000
[100/250] Acc: 0.0000 | F1: 0.0000
[150/250] Acc: 0.0000 | F1: 0.0000
[200/250] Acc: 0.0000 | F1: 0.0000
[250/250] Acc: 0.0000 | F1: 0.0000
Total accuracy: 0.0000
Total F1:       0.0000
Throughput:     10.79 tokens/sec
Peak memory:    3.80 GB
✔ Saved 2 results to /content/adaleval_results.json

=== Task: stackselect_1k | num_samples = 250 ===


stackselect_1k:   0%|          | 0/250 [00:00<?, ?ex/s]

[50/250] Acc: 0.0800 | F1: 0.0800
[100/250] Acc: 0.0900 | F1: 0.0900
[150/250] Acc: 0.0933 | F1: 0.0933
[200/250] Acc: 0.1000 | F1: 0.1000
[250/250] Acc: 0.1000 | F1: 0.1000
Total accuracy: 0.1000
Total F1:       0.1000
Throughput:     10.97 tokens/sec
Peak memory:    3.46 GB
✔ Saved 3 results to /content/adaleval_results.json

=== Task: stackselect_4k | num_samples = 250 ===


stackselect_4k:   0%|          | 0/250 [00:00<?, ?ex/s]

[50/250] Acc: 0.0000 | F1: 0.0000
[100/250] Acc: 0.0100 | F1: 0.0100
[150/250] Acc: 0.0333 | F1: 0.0333
[200/250] Acc: 0.0350 | F1: 0.0350
[250/250] Acc: 0.0360 | F1: 0.0360
Total accuracy: 0.0360
Total F1:       0.0360
Throughput:     10.23 tokens/sec
Peak memory:    5.03 GB
✔ Saved 4 results to /content/adaleval_results.json

=== Task: stackselect_8k | num_samples = 250 ===


stackselect_8k:   0%|          | 0/250 [00:00<?, ?ex/s]

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


[50/250] Acc: 0.0400 | F1: 0.0400
[100/250] Acc: 0.0200 | F1: 0.0200
[150/250] Acc: 0.0133 | F1: 0.0133
[200/250] Acc: 0.0150 | F1: 0.0150
[250/250] Acc: 0.0320 | F1: 0.0320
Total accuracy: 0.0320
Total F1:       0.0320
Throughput:     9.30 tokens/sec
Peak memory:    7.33 GB
✔ Saved 5 results to /content/adaleval_results.json


Unnamed: 0,model,task,num_examples,accuracy,f1,throughput_toks_per_sec,peak_mem_gb
0,hymba-1.5b,stackselect_1k,250,0.1,0.1,10.972929,3.458462
1,hymba-1.5b,stackselect_4k,250,0.036,0.036,10.225979,5.033111
2,hymba-1.5b,stackselect_8k,250,0.032,0.032,9.303454,7.334136
3,hymba-1.5b,textsort_1k,250,0.06,0.86,10.778139,3.361273
4,hymba-1.5b,textsort_2k,250,0.0,0.0,10.794505,3.804214
