In [1]:
!pip -q install "datasets>=2.19.0" "transformers>=4.43.0" "accelerate>=0.33.0" \
                "evaluate>=0.4.1" "groq>=0.9.0" einops

import os, time, numpy as np, re, string, torch
from datasets import load_dataset


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.9/134.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# SECURITY: API keys must never be hardcoded in a notebook. Any key that has
# been saved or shared inside a notebook should be treated as leaked and revoked.
# The key is taken from the environment; if it is missing, prompt for it without
# echoing so it never ends up in the saved cell source or output.
import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")
HF_TOKEN = os.environ.get("HF_TOKEN", None)      # optional if TinyLlama is public


In [3]:
# Size of the evaluation subset. Small by default so a full run stays quick;
# raise it (e.g. to 1000) for a more stable estimate.
N_SAMPLES = 300

# SQuAD as published on the Hugging Face Hub; only the validation split is scored.
ds = load_dataset("squad")
dev = ds["validation"]

# First N_SAMPLES validation rows, in dataset order.
eval_data = dev.select(range(N_SAMPLES))

def normalize_answer(s):
    """Normalize an answer string before EM/F1 comparison.

    Steps, applied in this order:
      1. lowercase the text;
      2. drop every character listed in ``string.punctuation``;
      3. replace the standalone words "a", "an", "the" with a space;
      4. collapse runs of whitespace and trim both ends.

    Args:
        s: raw answer text.

    Returns:
        The normalized string (possibly empty).
    """
    # Build the punctuation lookup once per call rather than once per character.
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punct = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
    return " ".join(without_articles.split())

def f1(pred, gold):
    """Token-level F1 between a predicted and a gold answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. Overlap is counted per distinct token as the smaller of its
    occurrence counts on either side.

    Returns:
        1.0 when both token lists are empty, 0.0 when no token is shared,
        otherwise the harmonic mean of precision and recall.
    """
    pred_tokens = normalize_answer(pred).split()
    gold_tokens = normalize_answer(gold).split()

    # Two empty answers count as a perfect match.
    if not pred_tokens and not gold_tokens:
        return 1.0

    overlap = 0
    for token in set(pred_tokens).intersection(gold_tokens):
        overlap += min(pred_tokens.count(token), gold_tokens.count(token))
    if overlap == 0:
        return 0.0

    prec = overlap / len(pred_tokens)
    rec = overlap / len(gold_tokens)
    return 2*prec*rec/(prec+rec)

def em(pred, gold):
    """Exact-match score: 1.0 if both answers normalize to the same string, else 0.0."""
    is_match = normalize_answer(pred) == normalize_answer(gold)
    return 1.0 if is_match else 0.0

def score(preds, refs):
    """Aggregate exact-match and F1 over aligned predictions and references.

    Args:
        preds: iterable of predicted answer strings.
        refs: iterable aligned with ``preds``. Each item is either one gold
            answer string or a list/tuple of acceptable gold answers; with
            several gold answers the best score over the alternatives is used.

    Returns:
        ``{"EM": ..., "F1": ...}`` as percentages, rounded to two decimals and
        stored as plain Python floats.
    """
    def best(metric, pred, gold):
        # A bare string is a single reference; anything else is a collection of them.
        golds = [gold] if isinstance(gold, str) else list(gold)
        return max((metric(pred, g) for g in golds), default=0.0)

    # Materialize the pairs once so one-shot iterators feed both metrics.
    pairs = list(zip(preds, refs))
    exact = 100 * np.mean([best(em, p, g) for p, g in pairs])
    overlap = 100 * np.mean([best(f1, p, g) for p, g in pairs])
    # float(...) keeps np.float64 out of the result, so it prints as 29.33
    # rather than np.float64(29.33) and serializes cleanly.
    return {"EM": round(float(exact), 2), "F1": round(float(overlap), 2)}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
# Prompt template for the extractive-QA task. It has two placeholders,
# {context} and {question}, to be filled with PROMPT.format(...), and it ends
# at "Answer:" with no trailing newline.
PROMPT = (
    "You are an extractive QA system.\n"
    "Answer with the shortest phrase copied exactly from the context.\n"
    "Output only the phrase, no quotes or extra words.\n\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tiny_id = "Vidyuth/TinyLlama-finetuned-squad"   # your found checkpoint

# `token=` is the current name of the Hub authentication argument; the older
# `use_auth_token=` spelling is deprecated. HF_TOKEN may be None.
tok_tiny = AutoTokenizer.from_pretrained(tiny_id, token=HF_TOKEN)

# Half-precision weights, placed automatically on the available device(s), and
# switched to eval mode. NOTE: recent transformers releases warn that
# `torch_dtype` is being renamed to `dtype`; the old name is kept here because
# the notebook allows transformers versions as old as 4.43.
mdl_tiny = AutoModelForCausalLM.from_pretrained(
    tiny_id, torch_dtype=torch.float16, device_map="auto", token=HF_TOKEN
).eval()

@torch.inference_mode()
def run_hf(model, tok, data):
    """Evaluate a local Hugging Face causal LM on extractive-QA examples.

    For every example the shared PROMPT is filled in, up to 16 new tokens are
    generated greedily, and the first line of the decoded continuation is taken
    as the answer.

    Args:
        model: causal LM exposing ``generate`` and ``device``.
        tok: matching tokenizer (callable, with ``decode``).
        data: sized iterable of examples with "context", "question" and
            "answers" -> "text" (list of gold strings).

    Returns:
        dict with "metrics" (output of ``score``), "p95_s" (95th-percentile
        generation latency in seconds), "vram_GB" (peak reserved CUDA memory in
        GiB, or None without CUDA) and "n" (number of examples).

    Note:
        Only the first gold answer of each example is used as the reference.
        The official SQuAD evaluation takes the best score over all gold
        answers, so numbers reported here can be lower than official ones.
    """
    preds, refs, lat = [], [], []
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.reset_peak_memory_stats()

    for ex in data:
        prompt = PROMPT.format(context=ex["context"], question=ex["question"])
        inp = tok(prompt, return_tensors="pt").to(model.device)
        prompt_len = inp["input_ids"].shape[1]

        # CUDA kernels run asynchronously: synchronize on both sides of the
        # timed region so the measurement covers the full generation.
        if use_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        # Greedy decoding. Sampling-only flags (temperature / top_p) are not
        # passed: with do_sample=False they are reported as invalid and ignored.
        out = model.generate(**inp, max_new_tokens=16, do_sample=False)
        if use_cuda:
            torch.cuda.synchronize()
        lat.append(time.perf_counter() - t0)

        # Decode only the newly generated tokens and keep their first line.
        completion = tok.decode(out[0][prompt_len:], skip_special_tokens=True)
        ans = completion.strip().split("\n")[0].strip()
        preds.append(ans)
        refs.append(ex["answers"]["text"][0])

    res = score(preds, refs)
    p95 = round(float(np.percentile(lat, 95)), 4)
    vram = round(torch.cuda.max_memory_reserved()/(1024**3), 2) if use_cuda else None
    return {"metrics":res, "p95_s":p95, "vram_GB":vram, "n":len(data)}

# Score the local TinyLlama checkpoint on the evaluation subset and show the raw result dict.
tiny_res = run_hf(model=mdl_tiny, tok=tok_tiny, data=eval_data)
print("TinyLlama-SQuAD:", tiny_res)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


TinyLlama-SQuAD: {'metrics': {'EM': np.float64(29.33), 'F1': np.float64(42.33)}, 'p95_s': 0.6163, 'vram_GB': 2.33, 'n': 300}


In [6]:
from groq import Groq
# Groq API client, authenticated with the key stored in the GROQ_API_KEY
# environment variable (indexing os.environ raises KeyError if it is unset).
client = Groq(api_key=os.environ["GROQ_API_KEY"])

def groq_answer(context, question):
    """Ask the Groq-hosted Llama 3.1 8B model for an extractive answer.

    Sends a system instruction plus a user message holding the context and the
    question, using deterministic settings (temperature 0, top_p 1) and a
    16-token completion cap.

    Args:
        context: passage the answer must be copied from.
        question: question about the passage.

    Returns:
        The first line of the model reply with surrounding whitespace removed,
        or "" when the reply carries no text content.
    """
    msg = [
        {"role":"system","content":"You are an extractive QA system. Return the shortest phrase copied exactly from the context; no quotes, no extra words."},
        {"role":"user","content": f"Context: {context}\nQuestion: {question}\nAnswer:"}
    ]
    request = dict(
        model="llama-3.1-8b-instant",
        messages=msg,
        temperature=0.0,
        top_p=1.0,
        stream=False,
    )
    try:
        r = client.chat.completions.create(max_completion_tokens=16, **request)
    except TypeError:
        # A client whose `create` does not know `max_completion_tokens` rejects
        # it as an unexpected keyword; retry with the older `max_tokens` name.
        r = client.chat.completions.create(max_tokens=16, **request)

    # `content` may be None; treat that as an empty answer instead of crashing.
    content = r.choices[0].message.content or ""
    return content.strip().split("\n")[0].strip()

def run_groq(data):
    """Evaluate the Groq-hosted model on extractive-QA examples.

    Each example is answered with ``groq_answer``; the wall-clock time of that
    call is recorded, and the first gold answer is used as the reference.

    Args:
        data: sized iterable of examples with "context", "question" and
            "answers" -> "text" (list of gold strings).

    Returns:
        dict with "metrics" (output of ``score``), "p95_s" and "median_s"
        (request latency in seconds, rounded to 4 decimals) and "n".
    """
    predictions, references, latencies = [], [], []
    for example in data:
        started = time.perf_counter()
        answer = groq_answer(example["context"], example["question"])
        latencies.append(time.perf_counter() - started)
        predictions.append(answer)
        references.append(example["answers"]["text"][0])

    return {
        "metrics": score(predictions, references),
        "p95_s": round(float(np.percentile(latencies, 95)), 4),
        "median_s": round(float(np.median(latencies)), 4),
        "n": len(data),
    }

# Score the Groq-hosted model on the same evaluation subset and show the raw result dict.
groq_res = run_groq(data=eval_data)
print("Groq Llama-3.1-8B-instant:", groq_res)


Groq Llama-3.1-8B-instant: {'metrics': {'EM': np.float64(79.33), 'F1': np.float64(87.2)}, 'p95_s': 3.4352, 'median_s': 2.3615, 'n': 300}


In [7]:
# Side-by-side summary: one header line, then one row per evaluated model.
print("\n=== Summary (SQuAD v1.1 dev, N=", len(eval_data), ") ===", sep="")
for row_label, result in (
    ("TinyLlama-SQuAD  -> EM:", tiny_res),
    ("Llama-3.1-8B(Groq)-> EM:", groq_res),
):
    print(row_label, result["metrics"]["EM"], " F1:", result["metrics"]["F1"], " p95(s):", result["p95_s"])


=== Summary (SQuAD v1.1 dev, N=300) ===
TinyLlama-SQuAD  -> EM: 29.33  F1: 42.33  p95(s): 0.6163
Llama-3.1-8B(Groq)-> EM: 79.33  F1: 87.2  p95(s): 3.4352
