In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import lm_eval
from lm_eval.utils import setup_logging
from lm_eval.models.huggingface import HFLM

In [None]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot accidentally be loaded from two different repositories.
MODEL_ID = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# eval() puts the module in inference mode; nn.Module.to() moves the
# parameters in place, so `model` itself now lives on `device`.
model.eval().to(device)

# Optional generation smoke test. Off by default so the cell stays cheap on a
# Restart & Run All; flip the flag to sanity-check the checkpoint by hand.
RUN_GENERATION_SMOKE_TEST = False
if RUN_GENERATION_SMOKE_TEST:
    prompt = "Explain the importance of reinforcement learning in AI."
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # max_length counts the prompt tokens as well as the generated ones.
    output = model.generate(
        **inputs,
        max_length=100
    )

    # Decode the output tokens into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print("output:")
    print(generated_text)

In [None]:
# Optional: configure logging verbosity for the evaluation harness.
setup_logging("DEBUG")

# simple_evaluate() only forwards its own `batch_size` when it constructs the
# model from a string name. For an already-built LM object it is not applied,
# so the batch size has to be set on the HFLM wrapper itself.
EVAL_BATCH_SIZE = 8
hf_model = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=EVAL_BATCH_SIZE)

results = lm_eval.simple_evaluate(
    model=hf_model,
    tasks=["hellaswag"],         # List of task names
    num_fewshot=0,               # Zero-shot
    batch_size=EVAL_BATCH_SIZE,  # Same value as the wrapper, passed for consistency
    limit=16,                    # Smoke test: 16 examples per task; use None for the full dataset
)

In [None]:
# Show the entry stored under the 'results' key of the dict returned by
# lm_eval.simple_evaluate (presumably the per-task metrics; confirm against
# the harness docs).
print(results['results'])

In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Persist the in-memory `model` and `tokenizer` (the objects created earlier in
# this notebook, or their fine-tuned versions) so they can be reloaded by path
# with `from_pretrained`.
#
# The target can be a local directory or a path for a Hub repo.
save_directory = "./my_llama"

# Both objects expose the same `save_pretrained(directory)` API:
#   - the model writes its config.json plus the weights file (model.safetensors
#     or pytorch_model.bin, depending on the transformers version / settings)
#   - the tokenizer writes tokenizer.json (or vocab.txt + merges.txt, depending
#     on the tokenizer)
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model + tokenizer saved to {save_directory}")

In [None]:
import evaluate
import torch
from datasets import load_dataset

# 1. Load the metric
perplexity = evaluate.load("perplexity", module_type="metric")

# 2. Load WikiText-2 test set and filter out empty / whitespace-only lines
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
texts = [t for t in dataset["text"] if t and not t.isspace()]

# 3. Pick the device with the same CUDA -> CPU fallback used when the model was
#    loaded, instead of hard-coding "cuda" (which breaks on CPU-only machines).
ppl_device = "cuda" if torch.cuda.is_available() else "cpu"

# 4. Compute perplexity in one shot, letting the metric handle internal batching.
#    NOTE: this rebinds `results`, replacing the lm-eval output assigned earlier
#    in the notebook. `model_id` must point at the directory the save cell
#    wrote to ("./my_llama").
results = perplexity.compute(
    model_id="./my_llama",
    predictions=texts,          # List[str] of input texts
    batch_size=8,               # How many texts per device-forward
    device=ppl_device,          # "cuda" when available, else "cpu"
    add_start_token=False       # match your use case; defaults to True
)

print(f"Mean perplexity: {results['mean_perplexity']:.2f}")
# Optionally inspect per‐example scores:
# print(results["perplexities"][:5])

In [None]:
# Number of entries under the "perplexities" key of the metric output
# (presumably one score per input text; should equal len(texts), confirm).
len(results["perplexities"])