In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import lm_eval
from lm_eval.utils import setup_logging
from lm_eval.models.huggingface import HFLM

In [14]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)
# prompt = "Explain the importance of reinforcement learning in AI."
# inputs = tokenizer(prompt, return_tensors="pt").to(device)
# output = model.generate(
#     **inputs,
#     max_length=100
# )

# # Decode the output tokens into text
# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# print("output:")
# print(generated_text)


# Optional: configure logging

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [4]:
setup_logging("DEBUG")

hf_model = HFLM(pretrained=model, tokenizer=tokenizer)

results = lm_eval.simple_evaluate(
    model=hf_model,
    tasks=["hellaswag"],   # List of task names
    num_fewshot=0,                 # Zero-shot by default
    batch_size=8,
    limit=16                     # Use full datasets
)

2025-04-30:14:06:55,406 DEBUG    [lm_eval.models.huggingface:492] Using model type 'causal'
2025-04-30:14:06:55,429 INFO     [lm_eval.evaluator:169] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-04-30:14:06:55,429 INFO     [lm_eval.evaluator:222] Using pre-initialized model
2025-04-30:14:06:56,221 DEBUG    [lm_eval.tasks:523] File _evalita-mp_ner_wn.yaml in /home/jaeyongjang/.conda/envs/spinquant/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm could not be loaded
2025-04-30:14:06:56,223 DEBUG    [lm_eval.tasks:523] File _evalita-mp_ner_fic.yaml in /home/jaeyongjang/.conda/envs/spinquant/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm could not be loaded
2025-04-30:14:06:56,280 DEBUG    [lm_eval.tasks:523] File _evalita-mp_ner_adg.yaml in /home/jaeyongjang/.conda/envs/spinquant/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm could not be loaded
2025-04-30:14:07:12,447 DEBUG    [lm

In [6]:
print(results['results'])

{'hellaswag': {'alias': 'hellaswag', 'acc,none': 0.4375, 'acc_stderr,none': 0.128086884574495, 'acc_norm,none': 0.5, 'acc_norm_stderr,none': 0.12909944487358055}}


In [15]:
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)

# 1) After training or fine-tuning, you have:
#    - `model` : a PreTrainedModel (e.g. GPT2LMHeadModel, T5ForConditionalGeneration, etc.)
#    - `tokenizer` : its corresponding PreTrainedTokenizer

# Example: load a base model, fine-tune it somehow...
# ... your training loop here ...

# 2) Choose a directory to save to (it can be local or a path for a Hub repo)
save_directory = "./my_llama"

# 3) Save the model + config
model.save_pretrained(save_directory)
#    → writes config.json and pytorch_model.bin

# 4) Save the tokenizer
tokenizer.save_pretrained(save_directory)
#    → writes tokenizer.json (or vocab.txt + merges.txt, depending on tokenizer)

print(f"Model + tokenizer saved to {save_directory}")

Model + tokenizer saved to ./my_llama


In [16]:
import evaluate
from datasets import load_dataset

# 1. Load the metric
perplexity = evaluate.load("perplexity", module_type="metric")

# 2. Load WikiText-2 test set and filter out empty lines
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
texts = [t for t in dataset["text"] if t and not t.isspace()]

# 3. Compute perplexity in one shot, letting the metric handle internal batching
results = perplexity.compute(
    model_id="./my_llama",
    predictions=texts,          # List[str] of input texts
    batch_size=8,               # How many texts per device-forward
    device="cuda",              # or "cpu"
    add_start_token=False       # match your use case; defaults to True
)

print(f"Mean perplexity: {results['mean_perplexity']:.2f}")
# Optionally inspect per‐example scores:
# print(results["perplexities"][:5])

  0%|          | 0/362 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Mean perplexity: 1968.02


In [None]:
len(results["perplexities"])

2891

: 