In [1]:
import transformers
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
print(transformers.__version__)

4.53.1


In [2]:
generator = pipeline('text-generation', model='gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

Device set to use mps:0


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [4]:
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

In [5]:
def tokenize(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized_dataset = dataset.map(tokenize, batched=True)

In [None]:
training_args = TrainingArguments(output_dir="./results", learning_rate=2e-5, per_device_train_batch_size=4, num_train_epochs=3, use_cpu=True)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset["train"])
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,4.0551
1000,1.3531
1500,1.38
2000,1.2816
2500,1.2746
3000,1.2546
3500,1.2863
4000,1.313
4500,1.2699
5000,1.2739


In [None]:
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def compute_perplexity(model, tokenizer, texts):
    model.eval()
    losses = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        losses.append(loss.item())
    return np.exp(np.mean(losses))  # Exponentiate average loss


In [None]:
def top_k_accuracy(model, tokenizer, text, k=5):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    _, top_k = torch.topk(shift_logits, k, dim=-1)  # shape: [batch, seq_len-1, k]

    match = (top_k == shift_labels.unsqueeze(-1)).any(-1).float()  # [batch, seq_len-1]
    acc = match.mean().item()
    return acc


In [None]:
texts = ["The quick brown fox", "Once upon a time"]
ppl = compute_perplexity(model, tokenizer, texts)
print("Perplexity:", ppl)


In [None]:
text = "The quick brown fox jumps over the lazy"
acc = top_k_accuracy(model, tokenizer, text, k=5)
print("Top-5 Accuracy:", acc)


In [None]:
import evaluate
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

# Load perplexity metric
perplexity_metric = evaluate.load("perplexity")

# Sample inputs
texts = ["The quick brown fox jumps over the lazy dog.", 
         "Once upon a time in a land far away..."]

# Tokenize
encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Evaluate perplexity
results = perplexity_metric.compute(model=model, input_texts=texts)
print("Perplexity:", results["perplexities"])


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torchmetrics.classification import MulticlassAccuracy

# Load model/tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

# Input text
text = "The quick brown fox jumps over the lazy dog"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Get logits
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Prepare logits and labels
shift_logits = logits[:, :-1, :]           # [batch, seq_len-1, vocab_size]
shift_labels = input_ids[:, 1:]            # [batch, seq_len-1]

# Flatten
logits_flat = shift_logits.reshape(-1, shift_logits.size(-1))  # [tokens, vocab]
labels_flat = shift_labels.reshape(-1)                         # [tokens]

# Top-k accuracy (e.g., top-5)
topk_metric = MulticlassAccuracy(top_k=5, num_classes=logits_flat.size(-1), average="micro")
acc = topk_metric(logits_flat, labels_flat)
print(f"Top-5 Accuracy: {acc.item():.4f}")
