In [30]:
import transformers
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import torch
import numpy as np

In [2]:
generator = pipeline('text-generation', model='gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

Device set to use mps:0


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [4]:
dataset = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

In [5]:
def tokenize(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized_dataset = dataset.map(tokenize, batched=True)

In [6]:
training_args = TrainingArguments(output_dir="./results", learning_rate=2e-5, per_device_train_batch_size=4, num_train_epochs=3, use_cpu=True)
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset["train"])
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,4.0551
1000,1.3531
1500,1.38
2000,1.2816
2500,1.2746
3000,1.2546
3500,1.2863
4000,1.313
4500,1.2699
5000,1.2739


TrainOutput(global_step=27540, training_loss=1.2525294587156786, metrics={'train_runtime': 19345.6122, 'train_samples_per_second': 5.694, 'train_steps_per_second': 1.424, 'total_flos': 7195590623232000.0, 'train_loss': 1.2525294587156786, 'epoch': 3.0})

In [38]:
def compute_perplexity(model, tokenizer, texts):
    model.eval()
    losses = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        losses.append(loss.item())
    return np.exp(np.mean(losses))

In [31]:
def top_k_accuracy(model, tokenizer, text, k=5):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    _, top_k = torch.topk(shift_logits, k, dim=-1)

    match = (top_k == shift_labels.unsqueeze(-1)).any(-1).float()
    acc = match.mean().item()
    return acc

In [33]:
text = "The quick brown fox jumps over the lazy dog"
acc = top_k_accuracy(model, tokenizer, text, k=5)
print("Top-5 Accuracy:", acc)

Top-5 Accuracy: 0.375


In [39]:
texts = ["The quick brown fox", "Once upon a time"]
ppl = compute_perplexity(model, tokenizer, texts)
print("Perplexity:", ppl)

Perplexity: 319.62936322046545
