In [2]:
import os
import numpy as np
import torch
from tqdm import tqdm

from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = {
    "small": "openai-community/gpt2",
    "medium": "openai-community/gpt2-medium",
    "large": "openai-community/gpt2-large",
    "xl": "openai-community/gpt2-xl"
}

print("Loading dataset...")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
test = load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")
encodings = tokenizer("".join(test["text"]), return_tensors="pt")
print("Dataset loaded")

for size in models:
    print(f"Loading gpt2-{size}...")
    model = GPT2LMHeadModel.from_pretrained(models[size]).to(device)
    print(f"gpt2-{size} loaded")

    max_length = model.config.n_positions
    stride = max_length
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc = f"Evaluating gpt2-{size}"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels = target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(f"gpt2-{size} perplexity score:", ppl.item())

Loading dataset...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 733k/733k [00:00<00:00, 2.63MB/s]
Downloading data: 100%|██████████| 157M/157M [00:03<00:00, 44.6MB/s] 
Downloading data: 100%|██████████| 157M/157M [00:03<00:00, 44.0MB/s] 
Downloading data: 100%|██████████| 657k/657k [00:00<00:00, 3.80MB/s]


Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (247289 > 1024). Running this sequence through the model will result in indexing errors


Dataset loaded
Loading gpt2-small...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-small loaded


Evaluating gpt2-small: 100%|█████████▉| 241/242 [00:14<00:00, 16.09it/s]


gpt2-small perplexity score: 30.58563804626465
Loading gpt2-medium...


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-medium loaded


Evaluating gpt2-medium: 100%|█████████▉| 241/242 [00:38<00:00,  6.28it/s]


gpt2-medium perplexity score: 22.348369598388672
Loading gpt2-large...


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-large loaded


Evaluating gpt2-large: 100%|█████████▉| 241/242 [01:20<00:00,  3.01it/s]


gpt2-large perplexity score: 19.330533981323242
Loading gpt2-xl...


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-xl loaded


Evaluating gpt2-xl: 100%|█████████▉| 241/242 [02:33<00:00,  1.57it/s]


gpt2-xl perplexity score: 17.458951950073242


In [3]:
# the difference between the previous cell and this cell is in the joining of the test text by "\n\n"
import os
import numpy as np
import torch
from tqdm import tqdm

from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = {
    "small": "openai-community/gpt2",
    "medium": "openai-community/gpt2-medium",
    "large": "openai-community/gpt2-large",
    "xl": "openai-community/gpt2-xl"
}

print("Loading dataset...")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
test = load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
print("Dataset loaded")

for size in models:
    print(f"Loading gpt2-{size}...")
    model = GPT2LMHeadModel.from_pretrained(models[size]).to(device)
    print(f"gpt2-{size} loaded")

    max_length = model.config.n_positions
    stride = max_length
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc = f"Evaluating gpt2-{size}"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels = target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(f"gpt2-{size} perplexity score:", ppl.item())

Loading dataset...


Token indices sequence length is longer than the specified maximum sequence length for this model (251048 > 1024). Running this sequence through the model will result in indexing errors


Dataset loaded
Loading gpt2-small...
gpt2-small loaded


Evaluating gpt2-small: 100%|█████████▉| 245/246 [00:14<00:00, 16.78it/s]


gpt2-small perplexity score: 31.042251586914062
Loading gpt2-medium...
gpt2-medium loaded


Evaluating gpt2-medium: 100%|█████████▉| 245/246 [00:39<00:00,  6.27it/s]


gpt2-medium perplexity score: 22.5109920501709
Loading gpt2-large...
gpt2-large loaded


Evaluating gpt2-large: 100%|█████████▉| 245/246 [01:21<00:00,  3.01it/s]


gpt2-large perplexity score: 20.088329315185547
Loading gpt2-xl...
gpt2-xl loaded


Evaluating gpt2-xl: 100%|█████████▉| 245/246 [02:35<00:00,  1.57it/s]

gpt2-xl perplexity score: 17.91388702392578



