In [32]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [37]:
import os
import re
import requests
import argparse
import zipfile
from tqdm import tqdm

import tiktoken
import numpy as np

DATA_CACHE_DIR = "/kaggle/working/data"
enc = tiktoken.get_encoding("gpt2")
encode = lambda s: enc.encode(s, allowed_special = {"<|endoftext|>"})

def download_file(url : str, fname : str, chunk_size = 1024):
    """Helper function to download a file from a given url"""
    resp = requests.get(url, stream = True)
    total = int(resp.headers.get("content-length", 0))
    with open(fname, "wb") as file, tqdm(
        desc = fname,
        total = total,
        unit = "iB",
        unit_scale = True,
        unit_divisor = 1024,
    ) as bar:
        for data in resp.iter_content(chunk_size = chunk_size):
            size = file.write(data)
            bar.update(size)
            
def download():
    """Downloads the WikiText-103 dataset to DATA_CACHE_DIR"""
    os.makedirs(DATA_CACHE_DIR, exist_ok = True)

    # download the WikiText-103 dataset, unless it's already downloaded
    data_url = "https://wikitext.smerity.com/wikitext-103-raw-v1.zip"
    data_filename = os.path.join(DATA_CACHE_DIR, "WikiText-103.zip")
    if not os.path.exists(data_filename):
        print(f"Downloading {data_url} to {data_filename}...")
        download_file(data_url, data_filename)
    else:
        print(f"{data_filename} already exists, skipping download...")

    # unzip the file
    data_dir = os.path.join(DATA_CACHE_DIR, "wikitext-103")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir, exist_ok = True)
        print(f"Unzipping {data_filename}...")
        with zipfile.ZipFile(data_filename, "r") as zip_ref:
            zip_ref.extractall(data_dir)
    else:
        print(f"{data_dir} already exists, skipping unzipping...")

def tokenize(preprocess : bool):
    # special token
    eot = enc._special_tokens["<|endoftext|>"]

    # fetch validation text
    val_data_filename = os.path.join(DATA_CACHE_DIR, "wikitext-103/wikitext-103-raw/wiki.valid.raw")
    val_text = open(val_data_filename, "r", encoding = "utf-8").read()

    if preprocess:
        print("Cleaning validation data...")
        # cleanup the training text
        val_text = val_text.strip() # remove leading and trailing whitespace
        val_text = val_text.replace(" \n \n ", "\n<|endoftext|>") # injecting special token in between sections
        val_text = val_text.replace("@-@", "-")
        val_text = val_text.replace("@.@", ".")
        val_text = val_text.replace("@,@", ",")
        val_text = "<|endoftext|>" + val_text # adding special token at start
        val_split = val_text.split("<|endoftext|>") # splitting the text by special token to remove the extraneous headers/titles

        # remove the awkward headers/titles that came from the original parquet format
        for chunk in tqdm([item for item in reversed(range(len(val_split)))], desc = "Removing artifacts", unit = "iB"):
            # if the chunk is of the form of the headers/titles we will pop this chunk out
            if bool(re.match(r"^\s*= +(.{1,}) +=\s*$", val_split[chunk])):
                val_split.pop(chunk)

        # now join the remaining chunks via eot
        val_text = "<|endoftext|>".join(val_split[i] for i in range(len(val_split)))
    print("Tokenizing validation text...")
    val_tokens = encode(val_text)
    print("Validation text tokenized")
    val_tokens_np = np.array(val_tokens, dtype = np.int32)

    print("Dumping text into text files to observe readable output")
    with open(os.path.join(DATA_CACHE_DIR, "wikitext-103-preprocessed_val.txt" if preprocess else "wikitext-103-raw_val.txt"), "w") as f:
        f.write(val_text)

    # now just dump the encoded tokens into binary files
    val_filename = os.path.join(DATA_CACHE_DIR, "wikitext-103-preprocessed_val.bin" if preprocess else "wikitext-103-raw_val.bin")

    with open(val_filename, "wb") as f:
        for chunk in tqdm([val_tokens_np[i : i + 1024] for i in range(0, len(val_tokens_np), 1024)], desc = "Writing validation data to wikitext-103_val.bin", unit = "iB"):
            f.write(chunk.tobytes())
    
    print(f"Saved {len(val_tokens_np)} tokens to {val_filename}")
    

In [38]:
download()
tokenize(True)
tokenize(False)

/kaggle/working/data/WikiText-103.zip already exists, skipping download...
/kaggle/working/data/wikitext-103 already exists, skipping unzipping...
Cleaning validation data...


Removing artifacts: 100%|██████████| 1161/1161 [00:00<00:00, 286766.79iB/s]

Tokenizing validation text...





Validation text tokenized
Dumping text into text files to observe readable output


Writing validation data to wikitext-103_val.bin: 100%|██████████| 231/231 [00:00<00:00, 91300.81iB/s]

Saved 236191 tokens to /kaggle/working/data/wikitext-103-preprocessed_val.bin
Tokenizing validation text...





Validation text tokenized
Dumping text into text files to observe readable output


Writing validation data to wikitext-103_val.bin: 100%|██████████| 245/245 [00:00<00:00, 76835.99iB/s]

Saved 249887 tokens to /kaggle/working/data/wikitext-103-raw_val.bin





In [43]:
# on raw wikitext-103
import os
import numpy as np
import torch
from tqdm import tqdm

from transformers import GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = {
    "small": "openai-community/gpt2",
    "medium": "openai-community/gpt2-medium",
    "large": "openai-community/gpt2-large",
    "xl": "openai-community/gpt2-xl"
}

print("Loading dataset...")
with open("/kaggle/working/data/wikitext-103-raw_val.bin", "rb") as f:
    eval_text = np.frombuffer(f.read(), dtype=np.int32)
    eval_text = torch.tensor(eval_text, dtype = torch.long).unsqueeze(0)
print("Dataset loaded")

for size in models:
    print(f"Loading gpt2-{size}...")
    model = GPT2LMHeadModel.from_pretrained(models[size]).to(device)
    print(f"gpt2-{size} loaded")

    max_length = model.config.n_positions
    stride = max_length
    seq_len = eval_text.shape[1]

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc = f"Evaluating gpt2-{size}"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = eval_text[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels = target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(f"gpt2-{size} perplexity score:", ppl.item())

Loading dataset...
Dataset loaded
Loading gpt2-small...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-small loaded


Evaluating gpt2-small: 100%|█████████▉| 244/245 [00:15<00:00, 16.05it/s]


gpt2-small perplexity score: 30.130029678344727
Loading gpt2-medium...


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-medium loaded


Evaluating gpt2-medium: 100%|█████████▉| 244/245 [00:38<00:00,  6.29it/s]


gpt2-medium perplexity score: 21.772069931030273
Loading gpt2-large...


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-large loaded


Evaluating gpt2-large: 100%|█████████▉| 244/245 [01:21<00:00,  3.01it/s]

gpt2-large perplexity score: 18.740623474121094
Loading gpt2-xl...





config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

gpt2-xl loaded


Evaluating gpt2-xl: 100%|█████████▉| 244/245 [02:34<00:00,  1.58it/s]

gpt2-xl perplexity score: 16.912160873413086





In [44]:
# on my preprocessed wikitext-103
import os
import numpy as np
import torch
from tqdm import tqdm

from transformers import GPT2LMHeadModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models = {
    "small": "openai-community/gpt2",
    "medium": "openai-community/gpt2-medium",
    "large": "openai-community/gpt2-large",
    "xl": "openai-community/gpt2-xl"
}

print("Loading dataset...")
with open("/kaggle/working/data/wikitext-103-preprocessed_val.bin", "rb") as f:
    eval_text = np.frombuffer(f.read(), dtype=np.int32)
    eval_text = torch.tensor(eval_text, dtype = torch.long).unsqueeze(0)
print("Dataset loaded")

for size in models:
    print(f"Loading gpt2-{size}...")
    model = GPT2LMHeadModel.from_pretrained(models[size]).to(device)
    print(f"gpt2-{size} loaded")

    max_length = model.config.n_positions
    stride = max_length
    seq_len = eval_text.shape[1]

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride), desc = f"Evaluating gpt2-{size}"):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = eval_text[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels = target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print(f"gpt2-{size} perplexity score:", ppl.item())

Loading dataset...
Dataset loaded
Loading gpt2-small...
gpt2-small loaded


Evaluating gpt2-small: 100%|█████████▉| 230/231 [00:13<00:00, 16.80it/s]


gpt2-small perplexity score: 33.19379425048828
Loading gpt2-medium...
gpt2-medium loaded


Evaluating gpt2-medium: 100%|█████████▉| 230/231 [00:36<00:00,  6.29it/s]


gpt2-medium perplexity score: 24.309528350830078
Loading gpt2-large...
gpt2-large loaded


Evaluating gpt2-large: 100%|█████████▉| 230/231 [01:16<00:00,  3.01it/s]


gpt2-large perplexity score: 21.38955307006836
Loading gpt2-xl...
gpt2-xl loaded


Evaluating gpt2-xl: 100%|█████████▉| 230/231 [02:26<00:00,  1.57it/s]


gpt2-xl perplexity score: 19.324647903442383
