# Question 5

## Loading Data

In [1]:
with open("data/wiki2.train.txt", "r") as file:
    wiki_train = file.read()

with open("data/examples.txt", "r") as file:
    example_test = file.read()

In [2]:
# first 100 characters
example_test[0:100]

'01. Best known for developing the theory of relativity, Einstein also made important contributions t'

## Tokenization

In [3]:
import spacy
from utils.tokenization import chunked_tokenization

In [4]:
nlp = spacy.load("xx_ent_wiki_sm")

In [5]:
spacy_train = chunked_tokenization(wiki_train, nlp)
spacy_test = chunked_tokenization(example_test, nlp)

In [6]:
from transformers import GPT2TokenizerFast
from utils.tokenization import chunked_tokenization_gpt2

In [7]:
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [8]:
gpt2_train = chunked_tokenization_gpt2(wiki_train, gpt2_tokenizer)
gpt2_test = chunked_tokenization_gpt2(example_test, gpt2_tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (1134371 > 1024). Running this sequence through the model will result in indexing errors


## Testing LaPlace Smoothened Models

In [9]:
from models.ngrams.laplace_ngrams import (
    calculate_laplace_perplexities,
    test_laplace_ngram_model,
)

In [10]:
gpt2_perplexities = calculate_laplace_perplexities(gpt2_train, gpt2_test)
print("GPT-2 Perplexities:")
print(gpt2_perplexities)

GPT-2 Perplexities:
{'1-gram': 2154.4990254150816, '2-gram': 7662.707268043866, '3-gram': 455220.731698504, '7-gram': 2249736.3442193987}


In [11]:
spacy_perplexities = calculate_laplace_perplexities(spacy_train, spacy_test)
print("SpaCy Perplexities:")
print(spacy_perplexities)

SpaCy Perplexities:
{'1-gram': 2154.0385663529423, '2-gram': 5292.625690807777, '3-gram': 378472.2068505432, '7-gram': 2092276.7487780796}


## Testing Fine-Tuned GPT-2 Model

In [12]:
import math

import torch
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [13]:
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2/")
model = GPT2LMHeadModel.from_pretrained("models/gpt2/")

In [14]:
model.eval()
device = torch.device("cuda")
model.to(device)
print(model.device)

cuda:0


In [15]:
max_length = model.config.n_positions - 1
stride = 32
n = 0
total_loss = 0.0

In [16]:
for i in tqdm(range(0, len(example_test), stride)):
    encoded_chunk = tokenizer.encode(example_test[i : i + max_length], return_tensors="pt")

    encoded_chunk = encoded_chunk.to(model.device)

    with torch.no_grad():
        outputs = model(encoded_chunk, labels=encoded_chunk)
        total_loss += outputs.loss.item()
        n += 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63/63 [00:01<00:00, 48.31it/s]


In [17]:
average_loss = total_loss / n

In [18]:
perplexity = torch.exp(torch.tensor(average_loss)).item()
print(f"Perplexity: {perplexity}")

Perplexity: 44.567405700683594
