In [1]:
import tiktoken
import sentencepiece as spm
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

In [2]:
import torch

### Detect if GPU is available

In [3]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU when no accelerator is reported as available.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device} device.")

Using mps device.


### Set up model configuration 

In [4]:
# Hyper-parameters handed to GPTModel and to the training helpers.
# NOTE(review): 50257 is the GPT-2 BPE vocabulary size, whereas the SentencePiece
# tokenizer trained further down uses vocab_size=10000 -- confirm which tokenizer
# data_loader_v1 actually applies.
# NOTE(review): the name says "124M", but 8 layers / 8 heads is presumably a
# smaller model than the usual 12/12 setup -- confirm the intended size.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # number of token ids the embedding/output layers cover
    context_length=256,    # maximum sequence length; also used as loader window and stride
    emb_dim=768,           # width of the token embeddings
    n_heads=8,             # attention heads per layer
    n_layers=8,            # number of stacked layers
    drop_rate=0.2,         # dropout probability
    qkv_bias=True,         # add a bias term to the query/key/value projections
    device=device,         # torch.device selected in the detection cell
)

### Train a custom tokenizer

In [5]:
# Train a BPE SentencePiece model on the full corpus. The trainer writes its
# artefacts using the `gpt_custom_tokenizer` prefix (the `.model` file is loaded
# in the next cell).
spm.SentencePieceTrainer.train(
    # corpus and output location
    input="all_books.txt",
    model_prefix="gpt_custom_tokenizer",
    # model shape
    model_type="bpe",
    vocab_size=10000,
    character_coverage=1.0,
    # -1 presumably disables the dedicated BOS/EOS pieces -- confirm against the
    # SentencePiece trainer options.
    bos_id=-1,
    eos_id=-1,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: all_books.txt
  input_format: 
  model_prefix: gpt_custom_tokenizer
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: -1
  eos_id: -1
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0


bpe_model_trainer.cc(268) LOG(INFO) Added: freq=96 size=2220 all=19994 active=1068 piece=itten
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=95 size=2240 all=20055 active=1129 piece=anity
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=94 size=2260 all=20098 active=1172 piece=▁Bart
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=93 size=2280 all=20153 active=1227 piece=▁disag
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=91 size=2300 all=20166 active=1240 piece=▁perfor
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=91 min_freq=51
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=89 size=2320 all=20208 active=1046 piece=▁suf
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=88 size=2340 all=20238 active=1076 piece=▁pause
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=86 size=2360 all=20301 active=1139 piece=Wh
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=86 size=2380 all=20328 active=1166 piece=▁expectation
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=84 si

In [6]:
# Load the freshly trained SentencePiece model and run a quick
# encode -> decode round trip on a sample sentence.
sp = spm.SentencePieceProcessor()
sp.load("gpt_custom_tokenizer.model")

text = "It is a truth universally acknowledged..."
token_ids = sp.encode(text, out_type=int)   # list of integer piece ids
decoded_text = sp.decode(token_ids)         # text rebuilt from those ids

# Show the ids first, then the reconstructed text, one per line.
print(token_ids, decoded_text, sep="\n")

[334, 109, 3, 1651, 7095, 2489, 4179, 9942]
It is a truth universally acknowledged...


### Load training and validation data files

In [5]:
train_file_path = 'train_text_data_all_txt.txt'
val_file_path = 'val_text_data_all_txt.txt'


def _read_utf8(path):
    """Return the complete contents of `path`, decoded as UTF-8 text."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


# The train/validation split already exists on disk; each file is read whole.
train_data = _read_utf8(train_file_path)
val_data = _read_utf8(val_file_path)

### Initialize data loaders for training
The data loader implementation can be found in `./data_loader_v1.py`.

This implementation follows the implementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [6]:
# NOTE(review): train_ratio is not referenced by either loader below (the split
# is already materialised as two files) -- confirm whether it is still needed.
train_ratio = 0.90

# Settings common to both loaders: windows of one full context length that do
# not overlap (stride == max_length), batches of 16, loading in the main process.
_shared_loader_args = dict(
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    num_workers=0,
)

# Training: shuffled, and the trailing incomplete batch is dropped.
train_loader = create_dataloader_v1(
    train_data, shuffle=True, drop_last=True, **_shared_loader_args
)

# Validation: fixed order, every batch kept.
val_loader = create_dataloader_v1(
    val_data, shuffle=False, drop_last=False, **_shared_loader_args
)

In [7]:
import gc

def clean():
    """
    Best-effort release of Apple-MPS memory before and after training.

    Steps:
      1. Set ``PYTORCH_MPS_HIGH_WATERMARK_RATIO`` to ``"0.0"``.
      2. Rebind every notebook-level tensor that lives on an MPS device to a
         CPU copy, so the MPS storage is no longer referenced by that name.
      3. Run the garbage collector and, when MPS is available, empty the MPS
         cache and print the amount of memory still allocated.

    The function is safe to call on machines without MPS: the MPS-specific
    calls are skipped and only the availability flag is printed.

    Returns:
        None
    """
    # NOTE(review): presumably this variable is only honoured if it is set before
    # the MPS allocator is initialised -- confirm that setting it here has an effect.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

    mps_available = torch.backends.mps.is_available()

    # Move global tensors off the accelerator. `Tensor.to` returns a new tensor,
    # so the global name has to be rebound for the move to have any effect.
    # The check uses `device.type` so tensors on an indexed device ("mps:0")
    # are matched as well.
    namespace = globals()
    for name, value in list(namespace.items()):
        if isinstance(value, torch.Tensor) and value.device.type == "mps":
            namespace[name] = value.to("cpu")

    # Drop the loop's own references so they do not keep an object alive.
    name = value = None

    gc.collect()  # Collect anything that became unreachable above

    print("MPS Available:", mps_available)
    if mps_available:
        torch.mps.empty_cache()  # Hand cached MPS blocks back
        print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [8]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, lr=3e-4,
          sample_text="It is a truth universally acknowledged, that a single man in possession of a good fortune, must be",
          generate_sample_text=False,
          checkpoint_path="model_and_optimizer.pth"):
    """
    Build a fresh GPTModel from GPT_CONFIG_124M and train it with AdamW.

    Args:
        train_loader: loader passed to `train_model_simple` for training batches.
        val_loader: loader passed to `train_model_simple` for validation.
        num_epochs: forwarded to `train_model_simple`.
        eval_iter: forwarded to `train_model_simple`.
        lr: AdamW learning rate.
        sample_text: forwarded to `train_model_simple` as `start_context`.
        generate_sample_text: forwarded to `train_model_simple`.
        checkpoint_path: forwarded to `train_model_simple`.

    Returns:
        The GPTModel instance that was trained.

    Side effects:
        The module-level lists `train_losses`, `val_losses` and
        `track_tokens_seen` are handed to `train_model_simple` (presumably
        appended to in place -- confirm in pre_train.py). Memory clean-up and
        banners are printed when running on MPS or CUDA.
    """
    # Compare on the device *type*. A torch.device does not compare equal to a
    # plain string, so `device == "mps"` never selects the accelerator branch.
    device_type = torch.device(device).type
    on_accelerator = device_type in ("mps", "cuda")

    def free_accelerator_memory():
        # Release cached accelerator memory and report what is still in use.
        if device_type == "mps":
            clean()
        elif device_type == "cuda":
            torch.cuda.empty_cache()
            # memory_summary() only returns a string; print it to make it visible.
            print(torch.cuda.memory_summary())

    if on_accelerator:
        free_accelerator_memory()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    # Fixed seed so weight initialisation is repeatable across runs.
    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-8, weight_decay=0.05)

    # The loss/token lists are module-level objects passed by reference.
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text,
        generate_sample_text=generate_sample_text,
        cfg=GPT_CONFIG_124M, checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    if on_accelerator:
        print(50 * "=")
        free_accelerator_memory()

    return model

In [9]:
gc.collect()  # Force a full collection; the cell displays gc.collect()'s return value (count of unreachable objects found)

0

### Train the model on training data

In [None]:
# Train on the complete corpus for 20 epochs, evaluating every 5 steps and
# printing a generated sample. The trailing ";" hides the returned model's repr.
train(
    train_loader,
    val_loader,
    num_epochs=20,
    eval_iter=5,
    generate_sample_text=True,
    checkpoint_path="model_and_optimizer_all_txt_updated_1.pth",
);

Ep 1 (Step 000000): Train loss 10.130, Val loss 10.107
Ep 1 (Step 000005): Train loss 8.335, Val loss 8.324
Ep 1 (Step 000010): Train loss 7.276, Val loss 7.242
Ep 1 (Step 000015): Train loss 6.668, Val loss 6.655
Ep 1 (Step 000020): Train loss 6.490, Val loss 6.489
Ep 1 (Step 000025): Train loss 6.422, Val loss 6.435
Ep 1 (Step 000030): Train loss 6.437, Val loss 6.417


### Load trained model

In [None]:
# Read the saved training state onto the CPU (tensors only: weights_only=True).
# NOTE(review): this file name differs from the checkpoint_path used by the
# training cell ("..._updated_1.pth") -- confirm the intended checkpoint.
checkpoint = torch.load(
    "model_and_optimizer_all_txt_updated.pth",
    map_location=torch.device('cpu'),
    weights_only=True,
)

# Rebuild the network on the CPU and restore its weights.
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
model.load_state_dict(checkpoint["model_state_dict"])

# Recreate the optimizer and restore its saved state as well.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Switch to inference mode; ";" suppresses the module repr.
model.eval();

## Evaluate model

In [None]:
from torch.utils.data import DataLoader
from itertools import combinations
import numpy as np

#### 1. Perplexity Calculation

In [None]:
def compute_perplexity(model, dataloader, device='cpu'):
    """
    Compute token-level perplexity of `model` over every batch in `dataloader`.

    Perplexity is exp(mean cross-entropy), where the mean is taken over all
    target tokens seen across the whole loader.

    Args:
        model: callable module mapping a batch of input ids to logits whose
            last dimension is the vocabulary; it is put into eval mode.
            The model is NOT moved -- it must already live on `device`.
        dataloader: iterable yielding `(input_ids, target_ids)` tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: if the loader yields no target tokens (previously this
            surfaced as an unexplained ZeroDivisionError).
    """
    model.eval()

    # Summing per-token losses avoids averaging per batch and re-scaling.
    criterion = torch.nn.CrossEntropyLoss(reduction="sum")
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_ids, target_ids in dataloader:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            # reshape (not view) so non-contiguous logits are handled too.
            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_targets = target_ids.reshape(-1)

            total_loss += criterion(flat_logits, flat_targets).item()
            total_tokens += target_ids.numel()

    if total_tokens == 0:
        raise ValueError("compute_perplexity received no tokens: the dataloader is empty")

    return np.exp(total_loss / total_tokens)

In [None]:
# Perplexity of the loaded model over the validation loader, evaluated on the CPU.
compute_perplexity(model, val_loader, device="cpu")

#### 2. Word Embedding Association Test (WEAT)

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors (dot product over the product of their norms)."""
    magnitude_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return np.dot(vec1, vec2) / magnitude_product


def weat_score(model, target_words_1, target_words_2, attribute_words_1, attribute_words_2, tokenizer, device='cpu', pooling='first'):
    """
    Measures bias by comparing how close different groups of words are in embedding space.

    Each word is embedded with `model.tok_emb`. For every target word, its
    association is the mean cosine similarity to `attribute_words_1` minus the
    mean cosine similarity to `attribute_words_2`. The score is the summed
    association of `target_words_1` minus that of `target_words_2`.

    Args:
        model: object exposing a `tok_emb` embedding layer.
        target_words_1, target_words_2: the two target word groups.
        attribute_words_1, attribute_words_2: the two attribute word groups.
        tokenizer: object whose `encode(word, allowed_special=...)` returns a
            sequence of token ids.
        device: device on which the id tensor is created; `model.tok_emb`
            must already live there.
        pooling: how a word that encodes to several tokens is represented.
            'first' (default, the original behaviour) keeps only the first
            token's embedding; 'mean' averages the embeddings of all tokens.

    Returns:
        The WEAT statistic as a NumPy float.

    Raises:
        ValueError: if `pooling` is unknown or a word encodes to no tokens.
    """
    if pooling not in ('first', 'mean'):
        raise ValueError(f"pooling must be 'first' or 'mean', got {pooling!r}")

    def get_embedding(word):
        token_ids = tokenizer.encode(word, allowed_special={'<|endoftext|>'})
        if len(token_ids) == 0:
            raise ValueError(f"word {word!r} produced no tokens")
        if pooling == 'first':
            # Original behaviour: sub-tokens after the first one are ignored.
            token_ids = token_ids[:1]
        with torch.no_grad():
            vectors = model.tok_emb(torch.tensor(list(token_ids), device=device))
        pooled = vectors[0] if pooling == 'first' else vectors.mean(dim=0)
        return pooled.cpu().numpy().flatten()

    # Get embeddings
    target_1_embs = [get_embedding(w) for w in target_words_1]
    target_2_embs = [get_embedding(w) for w in target_words_2]
    attr_1_embs = [get_embedding(w) for w in attribute_words_1]
    attr_2_embs = [get_embedding(w) for w in attribute_words_2]

    def association(t, A, B):
        # Mean similarity to attribute set A minus mean similarity to set B.
        return np.mean([cosine_similarity(t, a) for a in A]) - np.mean([cosine_similarity(t, b) for b in B])

    # Compute WEAT score
    s1 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_1_embs])
    s2 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_2_embs])

    # Named `score` so the local does not shadow this function's own name.
    score = s1 - s2
    return score

In [None]:
# `tokenizer` was never defined in this notebook, so this cell only worked with
# leftover kernel state. The 50257-entry vocabulary and the `allowed_special`
# argument match tiktoken's GPT-2 encoding.
# NOTE(review): confirm this is the same tokenizer data_loader_v1 / generate use.
tokenizer = tiktoken.get_encoding("gpt2")

# Word groups for the gender-association test.
target_male = ["gentleman", "officer", "clergyman", "husband", "captain"]
target_female = ["lady", "governess", "girl", "wife", "widow"]

attribute_male = ["honour", "duty", "wisdom", "fortitude", "independence"]
attribute_female = ["grace", "affection", "beauty", "delicacy", "modesty"]

weat_score(model, target_male, target_female, attribute_male, attribute_female, tokenizer, device="cpu")

#### 3. BLEU & ROUGE Score Calculation

In [None]:
import sys
# Install a trace function that does nothing before importing `evaluate`.
# NOTE(review): presumably a workaround for a debugger/tracing problem triggered
# by that import -- confirm it is still required; it stays installed afterwards.
sys.settrace(lambda *args, **keys: None)
import evaluate

In [None]:
# Load the two Hugging Face `evaluate` metrics that compute_bleu_rouge relies on.
bleu_metric, rouge_metric = (
    evaluate.load(metric_name) for metric_name in ("bleu", "rouge")
)

In [None]:
def compute_bleu_rouge(references, predictions):
    """
    Compute BLEU and ROUGE scores between reference texts and model-generated texts.

    The `evaluate` BLEU and ROUGE metrics tokenize internally and expect raw
    strings, so the texts are passed through unsplit (pre-splitting them into
    word lists does not match the metrics' declared input format).

    Args:
        references: list of reference strings, one per prediction.
        predictions: list of generated strings.

    Returns:
        Tuple `(bleu, rougeL)` taken from the metrics' "bleu" and "rougeL" entries.
    """
    predictions = list(predictions)
    references = list(references)

    # BLEU takes a list of references per prediction; wrap each single reference.
    bleu_references = [[ref] for ref in references]

    bleu_score = bleu_metric.compute(predictions=predictions, references=bleu_references)['bleu']
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    return bleu_score, rouge_score["rougeL"]

In [None]:
# Prompt / expected-completion pairs; "___" marks where the model should continue.
test_sentences = [
    ("A single man in possession of a good fortune, must be ___", "in want of a wife"),
    ("The servants, I suppose, forgot to tell you that Mr. Palmer was ___", "not in the house."),
    ("He rose up, and ___", "walked across the room."),
    ("I should be undeserving of the confidence ___", "you have honoured me with"),
    ("A few months had seen the beginning and ___", "the end of their acquaintance")
]

# Generation samples with temperature/top-k, so fix the seed to make the scores
# repeatable (presumably `generate` draws from torch's global RNG -- confirm).
torch.manual_seed(123)

references = []
predictions = []

for sentence, expected in test_sentences:
    # `generate` takes the prompt as text, so no manual token encoding is needed.
    prompt = sentence.replace(" ___", "")

    generated_text = generate(
        model=model, prompt=prompt,
        max_new_tokens=5, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.5,
        top_k=40,
    )

    # Reference is the full sentence with the blank filled in.
    # NOTE(review): this assumes `generate` returns prompt + continuation; if it
    # returns only the continuation, compare against `expected` instead.
    references.append(sentence.replace("___", expected))
    predictions.append(generated_text)

print("references:", references)
print("predictions:", predictions)
print(50*"=")

# Compute BLEU and ROUGE scores
bleu, rouge = compute_bleu_rouge(references, predictions)
print(f"BLEU Score: {bleu:.4f}, ROUGE-L Score: {rouge:.4f}")

## Example text generation

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Two prompts that differ only in the character's gender, sampled with the same
# settings; a separator line is printed between the two outputs.
for position, prompt in enumerate([
    "Miss Bennet has inherited the estate from her aunt, so she must",
    "Mr. Darcy has inherited the estate from his aunt, so he must",
]):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.7,
        top_k=50,
    )

    # Print the generated text line by line.
    for txt in text.split("\n"):
        print(txt)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Same sampling settings for the "wife" and "husband" prompts; a separator line
# is printed between the two outputs.
for position, prompt in enumerate(["A wife is", "A husband is"]):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.5,
        top_k=40,
    )

    # Print the generated text line by line.
    for txt in text.split("\n"):
        print(txt)

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")

# Same sampling settings for the "lady" and "man" prompts; a separator line is
# printed between the two outputs.
for position, prompt in enumerate([
    "A good lady ought to be",
    "A highly respectable man ought to be",
]):
    if position > 0:
        print(50*"=")

    text = generate(
        model=model, prompt=prompt,
        max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
        device="cpu",
        temperature=0.7,
        top_k=30,
    )

    # Print the generated text line by line.
    for txt in text.split("\n"):
        print(txt)

In [None]:
# Compare on the device *type*: a torch.device does not compare equal to the
# plain string "mps", so `device == "mps"` would skip the clean-up.
if torch.device(device).type == "mps":
    clean()