In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using cpu device.


### Set up model configuration 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
#   "vocab_size": 14000,    # Vocabulary size (custom tokenizer)
    "context_length": 256,  # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.2,       # Dropout rate
    "qkv_bias": True,      # Query-Key-Value bias
    "device": device,
}

### Initialize the tokenizer

#### GPT-2 tokenizer

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")

#### Custom tokenizer

In [5]:
import sentencepiece as spm

In [6]:
spm.SentencePieceTrainer.train(
    input='all_books.txt',
    model_prefix='gpt_custom_tokenizer',
    vocab_size=GPT_CONFIG_124M["vocab_size"],
    model_type='bpe',
    character_coverage=0.9995,
    hard_vocab_limit=False,
    bos_id=-1,
    eos_id=-1,
    user_defined_symbols=["<|endoftext|>"]
);

In [5]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.load('gpt_custom_tokenizer.model')

True

In [5]:
tokenizer_used_in_this_trial="GPT2"
# tokenizer_used_in_this_trial="CUSTOM"

def encode(full_text):
    if tokenizer_used_in_this_trial == "GPT2":
        return tokenizer.encode(full_text, allowed_special={'<|endoftext|>'})
    else:
        return tokenizer.encode(full_text, out_type=int)

### Load training and validation data files

In [6]:
train_file_path = 'train_text_data_all_txt.txt'
val_file_path = 'val_text_data_all_txt.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the omplementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [7]:
train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    encode=encode,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    encode=encode,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [8]:
full_text = train_data + val_data

word_count = len(full_text.split())
char_count = len(full_text)

# tiktoken tokenizer ->
tokens = tokenizer.encode(full_text, allowed_special={'<|endoftext|>'})

# Custom tokenizer ->
# tokens = tokenizer.encode(full_text, out_type=int)

token_count = len(tokens)
unique_token_count = len(set(tokens))

print("Words:", word_count)
print("Characters:", char_count)
print("Tokens:", token_count)
print("Unique Tokens Used:", unique_token_count)

Words: 740186
Characters: 4132331
Tokens: 935517
Unique Tokens Used: 13902


In [9]:
import gc

def clean(): 
    """
    This is a function for GPU data claening before and after training
    """
    
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [10]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, lr=0.0002,
          generate_sample_text=False,
          sample_text="It is a truth universally acknowledged, that a single man in possession of a good fortune, must be",
          model_prefix="model_and_optimizer"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    if device == "mps":
        clean()
        print(50 * "=")
        print("Starting training...")
    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-08, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        generate_sample_text=generate_sample_text,
        model_prefix=model_prefix,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen,
        tokenizer=tokenizer
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    if device == "mps":
        print(50 * "=")
        clean()
    if device == "cuda":
        print(50 * "=")
        torch.cuda.empty_cache()
        torch.cuda.memory_summary()
    
    return model

In [11]:
gc.collect()  # Force garbage collection

20

### Train the model on training data

In [13]:
# train model on all works

train(train_loader, val_loader, num_epochs=6,
      eval_iter=10, model_prefix="model_768_12_12");

Ep 1 (Step 000000): Train loss 8.911, Val loss 8.861
Ep 1 (Step 000010): Train loss 7.138, Val loss 7.140
Ep 1 (Step 000020): Train loss 6.511, Val loss 6.545
Ep 1 (Step 000030): Train loss 6.403, Val loss 6.398
Ep 1 (Step 000040): Train loss 6.367, Val loss 6.271
Ep 1 (Step 000050): Train loss 6.112, Val loss 6.128
Ep 1 (Step 000060): Train loss 6.114, Val loss 5.995
Ep 1 (Step 000070): Train loss 5.921, Val loss 5.898
Ep 1 (Step 000080): Train loss 5.786, Val loss 5.821
Ep 1 (Step 000090): Train loss 5.758, Val loss 5.751
Ep 1 (Step 000100): Train loss 5.703, Val loss 5.692
Ep 1 (Step 000110): Train loss 5.625, Val loss 5.647
Ep 1 (Step 000120): Train loss 5.643, Val loss 5.607
Ep 1 (Step 000130): Train loss 5.553, Val loss 5.555
Ep 1 (Step 000140): Train loss 5.505, Val loss 5.530
Ep 1 (Step 000150): Train loss 5.494, Val loss 5.508
Ep 1 (Step 000160): Train loss 5.471, Val loss 5.494
Ep 1 (Step 000170): Train loss 5.360, Val loss 5.459
Ep 1 (Step 000180): Train loss 5.465, Val loss

Ep 2 (Step 001550): Train loss 4.604, Val loss 4.767
Ep 2 (Step 001560): Train loss 4.653, Val loss 4.783
Ep 3 (Step 001570): Train loss 4.536, Val loss 4.772
Ep 3 (Step 001580): Train loss 4.589, Val loss 4.784
Ep 3 (Step 001590): Train loss 4.592, Val loss 4.776
Ep 3 (Step 001600): Train loss 4.612, Val loss 4.761
Ep 3 (Step 001610): Train loss 4.515, Val loss 4.773
Ep 3 (Step 001620): Train loss 4.599, Val loss 4.757
Ep 3 (Step 001630): Train loss 4.552, Val loss 4.757
Ep 3 (Step 001640): Train loss 4.616, Val loss 4.763
Ep 3 (Step 001650): Train loss 4.558, Val loss 4.777
Ep 3 (Step 001660): Train loss 4.560, Val loss 4.761
Ep 3 (Step 001670): Train loss 4.557, Val loss 4.762
Ep 3 (Step 001680): Train loss 4.645, Val loss 4.753
Ep 3 (Step 001690): Train loss 4.505, Val loss 4.741
Ep 3 (Step 001700): Train loss 4.547, Val loss 4.754
Ep 3 (Step 001710): Train loss 4.574, Val loss 4.757
Ep 3 (Step 001720): Train loss 4.502, Val loss 4.750
Ep 3 (Step 001730): Train loss 4.505, Val loss

In [13]:
# train model on all works

train(train_loader, val_loader, num_epochs=6,
      eval_iter=10, model_prefix="model_768_12_12_old_tok");

Ep 1 (Step 000000): Train loss 10.187, Val loss 10.130
Ep 1 (Step 000010): Train loss 8.149, Val loss 8.061
Ep 1 (Step 000020): Train loss 6.993, Val loss 6.950
Ep 1 (Step 000030): Train loss 6.759, Val loss 6.643
Ep 1 (Step 000040): Train loss 6.492, Val loss 6.514
Ep 1 (Step 000050): Train loss 6.360, Val loss 6.387
Ep 1 (Step 000060): Train loss 6.197, Val loss 6.259
Ep 1 (Step 000070): Train loss 6.069, Val loss 6.151
Ep 1 (Step 000080): Train loss 6.023, Val loss 6.030
Ep 1 (Step 000090): Train loss 5.912, Val loss 5.981
Ep 1 (Step 000100): Train loss 5.880, Val loss 5.877
Ep 1 (Step 000110): Train loss 5.740, Val loss 5.806
Ep 1 (Step 000120): Train loss 5.697, Val loss 5.740
Ep 1 (Step 000130): Train loss 5.682, Val loss 5.696
Ep 1 (Step 000140): Train loss 5.636, Val loss 5.658
Ep 1 (Step 000150): Train loss 5.696, Val loss 5.644
Ep 1 (Step 000160): Train loss 5.498, Val loss 5.630
Ep 1 (Step 000170): Train loss 5.555, Val loss 5.595
Ep 1 (Step 000180): Train loss 5.470, Val lo

KeyboardInterrupt: 

### Load trained model

In [12]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0002, weight_decay=0.05)

checkpoint = torch.load("model_768_12_12_old_tok.pth", weights_only=True, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

In [13]:
from torch.utils.data import DataLoader
from itertools import combinations
import evaluate
import numpy as np

In [14]:
def compute_perplexity(model, dataloader, device='cpu'):
    model.eval()
    total_loss = 0
    total_tokens = 0

    criterion = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in dataloader:
            input_ids, target_ids = batch
            input_ids, target_ids = input_ids.to(device), target_ids.to(device)

            logits = model(input_ids)  # Forward pass
            loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))

            total_loss += loss.item() * target_ids.numel()
            total_tokens += target_ids.numel()

    perplexity = np.exp(total_loss / total_tokens)
    return perplexity

In [15]:
compute_perplexity(model, val_loader)

np.float64(90.55419439493252)

In [16]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


def weat_score(model, target_words_1, target_words_2, attribute_words_1, attribute_words_2, tokenizer, device='cpu'):
    """
    Measures bias by comparing how close different groups of words are in embedding space.
    """

    def get_embedding(word):
        token_id = tokenizer.encode(word, allowed_special={'<|endoftext|>'})[0]
        with torch.no_grad():
            embed = model.tok_emb(torch.tensor([token_id], device=device)).cpu().numpy()
        return embed.flatten()

    # Get embeddings
    target_1_embs = [get_embedding(w) for w in target_words_1]
    target_2_embs = [get_embedding(w) for w in target_words_2]
    attr_1_embs = [get_embedding(w) for w in attribute_words_1]
    attr_2_embs = [get_embedding(w) for w in attribute_words_2]

    def association(t, A, B):
        return np.mean([cosine_similarity(t, a) for a in A]) - np.mean([cosine_similarity(t, b) for b in B])

    # Compute WEAT score
    s1 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_1_embs])
    s2 = np.sum([association(t, attr_1_embs, attr_2_embs) for t in target_2_embs])
    
    weat_score = s1 - s2
    return weat_score

In [17]:
target_male = ["gentleman", "officer", "clergyman", "husband", "captain"]
target_female = ["lady", "governess", "girl", "wife", "widow"]

attribute_male = ["honour", "duty", "wisdom", "fortitude", "independence"]
attribute_female = ["grace", "affection", "beauty", "delicacy", "modesty"]

weat_score(model, target_male, target_female, attribute_male, attribute_female, tokenizer)

np.float32(-0.027121741)

In [20]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [20]:
import torch
import evaluate
import re

bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_bleu_rouge_from_val(model, device="cpu"):
    references = []
    predictions = []

    # Step 1: Load the validation set
    with open('val_text_data_all_txt.txt', 'r', encoding='utf-8') as f:
        data = f.read()

    # Step 2: Split into sentences & filter
    sentences = re.split(r'(?<=[.!?])\s+', data)
    filtered_sentences = [s.strip() for s in sentences if 5 <= len(s.split()) <= 60]
    filtered_sentences = filtered_sentences[:1000]

    # Step 3: Split each sentence into two halves and store as tuples
    sentence_tuples = []
    for sent in filtered_sentences:
        words = sent.split()
        mid = len(words) // 2
        first_half = ' '.join(words[:mid])
        second_half = ' '.join(words[mid:])
        sentence_tuples.append((first_half, second_half))

    # Step 4: For each (first_half, second_half), generate prediction
    for first_half, second_half in sentence_tuples:
        generated_text = generate(
            model=model, prompt=first_half,
            max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
            device=device,
            temperature=0.7,
            top_k=50
        )

        # Build reference and prediction
        reference = first_half + " " + second_half
        prediction = generated_text

        references.append(reference)
        predictions.append(prediction)

    # Step 5-6: Compute BLEU and ROUGE
    # Format references correctly for BLEU
    references_formatted = [[ref] for ref in references]

    bleu_score = bleu_metric.compute(predictions=predictions, references=references_formatted)['bleu']
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)

    print(f"BLEU Score: {bleu_score:.4f}, ROUGE-L Score: {rouge_score['rougeL']:.4f}")

In [21]:
compute_bleu_rouge_from_val(model)

BLEU Score: 0.3110, ROUGE-L Score: 0.4049


In [10]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="Miss Bennet has inherited the estate from her aunt, so she must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model,
    prompt="Mr. Darcy has inherited the estate from his aunt, so he must",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=50,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

Miss Bennet has inherited the estate from her aunt, so she must not be at this, that she would not have thought of her own, she is not to her to have her own account. Miss Crawford would have been ashamed of her. She would have been the right to her. She has been so long ago
Mr. Darcy has inherited the estate from his aunt, so he must have been lessened by his coming to his house, and, and by his father's being in the country, and would be so much of her own.
<|endoftext|>
<|endoftext|>
It was the very much the evening. He did not have been


In [11]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="A wife is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, 
    prompt="A husband is",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.5,
    top_k=40,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

A wife is a very pretty girl, and I think I should not be a very good thing. I would be so very well, and I am sure I am
A husband is very good-morrow. I have a most intimate friends in the world; but I am sure I am sure I should not quite a very soon."


In [25]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, 
    prompt="I shall now go",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)
    
print(50*"=")
    
text = generate(
    model=model, 
    prompt="He said",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30,
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

I shall now go to be at Mansfield, and I know, I may say that if there is no better time to be. I am not an opportunity of the
He said, "I cannot be the very good fortune; and I am very agreeable man, that Mr. and Mrs. Churchill is very fond of Mr.


In [16]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, 
    prompt="She was",
    max_new_tokens=200, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.7,
    top_k=30
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

She was sure, she was a great deal of a very kind of her situation. She was very kind to know how far he had been the idea of the most affection in the world; and when she was sorry to see him, that he should be in her, that he was very much better than she had been used to be the first meeting. He was very long enough for the day, and there being a little of a letter.
"I can think I have not a very great deal of it," said she, as "I am not at the very well, and I have not so much more to think of it. I am sure you have been so happy to have had been there by the matter."
"But I am sure of you may know--but it does not be no doubt of you could not have been so many years ago."
"This is a very agreeable man, indeed! I never was not so sure to be sure. I have been to be very


In [19]:
if device == "mps":
    clean()

In [66]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="a duty to",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.4,
    top_k=50
)

text

'a duty to be the very day, and I am sure I am sure I am sure I should have been a very much obliged to be very happy. I am'

In [67]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="a duty to",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=0.4,
    top_k=50
)

text

'a duty to go and Mrs. Weston.\n"I am very glad to think of your own family."\n"I will not like you. I am afraid'

In [92]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model,
    prompt="she is wild to get married",
    max_new_tokens=30, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=1,
    top_k=50
)

text

'she is wild to get married from me to do the best of them. I think it is quite forgot to write to be a great deal. You must own. There is nothing'