In [1]:
import torch
import tiktoken
import os

from gpt_model import GPTModel
from data_loader_v1 import create_dataloader_v1
from generate_text import generate

### Detect if GPU is available

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using {device} device.")

Using cpu device.


### Set up model configuration 

In [3]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 256,  # Context length
    "emb_dim": 768, #768,         # Embedding dimension
    "n_heads": 12, #12,          # Number of attention heads
    "n_layers": 12, #12,         # Number of layers
    "drop_rate": 0.2,       # Dropout rate
    "qkv_bias": False,      # Query-Key-Value bias
    "device": device,
}

### Load training and validation data files

In [8]:
train_file_path = 'train_text_data.txt'
val_file_path = 'val_text_data.txt'

with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = file.read()
with open(val_file_path, "r", encoding="utf-8") as file:
    val_data = file.read()

### Initialize data loaders for training
Data loaders implementation can be found in `./data_loader_v1.py`.

This implementation follows the omplementation detailed in _Raschka, Sebastian. Build a Large Language Model (From Scratch). Manning Publications, 2024_

In [12]:
train_ratio = 0.90

train_loader = create_dataloader_v1(
    train_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=4,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

### Initialize the tokenizer

In [14]:
import tiktoken

#tokenizer = tiktoken.get_encoding("gpt2")
tokenizer = tiktoken.get_encoding("cl100k_base")


total_characters = len(train_data + val_data)
total_tokens = len(tokenizer.encode(train_data + val_data, allowed_special={'<|endoftext|>'}))

print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 404065
Tokens: 126804


Check which tokens are actuall used

In [27]:

# Read and tokenize Rilke text
with open("train_text_data.txt", "r", encoding="utf-8") as file:
    text = file.read()

tokens = tokenizer.encode(text)

# Convert tokens back to unique words
unique_tokens = set(tokens)
decoded_tokens = {token: tokenizer.decode([token]) for token in unique_tokens}

# Print sample
for token, word in list(decoded_tokens.items())[:20]:  # Show first 20 tokens
    print(f"Token ID: {token} -> '{word}'")


Token ID: 0 -> '!'
Token ID: 1 -> '"'
Token ID: 6 -> '''
Token ID: 7 -> '('
Token ID: 8 -> ')'
Token ID: 40967 -> 'enh'
Token ID: 11 -> ','
Token ID: 12 -> '-'
Token ID: 13 -> '.'
Token ID: 16395 -> 'ANK'
Token ID: 8207 -> 'dt'
Token ID: 15 -> '0'
Token ID: 81934 -> ' Erl'
Token ID: 18 -> '3'
Token ID: 8211 -> ' Sil'
Token ID: 8212 -> 'rf'
Token ID: 21 -> '6'
Token ID: 16 -> '1'
Token ID: 23 -> '8'
Token ID: 24 -> '9'


Print dictionary created by tokenizer

In [15]:
# Get the token dictionary (token -> token_id mapping)
token_dict = tokenizer._mergeable_ranks

# Print the first few tokens and their IDs
for token, token_id in list(token_dict.items())[:20]:  # Adjust number to see more
    print(f"Token: {repr(token)} -> ID: {token_id}")

Token: b'!' -> ID: 0
Token: b'"' -> ID: 1
Token: b'#' -> ID: 2
Token: b'$' -> ID: 3
Token: b'%' -> ID: 4
Token: b'&' -> ID: 5
Token: b"'" -> ID: 6
Token: b'(' -> ID: 7
Token: b')' -> ID: 8
Token: b'*' -> ID: 9
Token: b'+' -> ID: 10
Token: b',' -> ID: 11
Token: b'-' -> ID: 12
Token: b'.' -> ID: 13
Token: b'/' -> ID: 14
Token: b'0' -> ID: 15
Token: b'1' -> ID: 16
Token: b'2' -> ID: 17
Token: b'3' -> ID: 18
Token: b'4' -> ID: 19


In [23]:
#Decode a Specific Token ID
print(tokenizer.decode([25114]))  

 deadly


In [None]:
#View the Full Vocabulary (Sorted by ID)
sorted_vocab = sorted(token_dict.items(), key=lambda x: x[1])
for token, token_id in sorted_vocab[:500]:  # Adjust number to see more
    print(f"ID: {token_id} -> Token: {repr(token)}")

<h2>Different tokenizer without prefixed vocabulary</h2>

In [3]:
import sentencepiece as spm

# Train a new tokenizer based on Rilke texts
spm.SentencePieceTrainer.train(input="train_text_data.txt", model_prefix="rilke_tokenizer", vocab_size=5000,num_threads=2)

# Load trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load("rilke_tokenizer.model")

# Tokenize text
text = "Im Park ist eine Rose."

tokens = sp.encode(text, out_type=str)
print(tokens)


['▁Im', '▁Park', '▁ist', '▁eine', '▁Rose', '.']


sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: train_text_data.txt
  input_format: 
  model_prefix: rilke_tokenizer
  model_type: UNIGRAM
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 2
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces:

<h3>Sanity Check</h3>

In [18]:
# Sanity check

if total_tokens * (train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the training loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "increase the `training_ratio`")

if total_tokens * (1-train_ratio) < GPT_CONFIG_124M["context_length"]:
    print("Not enough tokens for the validation loader. "
          "Try to lower the `GPT_CONFIG_124M['context_length']` or "
          "decrease the `training_ratio`")

In [121]:
import gc

def clean(): 
    """
    This is a function for GPU data claening before and after training
    """
    
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    
    gc.collect()  # Force garbage collection
    torch.mps.empty_cache()  # Attempt to release MPS memory
    
    # Move tensors to CPU
    for tensor in list(globals().values()):
        if isinstance(tensor, torch.Tensor) and tensor.device == torch.device("mps"):
            tensor.to("cpu")

    # Delete all tensors
    del tensor
    torch.mps.empty_cache()
    gc.collect()  # Force garbage collection
    print("MPS Available:", torch.backends.mps.is_available())
    print("Allocated Memory:", torch.mps.current_allocated_memory() / (1024**2), "MB")

# Training

In [122]:
from pre_train import train_model_simple
import time

train_losses, val_losses, track_tokens_seen = [], [], []

def train(train_loader, val_loader,
          num_epochs=10, eval_iter=5, 
          sample_text="Every effort moves you",
          checkpoint_path="model_and_optimizer.pth"):

    global train_losses, val_losses, track_tokens_seen  # Ensure these are updated globally

    if device == "mps":
        clean()
        print(50 * "=")
        print("Starting training...")

    start_time = time.time()

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.05)

    # Pass train_losses and val_losses as references
    train_model_simple(
        model, train_loader, val_loader, optimizer,
        num_epochs=num_epochs, eval_iter=eval_iter,
        start_context=sample_text, cfg=GPT_CONFIG_124M,
        checkpoint_path=checkpoint_path,
        train_losses=train_losses, val_losses=val_losses,
        track_tokens_seen=track_tokens_seen
    )
    
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    if device == "mps":
        print(50 * "=")
        clean()
    
    return model

### Train the model on training data

In [123]:

train(train_loader, val_loader, num_epochs=7,
      eval_iter=25, sample_text="Im Park ist",
      checkpoint_path="model_and_optimizer_5.pth");

Ep 1 (Step 000000): Train loss 2.901, Val loss 4.879
Ep 1 (Step 000025): Train loss 2.628, Val loss 4.636
Ep 1 (Step 000050): Train loss 2.604, Val loss 4.544
Ep 1 (Step 000075): Train loss 2.428, Val loss 4.542
Im Park ist und was uns noch ein Läche, ein Ding wenn sie ihn die stummergleicht das wirklichkeiten und aus dem verwie fünd, j
Ep 2 (Step 000100): Train loss 2.457, Val loss 4.520
Ep 2 (Step 000125): Train loss 2.320, Val loss 4.486
Ep 2 (Step 000150): Train loss 2.317, Val loss 4.506
Im Park ist und wurden ich um bist heim, an die Alter? Aberall, du deinen Hächauen Stunden, nur dich der sein; jetzt,— als
Ep 3 (Step 000175): Train loss 2.261, Val loss 4.567
Ep 3 (Step 000200): Train loss 2.108, Val loss 4.512
Ep 3 (Step 000225): Train loss 2.196, Val loss 4.554
Im Park ist daß ich, ist –ern, welcheln? Dinge [chel mein Händen
Ep 4 (Step 000250): Train loss 2.059, Val loss 4.557
Ep 4 (Step 000275): Train loss 1.747, Val loss 4.522
Ep 4 (Step 000300): Train loss 1.944, Val loss 4.

### Load trained model

In [124]:
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

checkpoint = torch.load("model_and_optimizer_6.pth", weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.eval();

FileNotFoundError: [Errno 2] No such file or directory: 'model_and_optimizer_6.pth'

In [None]:
from generate_text import generate

torch.set_printoptions(profile="full")
text = generate(
    model=model, prompt="Er ruft",
    max_new_tokens=50, context_size=GPT_CONFIG_124M['context_length'],
    device="cpu",
    temperature=1,
    top_k=40,
    eos_id=13
)

splitted = text.split("\n")
for txt in splitted:
    print(txt)

Er ruftender bist der Schritte den Sohn
die würde,
kür eines verwie leise seine Sehlt


In [None]:
if device == "mps":
    clean()