In [1]:
# 🔄 Auto-reload modules when their source code changes
%load_ext autoreload
%autoreload 2

# 📦 Set up src/ and utils/ paths (project root is detected via pyproject.toml)
%run ../../bootstrap.py

In [2]:
import sys
import os
from os import path

from llmscratch.config.config import Config
from llmscratch.components.io.text_loader import TextLoader
from llmscratch.services.text_preprocessing_service import TextPreprocessingService
from llmscratch.components.trainers.bpe_trainer import BPETrainer

In [3]:
config = Config()

config.print_config_info()

📂 Configured Dataset Directories
--------------------------------------------------
Datasets/raw dir:         /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw
Datasets/processed dir:   /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/processed
Artifacts/raw dir:        /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/artifacts/data/raw
Artifacts/processed dir:  /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/artifacts/data/processed
Vocab dir:                /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/artifacts/data/vocabulary


In [4]:
file_path = path.join(config.DATASETS_RAW_DIR, "the-verdict.txt")  # Replace with your file path

text_loader = TextLoader()

for line in text_loader.load_text(file_path, mode="line"):
    print(line)

[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/components/io/text_loader.py:28] - 📂 Loading file: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw/the-verdict.txt (mode: line)
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourn

In [5]:
#!python -m spacy download en_core_web_sm

In [6]:
service = TextPreprocessingService(config.DATASETS_RAW_DIR)

[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:26] - ⚙️ Initializing spaCy language model...
[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:28] - ✅ spaCy model loaded.


In [7]:
for line in text_loader.load_text(file_path, mode="chunk"):
    words = service.split_line_into_words(line)
    print(words)

[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/components/io/text_loader.py:28] - 📂 Loading file: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw/the-verdict.txt (mode: chunk)
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', 'though', 'a', 'good', 'fellow', 'enough', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', 'in', 'the', 'height', 'of', 'his', 'glory', 'he', 'had', 'dropped', 'his', 'painting', 'married', 'a', 'rich', 'widow', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', 'The', 'height', 'of', 'his', 'was', 'what', 'the', 'women', 'called', 'it', 'I', 'can', 'hear', 'Gideon', 'Thwing', 'his', 'last', 'Chicago', 'sitter', 'deploring', 'his', 'unaccountable', 'abdication', 'Of', 'course', 'it', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture



In [8]:
vocab = service.build_word_freq(mode="line", lowercase=False)
vocab_chunk = service.build_word_freq(mode="chunk", lowercase=False)

[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:61] - 📁 Scanning directory: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw
[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:65] - 📄 Processing file: the-verdict.txt
[ 2025-07-19 09:48:13 ] INFO [../../../src/llmscratch/components/io/text_loader.py:28] - 📂 Loading file: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw/the-verdict.txt (mode: line)
[ 2025-07-19 09:48:14 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:70] - ✅ Word frequency dictionary built with 1109 unique words.
[ 2025-07-19 09:48:14 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:61] - 📁 Scanning directory: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw
[ 2025-07-19 09:48:14 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:65]

In [9]:
vocab

{'I': 120,
 'HAD': 1,
 'always': 6,
 'thought': 8,
 'Jack': 15,
 'Gisburn': 19,
 'rather': 6,
 'a': 82,
 'cheap': 1,
 'genius': 1,
 'though': 2,
 'good': 5,
 'fellow': 2,
 'enough': 5,
 'so': 16,
 'it': 66,
 'was': 72,
 'no': 8,
 'great': 2,
 'surprise': 3,
 'to': 97,
 'me': 37,
 'hear': 4,
 'that': 55,
 'in': 40,
 'the': 168,
 'height': 2,
 'of': 98,
 'his': 61,
 'glory': 2,
 'he': 64,
 'had': 54,
 'dropped': 2,
 'painting': 9,
 'married': 3,
 'rich': 2,
 'widow': 1,
 'and': 74,
 'established': 1,
 'himself': 9,
 'villa': 1,
 'on': 27,
 'Riviera': 3,
 'Though': 1,
 'would': 12,
 'have': 27,
 'been': 18,
 'Rome': 1,
 'or': 9,
 'Florence': 1,
 'The': 16,
 'what': 9,
 'women': 4,
 'called': 3,
 'can': 5,
 'Gideon': 1,
 'Thwing': 3,
 'last': 6,
 'Chicago': 1,
 'sitter': 1,
 'deploring': 1,
 'unaccountable': 1,
 'abdication': 2,
 'Of': 5,
 'course': 6,
 'going': 1,
 'send': 1,
 'value': 1,
 'my': 40,
 'picture': 11,
 'way': 5,
 'up': 20,
 'but': 12,
 'do': 7,
 'think': 4,
 'Rickham': 7,
 '

In [10]:
def compare_word_freqs(freq_line: dict, freq_chunk: dict) -> None:
    keys_line = set(freq_line.keys())
    keys_chunk = set(freq_chunk.keys())

    only_in_line = keys_line - keys_chunk
    only_in_chunk = keys_chunk - keys_line
    common = keys_line & keys_chunk

    diff_counts = {
        word: (freq_line[word], freq_chunk[word])
        for word in common
        if freq_line[word] != freq_chunk[word]
    }

    print("🧾 Comparison of line vs chunk mode:")
    print(f"🔹 Words only in line mode: {len(only_in_line)}")
    if only_in_line:
        print(f"  → {sorted(list(only_in_line))[:10]} ...")

    print(f"🔹 Words only in chunk mode: {len(only_in_chunk)}")
    if only_in_chunk:
        print(f"  → {sorted(list(only_in_chunk))[:10]} ...")

    print(f"🔸 Words with different counts: {len(diff_counts)}")
    for word, (count_line, count_chunk) in list(diff_counts.items())[:10]:
        print(f"  {word}: line={count_line}, chunk={count_chunk}")

In [11]:
compare_word_freqs(vocab, vocab_chunk)

🧾 Comparison of line vs chunk mode:
🔹 Words only in line mode: 0
🔹 Words only in chunk mode: 1
  → ['tention'] ...
🔸 Words with different counts: 2
  attention: line=2, chunk=1
  at: line=21, chunk=22


In [12]:
trainer = BPETrainer(num_merges=10000)

[ 2025-07-19 09:48:14 ] INFO [../../../src/llmscratch/components/trainers/bpe_trainer.py:27] - 📦 Initialized BPETrainer with num_merges=10000


In [13]:
init_vocab = trainer._init_vocab(vocab)
sorted_vocab = sorted(init_vocab.items(), key=lambda x: x[1], reverse=True)
sorted_vocab

[(('t', 'h', 'e', '</w>'), 168),
 (('I', '</w>'), 120),
 (('o', 'f', '</w>'), 98),
 (('t', 'o', '</w>'), 97),
 (('a', '</w>'), 82),
 (('a', 'n', 'd', '</w>'), 74),
 (('w', 'a', 's', '</w>'), 72),
 (('i', 't', '</w>'), 66),
 (('h', 'e', '</w>'), 64),
 (('h', 'i', 's', '</w>'), 61),
 (('t', 'h', 'a', 't', '</w>'), 55),
 (('h', 'a', 'd', '</w>'), 54),
 (('i', 'n', '</w>'), 40),
 (('m', 'y', '</w>'), 40),
 (('m', 'e', '</w>'), 37),
 (('h', 'i', 'm', '</w>'), 33),
 (('w', 'i', 't', 'h', '</w>'), 32),
 (('o', 'n', '</w>'), 27),
 (('h', 'a', 'v', 'e', '</w>'), 27),
 (('y', 'o', 'u', '</w>'), 25),
 (('a', 's', '</w>'), 24),
 (('o', 'n', 'e', '</w>'), 23),
 (('a', 't', '</w>'), 21),
 (('u', 'p', '</w>'), 20),
 (('h', 'e', 'r', '</w>'), 20),
 (('G', 'i', 's', 'b', 'u', 'r', 'n', '</w>'), 19),
 (('s', 'h', 'e', '</w>'), 19),
 (('b', 'e', 'e', 'n', '</w>'), 18),
 (('n', 'o', 't', '</w>'), 18),
 (('S', 't', 'r', 'o', 'u', 'd', '</w>'), 17),
 (('s', 'o', '</w>'), 16),
 (('T', 'h', 'e', '</w>'), 16),

In [14]:
from llmscratch.utils.bpe_format_utils import convert_to_sennrich_format


sennrich_format = convert_to_sennrich_format(init_vocab)
sennrich_format

{('I</w>',): 120,
 ('H', 'A', 'D</w>'): 1,
 ('a', 'l', 'w', 'a', 'y', 's</w>'): 6,
 ('t', 'h', 'o', 'u', 'g', 'h', 't</w>'): 8,
 ('J', 'a', 'c', 'k</w>'): 15,
 ('G', 'i', 's', 'b', 'u', 'r', 'n</w>'): 19,
 ('r', 'a', 't', 'h', 'e', 'r</w>'): 6,
 ('a</w>',): 82,
 ('c', 'h', 'e', 'a', 'p</w>'): 1,
 ('g', 'e', 'n', 'i', 'u', 's</w>'): 1,
 ('t', 'h', 'o', 'u', 'g', 'h</w>'): 2,
 ('g', 'o', 'o', 'd</w>'): 5,
 ('f', 'e', 'l', 'l', 'o', 'w</w>'): 2,
 ('e', 'n', 'o', 'u', 'g', 'h</w>'): 5,
 ('s', 'o</w>'): 16,
 ('i', 't</w>'): 66,
 ('w', 'a', 's</w>'): 72,
 ('n', 'o</w>'): 8,
 ('g', 'r', 'e', 'a', 't</w>'): 2,
 ('s', 'u', 'r', 'p', 'r', 'i', 's', 'e</w>'): 3,
 ('t', 'o</w>'): 97,
 ('m', 'e</w>'): 37,
 ('h', 'e', 'a', 'r</w>'): 4,
 ('t', 'h', 'a', 't</w>'): 55,
 ('i', 'n</w>'): 40,
 ('t', 'h', 'e</w>'): 168,
 ('h', 'e', 'i', 'g', 'h', 't</w>'): 2,
 ('o', 'f</w>'): 98,
 ('h', 'i', 's</w>'): 61,
 ('g', 'l', 'o', 'r', 'y</w>'): 2,
 ('h', 'e</w>'): 64,
 ('h', 'a', 'd</w>'): 54,
 ('d', 'r', 'o', 'p'

In [15]:
symbol_pairs = trainer._count_symbol_pairs(init_vocab)
sorted_symbol_pairs = sorted(symbol_pairs.items(), key=lambda x: x[1], reverse=True)
sorted_symbol_pairs[:10]

[(('e', '</w>'), 727),
 (('d', '</w>'), 464),
 (('t', '</w>'), 437),
 (('h', 'e'), 408),
 (('t', 'h'), 365),
 (('s', '</w>'), 352),
 (('i', 'n'), 289),
 (('n', '</w>'), 266),
 (('y', '</w>'), 234),
 (('h', 'a'), 197)]

In [16]:
from collections import defaultdict
from typing import Dict, List, Tuple


merges: List[Tuple[str, str]] = []

In [17]:
best_pair = max(symbol_pairs, key=symbol_pairs.get)
best_pair

('e', '</w>')

In [18]:
# add the best pair to the merges list
merges.append(best_pair)

In [19]:
merged_vocab = trainer._merge_vocab(best_pair, init_vocab)
sorted_merged_vocab = sorted(merged_vocab.items(), key=lambda x: x[1], reverse=True)
sorted_merged_vocab[:10]

[(('t', 'h', 'e</w>'), 168),
 (('I', '</w>'), 120),
 (('o', 'f', '</w>'), 98),
 (('t', 'o', '</w>'), 97),
 (('a', '</w>'), 82),
 (('a', 'n', 'd', '</w>'), 74),
 (('w', 'a', 's', '</w>'), 72),
 (('i', 't', '</w>'), 66),
 (('h', 'e</w>'), 64),
 (('h', 'i', 's', '</w>'), 61)]

In [20]:
num_merges = 10000
for i in range(num_merges):
    symbol_pairs = trainer._count_symbol_pairs(vocab)

    if not symbol_pairs:
        print(f"🛑 No more symbol pairs left after {i} merges.")
        break

    best_pair = max(symbol_pairs, key=symbol_pairs.get)
    vocab = trainer._merge_vocab(best_pair, vocab)
    merges.append(best_pair)

🛑 No more symbol pairs left after 2159 merges.


In [21]:
merges

[('e', '</w>'),
 ('h', 'e'),
 ('i', 'n'),
 ('t', 'he'),
 ('h', 'a'),
 ('o', 'u'),
 ('r', 'e'),
 ('a', 'n'),
 ('o', 'n'),
 ('i', 't'),
 ('i', 's'),
 ('e', 'd'),
 ('a', 's'),
 ('t', 'o'),
 ('in', 'g'),
 ('e', 'n'),
 ('o', 'f'),
 ('e', 'r'),
 ('s', 't'),
 ('a', 't'),
 ('an', 'd'),
 ('s', 'e'),
 ('o', 'r'),
 ('l', 'e'),
 ('b', 'e'),
 ('w', 'as'),
 ('a', 'r'),
 ('m', 'e'),
 ('ha', 't'),
 ('h', 'is'),
 ('o', 'w'),
 ('a', 'l'),
 ('a', 'c'),
 ('v', 'e'),
 ('i', 'c'),
 ('i', 'm'),
 ('l', 'y'),
 ('u', 'r'),
 ('t', 'hat'),
 ('ha', 'd'),
 ('g', 'h'),
 ('t', 'h'),
 ('d', 'e'),
 ('m', 'y'),
 ('e', 's'),
 ('a', 'y'),
 ('a', 'in'),
 ('h', 'im'),
 ('t', 'r'),
 ('l', 'd'),
 ('r', 'o'),
 ('i', 'd'),
 ('l', 'l'),
 ('a', 'b'),
 ('u', 't'),
 ('w', 'it'),
 ('on', 'e'),
 ('l', 'i'),
 ('i', 'r'),
 ('n', 'd'),
 ('wit', 'h'),
 ('n', 'o'),
 ('i', 'on'),
 ('en', 't'),
 ('t', 'e'),
 ('s', 'u'),
 ('y', 'ou'),
 ('f', 'e'),
 ('s', 'o'),
 ('ou', 'ld'),
 ('gh', 't'),
 ('r', 'a'),
 ('n', 'e'),
 ('ur', 'n'),
 ('ha', 've')

In [22]:
learned_merges = [f"{pair[0]} {pair[1]}" for pair in merges]
learned_merges[:10]

['e </w>', 'h e', 'i n', 't he', 'h a', 'o u', 'r e', 'a n', 'o n', 'i t']

In [23]:
# Step 1: Build raw word frequency dictionary from text files
vocab = service.build_word_freq(mode="line", lowercase=False)

# Step 2: Initialize trainer
trainer = BPETrainer(num_merges=10000)

# Step 3: Train BPE merge operations
trainer.fit(vocab)

# Step 4: Log useful stats
trainer.log_statistics()

[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:61] - 📁 Scanning directory: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw
[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:65] - 📄 Processing file: the-verdict.txt
[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/components/io/text_loader.py:28] - 📂 Loading file: /Users/kenneth/Public/projects/python/ai/llms/llm-zero-to-trained/datasets/raw/the-verdict.txt (mode: line)
[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/services/text_preprocessing_service.py:70] - ✅ Word frequency dictionary built with 1109 unique words.
[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/components/trainers/bpe_trainer.py:27] - 📦 Initialized BPETrainer with num_merges=10000
[ 2025-07-19 09:48:16 ] INFO [../../../src/llmscratch/components/trainers/bpe_trainer.py:39] - 🚀 Starting BPE training...
[ 2025-07-19 09:48:16 ] INFO [..