Text Preparation

Standardizes text format for consistent processing

Handles punctuation separation to improve token boundaries

Expands contractions to canonical forms

Reduces vocabulary size and improves model generalization

In [3]:
import re

def prepare_text(text):

    # Lowercase conversion
    text = text.lower()
    
    # Remove special characters (keep alphanumeric and basic punctuation)
    text = re.sub(r'[^a-zA-Z0-9\s.,;:!?\'"-]', '', text)
    
    # Add space around punctuation for better tokenization
    text = re.sub(r'([.,;:!?])', r' \1 ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Expand common contractions
    contractions = {
        "n't": " not",
        "'s": " is",
        "'re": " are",
        "'ve": " have",
        "'ll": " will",
        "'d": " would"
    }
    for cont, exp in contractions.items():
        text = text.replace(cont, exp)
    
    return text

# Example usage
sample_text = "Hello! This is a test. Don't you think it's cool?"
print("Original:", sample_text)
print("Prepared:", prepare_text(sample_text))


Original: Hello! This is a test. Don't you think it's cool?
Prepared: hello ! this is a test . do not you think it is cool ?


Tokenization & Byte Pair Encoding (BPE)

BPE Algorithm: Iteratively merges most frequent character pairs

Vocabulary Building: Starts with byte-level tokens, grows to specified size

Handling Unknowns: Uses <unk> token for unseen subwords

Efficiency: Processes text in linear time relative to vocabulary size

In [5]:
from collections import Counter

class BPETokenizer:
    def __init__(self, corpus=None, vocab_size=1000):
        self.vocab = {}
        self.merges = {}
        if corpus:
            self.train(corpus, vocab_size)
    
    def train(self, corpus, vocab_size):
        
        # Initialize vocabulary with bytes
        self.vocab = {chr(i): i for i in range(256)}
        text = prepare_text(corpus)
        words = text.split()
        word_freqs = Counter(words)
        
        # BPE training algorithm
        while len(self.vocab) < vocab_size:
            # Count all adjacent pairs
            pairs = Counter()
            for word, freq in word_freqs.items():
                symbols = list(word)
                for i in range(len(symbols)-1):
                    pairs[(symbols[i], symbols[i+1])] += freq
            
            if not pairs:
                break
                
            # Find most frequent pair
            best_pair = max(pairs, key=pairs.get)
            new_token = ''.join(best_pair)
            
            # Merge the pair
            self.merges[best_pair] = new_token
            self.vocab[new_token] = len(self.vocab)
            
            # Update word frequencies with merged token
            new_word_freqs = {}
            for word, freq in word_freqs.items():
                new_word = word
                for pair, merge in self.merges.items():
                    new_word = new_word.replace(''.join(pair), merge)
                new_word_freqs[new_word] = freq
            word_freqs = new_word_freqs
    
    def tokenize(self, text):
        tokens = []
        text = prepare_text(text)
        words = text.split()
        
        for word in words:
            # Start with individual characters
            current = list(word)
            changed = True
            
            # Apply merges until no more changes
            while changed and len(current) > 1:
                changed = False
                for i in range(len(current)-1):
                    pair = (current[i], current[i+1])
                    if pair in self.merges:
                        current = current[:i] + [self.merges[pair]] + current[i+2:]
                        changed = True
                        break
            
            tokens.extend(current)
        
        # Convert tokens to IDs
        return [self.vocab.get(token, self.vocab.get('<unk>', 0)) for token in tokens]

# Example usage
corpus = "The quick brown fox jumps over the lazy dog repeatedly."
tokenizer = BPETokenizer(corpus, vocab_size=200)
print("Vocabulary:", list(tokenizer.vocab.keys())[:10])
print("Token IDs:", tokenizer.tokenize("brown fox jumps"))


Vocabulary: ['\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t']
Token IDs: [98, 114, 111, 119, 110, 102, 111, 120, 106, 117, 109, 112, 115]


Sliding Window Sampling

Sliding Window: Creates overlapping context windows

Configurable Context: Adjust window_size for model context length

Efficient Sampling: Uses stride to control overlap between samples

PyTorch Integration: Compatible with DataLoader for batching

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, text, tokenizer, window_size=64, stride=32):

        self.tokens = tokenizer.tokenize(text)
        self.window_size = window_size
        self.stride = stride
    
    def __len__(self):
    
        return (len(self.tokens) - self.window_size) // self.stride
    
    def __getitem__(self, idx):
    
        start = idx * self.stride
        end = start + self.window_size
        return torch.tensor(self.tokens[start:end])

# Example usage
dataset = TextDataset(
    text="A quick brown fox jumps over the lazy dog.",
    tokenizer=tokenizer,
    window_size=5,
    stride=2
)

print(f"Dataset size: {len(dataset)} examples")
print("First sample:", dataset[0])
print("Second sample:", dataset[1])


Dataset size: 14 examples
First sample: tensor([ 97, 113, 117, 105,  99])
Second sample: tensor([117, 105,  99, 107,  98])


 Token Vectorization

Embedding Layer: Trainable lookup table for token representations

Dimensionality: embedding_dim controls vector size (typical values: 128-1024)

Gradient Learning: Embeddings improve during model training

Output: 3D tensor (batch_size, sequence_length, embedding_dim)

In [7]:
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )
    
    def forward(self, token_ids):
        return self.embedding(token_ids)

# Example usage
vocab_size = len(tokenizer.vocab)
embedding_dim = 128
embedding_layer = EmbeddingLayer(vocab_size, embedding_dim)

sample_tokens = torch.tensor(tokenizer.tokenize("fox jumps"))
vectors = embedding_layer(sample_tokens)

print("Token IDs:", sample_tokens)
print("Vector shape:", vectors.shape)
print("First vector:", vectors[0][:5].tolist(), "...")


Token IDs: tensor([102, 111, 120, 106, 117, 109, 112, 115])
Vector shape: torch.Size([8, 128])
First vector: [1.385013461112976, 0.023550674319267273, -0.28727394342422485, 0.025423569604754448, -1.40799081325531] ...


Full Pipeline Integration

Corpus Preparation: Real-world text with technical terms

Tokenization: BPE handles complex words like "representations"

Batching: DataLoader creates mini-batches for efficient training

Vector Conversion: Tokens become trainable embedding vectors

Output Shapes:

Token IDs: [batch_size, window_size]

Embeddings: [batch_size, window_size, embedding_dim]

In [15]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# 1. Prepare training corpus
corpus = """Large language models require carefully prepared text data. 
Tokenization converts text into smaller units called tokens. 
Byte pair encoding creates efficient subword representations."""

# 2. Train BPE tokenizer
tokenizer = BPETokenizer(corpus, vocab_size=300)

# 3. Create sliding window dataset
dataset = TextDataset(
    text=corpus,
    tokenizer=tokenizer,
    window_size=10,
    stride=5
)

# 4. Create embedding layer
embedding_layer = EmbeddingLayer(
    vocab_size=len(tokenizer.vocab),
    embedding_dim=128
)

# 5. Process through full pipeline
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

for i, batch in enumerate(dataloader):
    print(f"\nBatch {i+1}:")
    print("Token IDs:", batch)
    print("Shape:", batch.shape)
    
    # Convert to embeddings
    embeddings = embedding_layer(batch)
    print("Embedding shape:", embeddings.shape)
    
    if i == 1:  # Show first 2 batches
        break



KeyboardInterrupt: 