# Notebook 04: Chunking, tokenizing and modeling

This notebook is dedicated to preparing and feeding the cleaned book data to a language model for further processing, such as fine-tuning or generation.

## 1. Load Cleaned Data

First, we will load the cleaned text data of the book.

In [1]:
import pandas as pd

data_path = "../data/Guerra_y_paz_cleaned.txt"
with open(data_path, "r", encoding="utf-8") as f:
    book_text = f.read()

print(f"Loaded text with {len(book_text)} characters.")

Loaded text with 3153561 characters.


## 2. Initialize Language Model and Tokenizer

Here, we will load a pre-trained language model and its corresponding tokenizer. This section will be updated based on the specific model chosen (e.g., Hugging Face Transformers).

In [32]:
vocab_size = 32000

In [33]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='..\\data\\Guerra_y_paz_cleaned.txt',  # filename, one sentence per line
    model_prefix='..\\tokenizers\\wp_sp',        # output file name prefix
    vocab_size=vocab_size,                      
    model_type='bpe',                      # model_type: unigram, bpe, char, word
    character_coverage=0.9995,             
    user_defined_symbols=['<pad>', '<bos>', '<eos>']
)

In [4]:
pwd

'c:\\Users\\josep\\Desktop\\Tolstoi\\notebooks'

In [34]:
sp = spm.SentencePieceProcessor(model_file='..\\tokenizers\\wp_sp.model')

print("Vocab size:", sp.get_piece_size())
print("ID de <bos>:", sp.piece_to_id("<bos>"))


Vocab size: 32000
ID de <bos>: 4


In [18]:
sp.piece_to_id(" ")

0

In [42]:
sp.id_to_piece(7)

'▁e'

In [43]:
text = "Era una noche tranquila en Moscú."
ids = [4]+sp.encode(text, out_type=int)+[5]
print(ids)
print(sp.decode(ids))

[4, 1139, 113, 995, 2678, 35, 484, 31943, 5]
<bos> Era una noche tranquila en Moscú.<eos>


In [21]:
import torch
from torch.nn.utils.rnn import pad_sequence

def make_batches(paragraphs, sp, seq_len=256, stride=128, add_bos=True):
    """
    Convert paragraphs into padded tensor batches for training.

    Args:
        paragraphs (list[str]): Each paragraph is one line of text.
        sp (SentencePieceProcessor): Trained tokenizer.
        seq_len (int): Max sequence length (tokens per sample).
        stride (int): Overlap stride between consecutive sequences. (slide window)
        add_bos (bool): Whether to prepend <bos> to each paragraph.

    Returns:
        samples (list[torch.Tensor]): Each element is a LongTensor of token IDs.
    """
    bos_id = sp.piece_to_id("<bos>")
    eos_id = sp.piece_to_id("<eos>")
    pad_id = sp.piece_to_id("<pad>")

    # Tokenize all paragraphs into one flat list of token IDs
    tokens = []
    for p in paragraphs:
        if not p.strip():
            continue
        piece_ids = sp.encode(p, out_type=int)
        if add_bos:
            tokens.extend([bos_id] + piece_ids + [eos_id])
        else:
            tokens.extend(piece_ids + [eos_id])

    # Slice into overlapping fixed-length chunks
    samples = [
        tokens[i:i+seq_len]
        for i in range(0, len(tokens) - seq_len, stride)
    ]

    # Convert to tensors
    tensors = [torch.tensor(s, dtype=torch.long) for s in samples]

    # Build input/output pairs (shifted)
    x_tensors = [t[:-1] for t in tensors]
    y_tensors = [t[1:] for t in tensors]

    # Pad sequences for batching later
    def collate(batch):
        xs, ys = zip(*batch)
        xs_pad = pad_sequence(xs, batch_first=True, padding_value=pad_id)
        ys_pad = pad_sequence(ys, batch_first=True, padding_value=pad_id)
        return xs_pad, ys_pad

    # return everything you need
    return x_tensors, y_tensors, collate


## 3. Prepare Data for Model Input

The cleaned text needs to be tokenized and formatted into a suitable input for the language model. This may involve splitting into chunks, adding special tokens, etc.

In [46]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class LMDataset(Dataset):
    def __init__(self, x_tensors, y_tensors):
        self.xs = x_tensors
        self.ys = y_tensors

    def __len__(self):
        return len(self.xs)

    def __getitem__(self, idx):
        return self.xs[idx], self.ys[idx]


In [25]:
book_paragraphs = book_text.splitlines()

x_tensors, y_tensors, collate_fn = make_batches(book_paragraphs, sp, seq_len=256, stride=128, add_bos=True)

In [47]:
import random
from torch.utils.data import DataLoader

# split params
seed = 42
train_frac = 0.9
batch_size = 32  # adjust for GPU memory


n = len(x_tensors)
indices = list(range(n))
random.Random(seed).shuffle(indices)
split_idx = int(n * train_frac)
train_idx, dev_idx = indices[:split_idx], indices[split_idx:]

# create split lists
x_train_tensors = [x_tensors[i] for i in train_idx]
y_train_tensors = [y_tensors[i] for i in train_idx]
x_dev_tensors   = [x_tensors[i] for i in dev_idx]
y_dev_tensors   = [y_tensors[i] for i in dev_idx]

print(f'Total samples: {n}  -> train: {len(x_train_tensors)}, dev: {len(x_dev_tensors)}')

train_dataset = LMDataset(x_train_tensors, y_train_tensors)
dev_dataset   = LMDataset(x_dev_tensors, y_dev_tensors)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_loader   = DataLoader(dev_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# make dev_lines available for the training loop check later
dev_lines = x_dev_tensors


Total samples: 5260  -> train: 4734, dev: 526


## 4. Feed to Language Model

Finally, the prepared data will be fed to the language model. This could be for inference, fine-tuning, or other tasks.

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GRULanguageModel(nn.Module):
    def __init__(self, n_tokens, emb_size=128, hid_size=256, num_layers=2, pad_id=None, eos_id=None):
        super().__init__()
        self.n_tokens = n_tokens
        self.pad_id = pad_id
        self.eos_id = eos_id

        self.emb = nn.Embedding(n_tokens, emb_size, padding_idx=pad_id)
        self.gru = nn.GRU(emb_size, hid_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hid_size, n_tokens)

    def forward(self, x, hidden=None):
        """
        x: [batch, seq_len] token indices
        hidden: optional initial hidden state
        returns:
            logits: [batch, seq_len, n_tokens]
            hidden: final hidden state
        """
        emb = self.emb(x)                  # [B, L, emb_size]
        output, hidden = self.gru(emb, hidden)  # [B, L, hid_size]
        logits = self.linear(output)       # [B, L, n_tokens]
        return logits, hidden

    def generate(self, prefix_ids, max_len=100, temperature=1.0, device="cpu"):
        """
        Autoregressive generation
        prefix_ids: tensor [1, L] of token IDs
        returns: list of generated token IDs (including prefix)
        """
        self.eval()
        prefix_ids = prefix_ids.to(device)
        generated = prefix_ids.clone()
        hidden = None

        for _ in range(max_len):
            logits, hidden = self.forward(generated[:, -1:], hidden)
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            generated = torch.cat([generated, next_token], dim=1)

            # stop if EOS appears
            if next_token.item() in (self.pad_id, self.eos_id):  # or eos_id if defined separately
                break
        return generated[0].tolist()
    

In [50]:
model = GRULanguageModel(n_tokens=sp.get_piece_size(), pad_id=sp.piece_to_id("<pad>"), eos_id=sp.piece_to_id("<eos>"))
num_epochs = 3
opt = torch.optim.Adam(model.parameters())

train_history = []             # list of (step, loss) tuples appended during training
dev_history = []               # list of (step, dev_loss) tuples appended when scoring dev set
score_dev_every = 500          # evaluate dev set every N training steps (adjust to taste)

bos_id = sp.piece_to_id("<bos>")
eos_id = sp.piece_to_id("<eos>")
pad_id = sp.piece_to_id("<pad>")


In [None]:
import torch
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tqdm import trange

# Assuming you have:
# - dataset & loader from make_batches()
# - model: GRULanguageModel
# - train_history, dev_history: lists
# - score_dev_every: int
# - generate(): method in your model

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0.0

    for i, (x_batch, y_batch) in enumerate(train_loader):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        opt.zero_grad()
        logits, _ = model(x_batch)
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            y_batch.view(-1),
            ignore_index=pad_id
        )
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        train_history.append((len(train_history), float(loss)))
        running_loss += float(loss)

        # --- Visualization and sample generation ---
        if (i + 1) % 50 == 0:
            clear_output(True)
            plt.figure(figsize=(6, 4))
            plt.scatter(*zip(*train_history), alpha=0.1, label='train_loss')
            if len(dev_history):
                plt.plot(*zip(*dev_history), color='red', label='dev_loss')
            plt.legend()
            plt.grid()
            plt.show()

            print("Generated examples (temperature=0.5):")
            for _ in range(3):
                # Start generation with <bos>
                prefix = torch.tensor([[bos_id]], dtype=torch.long, device=device)
                gen_ids = model.generate(prefix, max_len=100, temperature=0.5, device=device)
                print(sp.decode(gen_ids))

        # --- Evaluate on dev set periodically ---
        if (i + 1) % score_dev_every == 0 and len(dev_lines) > 0:
            print("Scoring dev set...")
            dev_loss = 0.0
            model.eval()
            with torch.no_grad():
                for x_dev, y_dev in dev_loader:
                    x_dev, y_dev = x_dev.to(device), y_dev.to(device)
                    logits, _ = model(x_dev)
                    loss_dev = torch.nn.functional.cross_entropy(
                        logits.view(-1, logits.size(-1)),
                        y_dev.view(-1),
                        ignore_index=pad_id
                    )
                    dev_loss += float(loss_dev) * x_dev.size(0)
            dev_loss /= len(dev_loader.dataset)
            dev_history.append((len(train_history), dev_loss))
            print(f'Dev loss: {dev_loss:.3f}')


Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:837.)
  train_history.append((len(train_history), float(loss)))
