# Byte-level Tokenizer

### Overview
Trains a transformer chatbot to mimic Michael Scott's speaking style using byte-level tokenization: each byte of the text (value 0-255) is treated as a separate token.

### Key Components
- Tokenizer: Custom ByteTokenizer (vocab size: 259 = 256 bytes + 3 special tokens)
- Model: Encoder-decoder transformer (d_model=256, 3 layers, 4 heads)
- Training: Uses modular Trainer class with train/val split
- Generation: Multiple strategies (greedy, balanced, creative)

In [1]:
from __future__ import annotations

import torch
from torch.utils.data import Dataset, DataLoader
import math
from tqdm.auto import tqdm

import os

# Move to the repository root before anything else: the relative data paths
# used later ("data/src.txt", "data/tgt.txt") depend on the working directory,
# and presumably so do the project-local imports below (verify for your setup).
# Set the GPT_PROJECT_ROOT environment variable to run this notebook from
# another checkout; the fallback is the original author's location.
os.chdir(os.environ.get("GPT_PROJECT_ROOT", '/Users/idrishouiralami/Documents/projets_code/GPT'))

from gpt_tokenizers.byte_level import ByteTokenizer, VocabInfo
from utils.transformer import build_transformer
from utils.masks import Masks
from utils.data_loader import OfficeSeq2Seq, get_local_data
from training.train import Trainer, TrainConfig
from generation.inference_local_model import MichaelScottBot

from functools import partial

from typing import Tuple, Optional, Dict

import torch.nn as nn
from torch.optim import AdamW

import numpy as np

## Parameters

In [8]:
# Special-token layout shared by the tokenizer, datasets, trainer and masks.
# Ids 0/1/2 are reserved for padding, beginning-of-sequence and end-of-sequence.
# NOTE(review): shift=3 is presumably the offset added to raw byte values so
# they do not collide with the three special ids (matches 256 + 3 = 259 in the
# overview) — confirm in ByteTokenizer.
vocab = VocabInfo(pad_id=0, bos_id=1, eos_id=2, shift=3)

In [10]:
# Locations of the parallel text files handed to get_local_data below.
# Both are relative paths, so they depend on the working directory set by the
# os.chdir call at the top of the notebook.
SRC_PATH = "data/src.txt"
TGT_PATH = "data/tgt.txt"

In [None]:
# Sequence-length limits in tokens, passed to both datasets and to the model builder.
# With a byte-level tokenizer one token is one byte, so these are byte budgets.
MAX_SRC_LEN = 256
# The model is built with tgt_seq_len = MAX_TGT_LEN - 1 (the decoder sees T-1 tokens).
MAX_TGT_LEN = 128

## Tokenization (byte-level)

In [2]:
# Load the raw text: src_lines from SRC_PATH and tgt_lines from TGT_PATH.
# Requires the Parameters cells to have been run first; the NameError recorded
# below is what happens when this cell runs before SRC_PATH is defined.
# NOTE(review): presumably src_lines[i] pairs with tgt_lines[i] — confirm in get_local_data.
src_lines, tgt_lines = get_local_data(SRC_PATH, TGT_PATH)

NameError: name 'SRC_PATH' is not defined

In [11]:
# Encode all lines
tok = ByteTokenizer(vocab=vocab)
# The special ids come from `vocab` (pad_id=0, bos_id=1, eos_id=2, shift=3),
# which is passed explicitly here rather than relying on tokenizer defaults.
enc_src = tok.encode_batch(src_lines)
enc_tgt = tok.encode_batch(tgt_lines)

In [12]:
# Vocabulary size reported by the tokenizer; used below as both the source and
# the target vocabulary size of the model. The overview states this is
# 259 (256 byte values + 3 special tokens).
vocab_size = tok.vocab_size

## Data Loader

In [16]:
# Sequential 90/10 split: the first 90% of encoded pairs are used for training
# and the remainder for validation (no shuffling before the split).
N = len(enc_src)
split = int(0.9 * N)
train_part = slice(None, split)
val_part = slice(split, None)

# Both datasets share the same length limits and padding id.
train_ds = OfficeSeq2Seq(
    enc_src[train_part],
    enc_tgt[train_part],
    MAX_SRC_LEN,
    MAX_TGT_LEN,
    pad_id=vocab.pad_id,
)
val_ds = OfficeSeq2Seq(
    enc_src[val_part],
    enc_tgt[val_part],
    MAX_SRC_LEN,
    MAX_TGT_LEN,
    pad_id=vocab.pad_id,
)

# Settings common to both loaders.
loader_common = {"batch_size": 32, "pin_memory": False}

# Training batches are reshuffled every epoch and the final short batch is
# dropped; validation keeps its order and every example.
train_dl = DataLoader(train_ds, shuffle=True, drop_last=True, **loader_common)
val_dl = DataLoader(val_ds, shuffle=False, drop_last=False, **loader_common)

## Build Model

In [None]:
# Use the GPU when PyTorch can see a CUDA device, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Encoder-decoder transformer; the same byte vocabulary is used on both sides.
# Per the overview, N is the number of layers and h the number of heads.
model = build_transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    src_seq_len=MAX_SRC_LEN,
    tgt_seq_len=MAX_TGT_LEN - 1,  # decoder sees T-1
    d_model=256,
    N=3,
    h=4,
    dropout=0.1,
    d_ff=1024,
)
model = model.to(device)

  nn.init.xavier_uniform(p)


## Train / Eval loops

In [31]:
# Training configuration: a single epoch, mixed precision disabled
# (set amp=True on CUDA if you want).
cfg = TrainConfig(
    pad_id=vocab.pad_id,
    epochs=1,
    amp=False,
)

# `device` may be "cuda" / "cpu" / "mps".
trainer = Trainer(model, device=device, cfg=cfg)

# Run training with validation; the returned value is kept in `history`.
history = trainer.fit(train_dl, val_dl)

train epoch:   0%|          | 0/249 [00:00<?, ?it/s]

val epoch:   0%|          | 0/28 [00:00<?, ?it/s]

epoch 01 | train 1.812 | val 1.671 | ppl 5.32


## Generation

In [None]:
# Mask helper configured with the padding id; handed to MichaelScottBot below.
# NOTE(review): presumably builds the padding / causal attention masks used
# during decoding — confirm in utils.masks.
masks = Masks(pad_id=vocab.pad_id)

In [None]:
# Wrap the trained model, tokenizer and mask helper in the generation front-end.
# NOTE(review): `config` receives the TrainConfig created for training —
# confirm that MichaelScottBot expects that type rather than a separate
# generation config.
bot = MichaelScottBot(
    model=model, tokenizer=tok, masks=masks, device=device, config=cfg
)

In [45]:
# NOTE(review): `greedy_response` is not assigned in any cell shown in this
# notebook — presumably it held the result of a greedy-decoding call on `bot`
# from a cell that was since removed. Recreate that assignment before running
# the notebook top to bottom, otherwise this cell raises NameError.
greedy_response

'[MICHAEL] Oh, well.'

In [46]:
# "Balanced" generation strategy (one of the strategies listed in the overview).
bot.balanced("[JIM] Hi Michael") 

'[MICHAEL] Oh, they.'

In [47]:
# "Creative" generation strategy, same prompt as the balanced example above.
bot.creative("[JIM] Hi Michael")

'[MICHAEL] Goder, this.'

In [48]:
# Custom parameters
bot.generate(
    "[DWIGHT] I am the best salesman.",
    temperature=0.9,
    top_k=80,
    top_p=0.95,
    repetition_penalty=1.4
)

'[MICHAEL] Okay..'