In [1]:
# Setup
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer

# Use GPU when available for faster training/visuals; fall back to CPU otherwise
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using', DEVICE)

# WikiText-2 ("wikitext-2-v1" configuration); dataset verification checks are skipped on load
ds = load_dataset("wikitext", "wikitext-2-v1", verification_mode="no_checks")

# Single source of truth for the pretrained tokenizer checkpoint: the name is
# passed through instead of being repeated as a literal, so changing
# `model_name` actually changes which tokenizer is loaded.
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Using cuda


In [2]:
def scaled_dot_product_attention(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, mask: torch.Tensor|None=None):
    """Compute scaled dot-product attention.

    Implements the equation from "Attention is All You Need":

        Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k) + mask) V

    Args:
        Q: Query tensor of shape (B, h, T_q, d_k).
        K: Key tensor of shape (B, h, T_k, d_k).
        V: Value tensor of shape (B, h, T_k, d_v).
        mask: Optional additive mask broadcastable to (B, h, T_q, T_k) with 0 or -inf.

    Returns:
        out: Attention output, shape (B, h, T_q, d_v).
        attn: Attention weights (softmax probabilities), shape (B, h, T_q, T_k).
    """
    # d_k is the dimensionality of queries/keys per head
    d_k = Q.size(-1)  # read last dimension of Q for scaling

    # Compute raw attention scores by matrix-multiplying Q and K^T
    # Q @ K^T yields shape (B, h, T_q, T_k)
    #TODO
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)# scale by sqrt(d_k)

    # If a mask was provided, add it to the scores. Mask entries are 0 (keep) or -inf (block)
    if mask is not None:
        # Ensure mask is same dtype and on same device as scores to avoid runtime errors
        mask = mask.to(dtype=scores.dtype, device=scores.device)
        scores = scores + mask  # additive masking prior to softmax

    # Convert scores to probabilities along the key dimension with softmax
    # use torch functional library you important above, which is a PyTorch
    # module containing functional (stateless) implementations of layers
    # and operations like softmax, relu, cross_entropy, etc.
    #TODO
    attn = F.softmax(scores, dim=-1)  # softmax over T_k

    # Use attention weights to produce weighted sum over values
    # This line of code will perform a batched matrix multiplication over the last two dimensions
    out = torch.matmul(attn, V) # (B, h, T_q, d_v)

    # Return both the attended outputs and the attention weights for inspection
    return out, attn

# Quick shape test: verify the function returns expected tensor shapes
# Smoke test on random inputs: only the output shapes are checked
B, h, T, d_k, d_v = 2, 3, 4, 8, 8  # batch, heads, time, key-dim, value-dim
# Queries and keys share width d_k; values use d_v (drawn in Q, K, V order)
Q, K, V = (torch.randn(B, h, T, width) for width in (d_k, d_k, d_v))
out, attn = scaled_dot_product_attention(Q, K, V)
assert out.shape == (B, h, T, d_v)   # one d_v-wide vector per query position
assert attn.shape == (B, h, T, T)    # one weight per (query, key) pair
print('Scaled dot-product attention shapes OK:', out.shape, attn.shape)

Scaled dot-product attention shapes OK: torch.Size([2, 3, 4, 8]) torch.Size([2, 3, 4, 4])


In [3]:
def sinusoidal_positional_encoding(T:int, d_model:int, device=DEVICE):
    """Build the fixed sine/cosine position table of Vaswani et al.

    Column ``j`` oscillates with rate ``1 / 10000 ** (2 * (j // 2) / d_model)``,
    so columns ``2k`` and ``2k + 1`` share a frequency; the even column stores
    the sine of the angle and the odd column stores the cosine.

    Args:
        T: Number of positions (rows of the table).
        d_model: Width of the table; must be even.
        device: Torch device on which the table is created.

    Returns:
        Tensor of shape (T, d_model) holding the positional encodings.
    """
    assert d_model % 2 == 0, "d_model must be even for sinusoidal positional encoding"

    positions = torch.arange(T, device=device, dtype=torch.float32).unsqueeze(1)    # (T, 1)
    columns = torch.arange(d_model, device=device, dtype=torch.float32).unsqueeze(0)  # (1, d_model)

    # Paired columns (2k, 2k+1) get the same exponent 2k / d_model
    exponents = (2 * (columns // 2)) / d_model
    angle_rates = 1.0 / torch.pow(10000.0, exponents)

    # Broadcasting (T, 1) * (1, d_model) gives one angle per (position, column)
    angles = positions * angle_rates

    table = torch.zeros((T, d_model), device=device)
    table[:, 1::2] = torch.cos(angles[:, 1::2])  # odd columns  -> cosine
    table[:, 0::2] = torch.sin(angles[:, 0::2])  # even columns -> sine
    return table


def causal_mask(T_q: int, T_k: int, device=DEVICE, dtype: torch.dtype=torch.float32):
    """Build an additive mask that hides future positions from each query.

    Entry (i, j) is 0.0 when key j may be attended by query i (j <= i) and
    -inf otherwise, so the result can be added to attention logits right
    before the softmax.

    Args:
        T_q: Number of query positions.
        T_k: Number of key/value positions.
        device: Torch device on which the mask is created.
        dtype: Floating dtype of the returned mask (default: torch.float32).

    Returns:
        Tensor of shape (1, 1, T_q, T_k); the two leading singleton axes let it
        broadcast over batch and heads.
    """
    # Lower triangle (including the diagonal) marks the allowed query/key pairs
    visible = torch.tril(torch.ones(T_q, T_k, device=device, dtype=dtype)) == 1

    # Start from "everything blocked" and open up the allowed pairs
    all_blocked = torch.full((1, 1, T_q, T_k), float('-inf'), device=device, dtype=dtype)
    return all_blocked.masked_fill(visible, 0.0)

In [4]:
class TinyMultiHeadAttention(nn.Module):
    """Bare-bones multi-head self-attention (no dropout, no bias terms).

    The input is projected to queries, keys and values; each projection is cut
    into ``num_heads`` equal slices; scaled dot-product attention runs on every
    slice; the per-head results are concatenated and passed through an output
    projection.
    """
    def __init__(self, d_model: int, num_heads: int):
        super().__init__()
        # Every head must receive the same number of features
        head_width, leftover = divmod(d_model, num_heads)
        assert leftover == 0
        self.d_model = d_model      # total feature width
        self.num_heads = num_heads  # number of attention heads
        self.d_k = head_width       # feature width handled by one head

        # Bias-free projections; each maps d_model -> d_model and is split into heads afterwards
        self.W_q = nn.Linear(d_model, d_model, bias=False)  # queries
        self.W_k = nn.Linear(d_model, d_model, bias=False)  # keys
        self.W_v = nn.Linear(d_model, d_model, bias=False)  # values

        # Mixes the concatenated head outputs back into d_model features
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def split_heads(self, X):
        """Reshape (B, T, D) into per-head form (B, h, T, d_k).

        Args:
            X: Tensor of shape (B, T, D) with D == num_heads * d_k.
        Returns:
            Tensor of shape (B, h, T, d_k).
        """
        batch, steps, _ = X.shape
        # Cut the feature axis into (heads, d_k), then move heads ahead of time
        per_head = X.view(batch, steps, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, X):
        """Undo ``split_heads``: merge (B, h, T, d_k) back into (B, T, D).

        Args:
            X: Tensor of shape (B, h, T, d_k).
        Returns:
            Tensor of shape (B, T, h * d_k).
        """
        batch, heads, steps, width = X.shape
        # After the transpose the memory layout no longer matches the shape,
        # so a contiguous copy is needed before the flattening view.
        time_major = X.transpose(1, 2).contiguous()
        return time_major.view(batch, steps, heads * width)

    def forward(self, X, mask=None):
        """Run self-attention over ``X``.

        Args:
            X: Input tensor of shape (B, T, D=d_model).
            mask: Optional mask forwarded unchanged to
                scaled_dot_product_attention.

        Returns:
            A pair ``(out_proj, attn)``: the projected output of shape
            (B, T, D) and the attention weights of shape (B, h, T, T).
        """
        # Project, then split: each of Q, K, V ends up as (B, h, T, d_k)
        Q, K, V = (
            self.split_heads(projection(X))
            for projection in (self.W_q, self.W_k, self.W_v)
        )

        # Attention is evaluated for all heads at once
        per_head_out, attn = scaled_dot_product_attention(Q, K, V, mask)

        # Concatenate the heads and apply the output projection
        return self.W_o(self.combine_heads(per_head_out)), attn

# Sanity check
# Smoke test: push one random batch through a fresh attention module and report shapes
B, T, D, h = 2, 5, 32, 4  # batch, time, model width, heads
x = torch.randn(B, T, D)
mha = TinyMultiHeadAttention(d_model=D, num_heads=h)
y, attn = mha(x)  # no mask: every position may attend to every other
print('Tiny MHA out shape:', y.shape, '| attn:', attn.shape)


Tiny MHA out shape: torch.Size([2, 5, 32]) | attn: torch.Size([2, 4, 5, 5])


LOAD DATA

In [5]:
# from torch.utils.data import DataLoader, TensorDataset

# # Hyperparameters for data + batching
# block_size = 128            # tokens per training example
# batch_size = 16
# max_train_sequences = 4000  # total training chunks (4000 * 128 = 512k tokens)
# max_val_sequences = 512     # total validation chunks
# max_train_docs = 60000      # number of WikiText-2 lines to concatenate for training
# max_val_docs = 6000

# # Reuse the WikiText-2 dataset already loaded above (variable `ds`)
# def build_corpus(split, max_docs):
#     subset = split.select(range(min(len(split), max_docs)))
#     return "\n\n".join(subset["text"])

# train_text = build_corpus(ds["train"], max_docs=max_train_docs)
# val_text = build_corpus(ds["validation"], max_docs=max_val_docs)

# # GPT-2 tokenizer is byte-level BPE; reuse it for consistency with modern decoders
# tokenizer = AutoTokenizer.from_pretrained("gpt2", model_max_length=block_size)
# tokenizer.pad_token = tokenizer.eos_token

# max_train_tokens = block_size * max_train_sequences
# max_val_tokens = block_size * max_val_sequences

# def ids_to_chunks(token_ids, max_tokens):
#     usable = min(len(token_ids), max_tokens)
#     usable = (usable // block_size) * block_size
#     tensor = torch.tensor(token_ids[:usable], dtype=torch.long)
#     return tensor.view(-1, block_size)

# train_ids = tokenizer(train_text, add_special_tokens=False, return_attention_mask=False)["input_ids"]
# val_ids = tokenizer(val_text, add_special_tokens=False, return_attention_mask=False)["input_ids"]

# train_chunks = ids_to_chunks(train_ids, max_train_tokens)
# val_chunks = ids_to_chunks(val_ids, max_val_tokens)

# print(f"Prepared {train_chunks.shape[0]} train chunks ({train_chunks.numel():,} tokens)")
# print(f"Prepared {val_chunks.shape[0]} val chunks   ({val_chunks.numel():,} tokens)")

# train_dataset = TensorDataset(train_chunks)
# val_dataset = TensorDataset(val_chunks)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)



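Using a byte-level BPE tokenizer (this variant is kept commented out; the SentencePiece BPE cell that follows is the active one)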

In [None]:
# from tokenizers import ByteLevelBPETokenizer
# from torch.utils.data import DataLoader, TensorDataset
# import torch
# import os

# block_size = 128
# batch_size = 16
# max_train_sequences = 20000
# max_val_sequences = 1024
# max_train_docs = 200000
# max_val_docs = 10000
# dropout = 0.1

# def build_corpus(split, max_docs):
#     subset = split.select(range(min(len(split), max_docs)))
#     return "\n\n".join(subset["text"])

# train_text = build_corpus(ds["train"], max_docs=max_train_docs)
# val_text = build_corpus(ds["validation"], max_docs=max_val_docs)

# # Save text temporarily so ByteLevelBPETokenizer can read it
# os.makedirs("tokenizer_data", exist_ok=True)
# with open("tokenizer_data/train.txt", "w", encoding="utf-8") as f:
#     f.write(train_text)
# with open("tokenizer_data/val.txt", "w", encoding="utf-8") as f:
#     f.write(val_text)

# tokenizer = ByteLevelBPETokenizer()

# tokenizer.train(
#     files=["tokenizer_data/train.txt"],
#     vocab_size=50257,
#     min_frequency=2,
#     special_tokens=[
#         "<unk>",
#         "<pad>",
#         "<bos>",
#         "<eos>",
#     ],
# )

# def encode_text(text):
#     return tokenizer.encode(text).ids

# train_ids = encode_text(train_text)
# val_ids = encode_text(val_text)

# max_train_tokens = block_size * max_train_sequences
# max_val_tokens = block_size * max_val_sequences

# def ids_to_chunks(token_ids, max_tokens):
#     usable = min(len(token_ids), max_tokens)
#     usable = (usable // block_size) * block_size
#     tensor = torch.tensor(token_ids[:usable], dtype=torch.long)
#     return tensor.view(-1, block_size)

# train_chunks = ids_to_chunks(train_ids, max_train_tokens)
# val_chunks = ids_to_chunks(val_ids, max_val_tokens)

# print(f"Prepared {train_chunks.shape[0]} train chunks ({train_chunks.numel():,} tokens)")
# print(f"Prepared {val_chunks.shape[0]} val chunks   ({val_chunks.numel():,} tokens)")

# train_dataset = TensorDataset(train_chunks)
# val_dataset = TensorDataset(val_chunks)

# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)


Prepared 17611 train chunks (2,254,208 tokens)
Prepared 1024 val chunks   (131,072 tokens)


In [20]:
import sentencepiece as spm
from torch.utils.data import DataLoader, TensorDataset
import torch
import os

# Data / batching hyperparameters (notebook-level globals read by later cells)
block_size = 128  # tokens per chunk; also passed to MiniGPT as its context length
batch_size = 16  # chunks per DataLoader batch
max_train_sequences = 20000  # cap on training chunks (token budget = block_size * this)
max_val_sequences = 1024  # cap on validation chunks
max_train_docs = 200000  # max dataset rows joined into the training corpus
max_val_docs = 10000  # max dataset rows joined into the validation corpus
dropout = 0.0  # dropout probability read by DecoderBlock when it is constructed

def build_corpus(split, max_docs):
    """Concatenate the ``"text"`` field of the first rows of a dataset split.

    Args:
        split: Dataset split supporting ``len()``, ``.select(indices)`` and
            column access via ``["text"]``.
        max_docs: Upper bound on the number of rows to include.

    Returns:
        A single string with the selected rows separated by blank lines.
    """
    n_docs = min(len(split), max_docs)
    documents = split.select(range(n_docs))["text"]
    return "\n\n".join(documents)

train_text = build_corpus(ds["train"], max_docs=max_train_docs)
val_text = build_corpus(ds["validation"], max_docs=max_val_docs)

# SentencePiece trains from files on disk, so both corpora are dumped as UTF-8 text
os.makedirs("tokenizer_data", exist_ok=True)
for _corpus_path, _corpus_text in (
    ("tokenizer_data/train.txt", train_text),
    ("tokenizer_data/val.txt", val_text),
):
    with open(_corpus_path, "w", encoding="utf-8") as f:
        f.write(_corpus_text)

# ------------------------------------------------------------
# Train SentencePiece tokenizer (on the training split only)
# ------------------------------------------------------------
spm.SentencePieceTrainer.Train(
    input="tokenizer_data/train.txt",
    model_prefix="spm_model",       # trainer output prefix; "spm_model.model" is loaded below
    vocab_size=50257,               # adjust as needed
    character_coverage=1.0,         # 1.0 for English-only corpora
    model_type="bpe",               # options: unigram | bpe | word | char
    # Fixed ids for the special tokens
    unk_id=0,
    pad_id=1,
    bos_id=2,
    eos_id=3,
    user_defined_symbols=[],
)

# Load the freshly trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load("spm_model.model")

# ------------------------------------------------------------
# Encoding function using SentencePiece
# ------------------------------------------------------------
def encode_text(text):
    """Tokenize ``text`` with the notebook-level SentencePiece processor ``sp``.

    Returns whatever ``sp.encode`` yields when asked for integer ids
    (``out_type=int``).
    """
    token_ids = sp.encode(text, out_type=int)
    return token_ids

# Token ids for both corpora (training text is encoded first, then validation)
train_ids, val_ids = encode_text(train_text), encode_text(val_text)

# Token budgets: at most `max_*_sequences` chunks of `block_size` tokens each
max_train_tokens, max_val_tokens = (
    block_size * max_train_sequences,
    block_size * max_val_sequences,
)

def ids_to_chunks(token_ids, max_tokens, chunk_len=None):
    """Cut a flat token-id sequence into fixed-length, non-overlapping rows.

    Only the first ``max_tokens`` ids are considered, and any tail that does
    not fill a complete row is dropped.

    Args:
        token_ids: Flat sequence of integer token ids.
        max_tokens: Upper bound on the number of ids to use.
        chunk_len: Row length. Defaults to the notebook-level ``block_size``
            when omitted, which matches the previous behaviour.

    Returns:
        ``torch.long`` tensor of shape (n_chunks, chunk_len). When fewer than
        ``chunk_len`` ids are available the result has shape (0, chunk_len)
        rather than raising.

    Raises:
        ValueError: If ``chunk_len`` is not positive.
    """
    if chunk_len is None:
        chunk_len = block_size
    if chunk_len <= 0:
        raise ValueError(f"chunk_len must be positive, got {chunk_len}")

    available = max(0, min(len(token_ids), max_tokens))
    n_chunks = available // chunk_len
    tensor = torch.tensor(token_ids[:n_chunks * chunk_len], dtype=torch.long)
    # The row count is passed explicitly: view(-1, chunk_len) is ambiguous (and
    # raises) when the tensor has zero elements.
    return tensor.view(n_chunks, chunk_len)

# Reshape the flat id streams into (n_chunks, block_size) tensors
train_chunks, val_chunks = (
    ids_to_chunks(train_ids, max_train_tokens),
    ids_to_chunks(val_ids, max_val_tokens),
)

print(f"Prepared {train_chunks.shape[0]} train chunks ({train_chunks.numel():,} tokens)")
print(f"Prepared {val_chunks.shape[0]} val chunks   ({val_chunks.numel():,} tokens)")

# Each dataset item is a 1-tuple holding one chunk of token ids
train_dataset, val_dataset = TensorDataset(train_chunks), TensorDataset(val_chunks)

# Training batches are shuffled and the last partial batch is dropped;
# validation keeps dataset order and every batch
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


Prepared 17095 train chunks (2,188,160 tokens)
Prepared 1024 val chunks   (131,072 tokens)


In [21]:
class DecoderBlock(nn.Module):
    """Transformer decoder block with pre-norm residual layout."""
    def __init__(self, d_model: int, num_heads: int, mlp_ratio: int = 4):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = TinyMultiHeadAttention(d_model, num_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, mlp_ratio * d_model),
            nn.GELU(),
            nn.Linear(mlp_ratio * d_model, d_model),
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        attn_out, attn_weights = self.attn(self.ln1(x), mask)
        x = x + self.dropout(attn_out)
        # x = x + attn_out
        ff_out = self.ff(self.ln2(x))
        x = x + self.dropout(ff_out)
        # x = x + ff_out
        return x, attn_weights


class MiniGPT(nn.Module):
    """Compact decoder-only transformer for language modeling.

    Token embeddings plus fixed sinusoidal position encodings feed a stack of
    ``DecoderBlock`` layers, followed by a final LayerNorm and a bias-free
    linear head producing vocabulary logits.
    """
    def __init__(self, vocab_size: int, d_model: int = 256, num_layers: int = 4,
                 num_heads: int = 4, block_size: int = 128):
        super().__init__()
        self.block_size = block_size  # maximum context length seen by forward()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        pe = sinusoidal_positional_encoding(block_size, d_model)
        # Buffer (not a parameter): moves with .to(device) but is never trained
        self.register_buffer("pos_emb", pe.unsqueeze(0))  # (1, block_size, d_model)
        self.blocks = nn.ModuleList([
            DecoderBlock(d_model, num_heads) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx, return_attn: bool = False):
        """Compute next-token logits for every position of ``idx``.

        Args:
            idx: Integer token ids of shape (B, T). If T exceeds
                ``block_size`` only the last ``block_size`` tokens are used,
                so the output then covers just those positions.
            return_attn: When True, also return one attention map per layer.

        Returns:
            ``(logits, attn_maps)`` where logits has shape (B, T', vocab_size)
            and ``attn_maps`` is a list of per-layer attention weights, or
            ``None`` when ``return_attn`` is False.
        """
        B, T = idx.shape
        if T > self.block_size:
            # Keep only the most recent tokens that fit into the context window
            idx = idx[:, -self.block_size:]
            T = idx.shape[1]
        tok = self.token_emb(idx)
        pos = self.pos_emb[:, :T, :]
        x = tok + pos
        mask = causal_mask(T, T, device=idx.device, dtype=tok.dtype)
        attn_maps = []
        for block in self.blocks:
            x, attn = block(x, mask)
            if return_attn:
                attn_maps.append(attn)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if return_attn:
            return logits, attn_maps
        return logits, None

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=50, temperature=1.0, top_k=None, eos_id=None):
        """Sample a continuation of ``idx`` one token at a time.

        Args:
            idx: Prompt token ids of shape (B, T).
            max_new_tokens: Maximum number of tokens to append.
            temperature: Positive softmax temperature applied to the logits.
            top_k: If given, sample only among the ``top_k`` most likely
                tokens (clamped to the vocabulary size).
            eos_id: Token id that ends a sequence. Defaults to the EOS id of
                the notebook-level SentencePiece processor ``sp``.

        Returns:
            Tensor of shape (B, T + n) with 1 <= n <= max_new_tokens (n == 0
            only if ``max_new_tokens`` is 0). Generation stops early once every
            sequence in the batch has produced EOS; sequences that finish
            earlier than others are padded with ``eos_id``.

        Raises:
            ValueError: If ``temperature`` is not positive or ``top_k`` < 1.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")
        if eos_id is None:
            eos_id = sp.eos_id()  # SentencePiece special token

        # Sample in eval mode (dropout off) and restore the caller's mode afterwards
        was_training = self.training
        self.eval()
        try:
            # Per-sequence "has emitted EOS" flags; a scalar .item() check would
            # fail for batches with more than one sequence
            finished = torch.zeros(idx.size(0), dtype=torch.bool, device=idx.device)

            for _ in range(max_new_tokens):
                logits, _attn = self(idx)
                logits = logits[:, -1, :] / temperature

                if top_k is not None:
                    # topk raises when k exceeds the vocabulary size
                    k = min(top_k, logits.size(-1))
                    kth_best = torch.topk(logits, k).values[:, [-1]]
                    logits = logits.masked_fill(logits < kth_best, float("-inf"))

                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
                # Sequences that already ended keep receiving EOS as padding
                next_id = next_id.masked_fill(finished.unsqueeze(1), eos_id)
                idx = torch.cat([idx, next_id], dim=1)

                # STOP once every sequence has produced EOS
                finished = finished | (next_id.squeeze(1) == eos_id)
                if bool(finished.all()):
                    break
        finally:
            self.train(was_training)

        return idx


In [22]:
# Ask PyTorch's CUDA caching allocator to release unused cached memory before the
# model below is built (useful when this notebook is re-run in the same kernel)
torch.cuda.empty_cache()

In [23]:
# Model / optimisation hyperparameters (notebook-level globals; also echoed by the training cell)
d_model = 512  # width of the residual stream
num_layers = 6  # number of stacked DecoderBlock layers
num_heads = 8  # attention heads per layer (d_model // num_heads = 64 features per head)
learning_rate = 1e-4  # AdamW step size
num_epochs = 10  # full passes over train_loader


# Earlier variant that sized the vocabulary from the byte-level BPE tokenizer:
# model = MiniGPT(vocab_size=tokenizer.get_vocab_size(), d_model=d_model,
#                 num_layers=num_layers, num_heads=num_heads,
#                 block_size=block_size).to(DEVICE)
# The vocabulary size is taken from the trained SentencePiece model so the embedding
# table and output head match the ids produced by encode_text
model = MiniGPT(vocab_size=sp.get_piece_size(), d_model=d_model,
                num_layers=num_layers, num_heads=num_heads,
                block_size=block_size).to(DEVICE)
# AdamW over all model parameters with a small decoupled weight decay
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)


def compute_loss(logits, targets):
    """Next-token cross-entropy between ``logits`` and the shifted ``targets``.

    The logits at position t are scored against the token at position t + 1,
    so the last logit and the first target of every sequence are dropped.

    Args:
        logits: Tensor of shape (B, T, vocab_size).
        targets: Integer token ids of shape (B, T).

    Returns:
        Scalar tensor with the mean cross-entropy over all B * (T - 1) pairs.
    """
    vocab_size = logits.size(-1)
    predictions = logits[:, :-1, :].reshape(-1, vocab_size)  # positions 0 .. T-2
    next_tokens = targets[:, 1:].reshape(-1)                 # tokens    1 .. T-1
    return F.cross_entropy(predictions, next_tokens)


def run_epoch(loader, train: bool = True):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``DEVICE`` and
    ``compute_loss``.

    Args:
        loader: Iterable yielding 1-tuples ``(batch,)`` of token-id tensors.
        train: When truthy the model is put in training mode and updated after
            every batch (gradient norm clipped to 1.0); otherwise the model is
            put in eval mode and no gradients are computed.

    Returns:
        Average of the per-batch losses as a float (0.0 for an empty loader).
    """
    # Same effect as calling model.train() or model.eval()
    model.train(mode=bool(train))

    batch_losses = []
    for (batch,) in loader:
        batch = batch.to(DEVICE)
        if train:
            optimizer.zero_grad(set_to_none=True)

        # Autograd is switched on only for training passes
        with torch.set_grad_enabled(train):
            logits, _ = model(batch)
            loss = compute_loss(logits, batch)  # the batch serves as its own (shifted) target

        if train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        batch_losses.append(loss.item())

    # max(..., 1) avoids dividing by zero when the loader yields nothing
    return sum(batch_losses) / max(len(batch_losses), 1)

# Echo the run configuration so the logged losses below can be tied to these settings
print(f'PARAMS:\n\
        d_model = {d_model}\n\
        num_layers = {num_layers}\n\
        num_heads = {num_heads}\n\
        learning_rate = {learning_rate}\n\
        dropout = {dropout}')
# One (train_loss, val_loss, val_ppl) tuple is appended per epoch
history = []
for epoch in range(1, num_epochs + 1):
    # Train first, then evaluate; run_epoch leaves the model in eval mode after the validation pass
    train_loss = run_epoch(train_loader, train=True)
    val_loss = run_epoch(val_loader, train=False)
    # Perplexity is the exponential of the mean cross-entropy returned by run_epoch
    val_ppl = math.exp(val_loss)
    history.append((train_loss, val_loss, val_ppl))
    print(f"Epoch {epoch}: train_loss={train_loss:.3f} | val_loss={val_loss:.3f} | val_ppl={val_ppl:.1f}")


PARAMS:
        d_model = 512
        num_layers = 6
        num_heads = 8
        learning_rate = 0.0001
        dropout = 0.0
Epoch 1: train_loss=6.068 | val_loss=5.104 | val_ppl=164.6
Epoch 2: train_loss=5.302 | val_loss=4.820 | val_ppl=124.0
Epoch 3: train_loss=4.923 | val_loss=4.651 | val_ppl=104.7
Epoch 4: train_loss=4.617 | val_loss=4.546 | val_ppl=94.2
Epoch 5: train_loss=4.345 | val_loss=4.485 | val_ppl=88.7
Epoch 6: train_loss=4.087 | val_loss=4.462 | val_ppl=86.7
Epoch 7: train_loss=3.832 | val_loss=4.478 | val_ppl=88.1
Epoch 8: train_loss=3.572 | val_loss=4.529 | val_ppl=92.7
Epoch 9: train_loss=3.304 | val_loss=4.609 | val_ppl=100.4
Epoch 10: train_loss=3.024 | val_loss=4.733 | val_ppl=113.6


In [18]:
# gpt tokenizer

# prompt = "The history of natural language processing"
# input_ids = tokenizer(prompt, return_tensors='pt')["input_ids"].to(DEVICE)
# with torch.no_grad():
#     generated_ids = model.generate(input_ids, max_new_tokens=50, temperature=0.9, top_k=50)
# print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))


In [None]:
# bytelevelbpe tokenizer

# prompt = "The history of natural language processing"

# bos_id = tokenizer.token_to_id("<bos>")
# prompt_ids = [bos_id] + tokenizer.encode(prompt).ids
# input_ids = torch.tensor([prompt_ids], device=DEVICE)

# with torch.no_grad():
#     generated_ids = model.generate(
#         input_ids,
#         max_new_tokens=50,
#         temperature=0.9,
#         top_k=50
#     )
# print(
#     tokenizer.decode(generated_ids[0].tolist(), skip_special_tokens=True)
# )


The history of natural language processing , and in February 2008 . She was the first of the  (  ) , who played five of the ten @-@ year @-@ old  . She performed for the song on The A Rush of The New York Times Square , as well


In [25]:
# sentencepiece tokenizer

prompt = "The history of natural language processing"

# Encode the prompt the same way the training stream was encoded: encode_text
# calls sp.encode(text, out_type=int) without requesting BOS/EOS, so the BOS id
# never occurs in the training data and its embedding is never trained.
# Prepending it here would condition the model on an untrained vector.
prompt_ids = sp.encode(prompt, out_type=int)

# Explicit integer dtype keeps the tensor usable as embedding indices even for
# an empty prompt (an empty nested list would otherwise default to float)
input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=DEVICE)

# Sample with dropout disabled; MiniGPT.generate is already wrapped in
# torch.no_grad(), so no extra no_grad context is needed here
model.eval()
generated_ids = model.generate(
    input_ids,
    max_new_tokens=50,
    temperature=0.9,
    top_k=50
)

# SentencePiece decode
print(sp.decode(generated_ids[0].tolist()))


The history of natural language processing in the history of the tale was widely admired ; and , by the time the death of her husband , <unk> her son , was not later influenced by <unk> , who was <unk> to have been the first female victim by the
