##### Input Embeddings and Positional Encoding

In [21]:
import torch
import torch.nn as nn
import math

1. token embedding layer
2. constant positional encoding matrix (sinusoidal pattern)
3. return token + positional embedding

In [None]:
class TokenPositionalEmbedding(nn.Module):
    """Token embedding plus a fixed sinusoidal positional encoding.

    Args:
        vocab_size: number of rows in the token embedding table.
        embed_dim: size of each embedding vector (even or odd).
        max_len: number of positions for which the encoding is precomputed.
    """

    def __init__(self, vocab_size, embed_dim, max_len=512):
        super().__init__()
        # token embedding layer: maps token indices to embedding vectors
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dim = embed_dim

        # constant positional encoding matrix with a sinusoidal pattern
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)

        # one frequency per (sin, cos) pair: ceil(embed_dim / 2) entries
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term  # (max_len, ceil(embed_dim / 2))

        # sine on even columns
        pe[:, 0::2] = torch.sin(angles)

        # cosine on odd columns; for an odd embed_dim there is one fewer odd
        # column than even column, so the last frequency is dropped here
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        # buffer: saved with the model and moved by .to(), but not a parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Embed token indices and add the positional encoding.

        Args:
            x: tensor of token indices, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, embed_dim).

        Raises:
            ValueError: if seq_len exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len} of the positional encoding"
            )

        # (batch_size, seq_len, embed_dim)
        token_emb = self.token_embedding(x)

        # (1, seq_len, embed_dim), broadcast over the batch dimension
        pos_emb = self.pe[:seq_len, :].unsqueeze(0)

        return token_emb + pos_emb

##### Scaled Dot-Product Attention

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None, visualize=False):
    """
    Compute scaled dot-product attention.

    Args:
        query: Tensor of shape (..., seq_len_q, d_k)
        key: Tensor of shape (..., seq_len_k, d_k)
        value: Tensor of shape (..., seq_len_k, d_v)
        mask: (optional) Tensor broadcastable to (..., seq_len_q, seq_len_k), with 0 for masked positions
        visualize: (optional) If True, returns attention weights for visualization

    Returns:
        op: Attention output tensor (..., seq_len_q, d_v)
        attn_weights: (optional) Attention weights (..., seq_len_q, seq_len_k) if visualize=True

    Note:
        A query row whose keys are all masked receives uniform attention
        weights rather than NaN.
    """
    d_k = query.size(-1)  # size of the last dimension of the query

    # raw attention scores: query @ key^T, scaled by sqrt(d_k)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    # Masked positions get the most negative finite value of the dtype, not
    # -inf: softmax over a row that is entirely -inf evaluates to NaN, whereas
    # a finite fill still underflows to weight 0 next to any unmasked score.
    if mask is not None:
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    # normalized attention weights over the key dimension
    attn_weights = torch.softmax(scores, dim=-1)

    # weighted sum of the value vectors
    op = torch.matmul(attn_weights, value)

    if visualize:
        return op, attn_weights
    return op

##### Multi-head attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Args:
        embed_dim: model width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # linear layers to project input to queries, keys, and values
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        # final linear layer to combine outputs from all heads
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None, visualize=False):
        """Apply multi-head attention.

        Args:
            query: (batch_size, seq_len_q, embed_dim)
            key: (batch_size, seq_len_k, embed_dim)
            value: (batch_size, seq_len_k, embed_dim)
            mask: (optional) 0 marks masked positions. A 3-D mask
                (batch_size or 1, seq_len_q, seq_len_k) gets a head dimension
                inserted; masks of any other rank are passed through and must
                already broadcast to (batch_size, num_heads, seq_len_q, seq_len_k).
            visualize: (optional) if True, also return the attention weights.

        Returns:
            output: (batch_size, seq_len_q, embed_dim)
            attn_weights: (batch_size, num_heads, seq_len_q, seq_len_k), only
                if visualize=True
        """
        batch_size, q_len, _ = query.size()

        def split_heads(x):
            # (batch_size, len, embed_dim) -> (batch_size, num_heads, len, head_dim)
            # Use the tensor's own length: key/value may differ in length from
            # the query (cross-attention over encoder outputs).
            return x.view(x.size(0), x.size(1), self.num_heads, self.head_dim).transpose(1, 2)

        # project inputs to multi-head Q, K, V
        q = split_heads(self.q_proj(query))
        k = split_heads(self.k_proj(key))
        v = split_heads(self.v_proj(value))

        # insert the head dimension only for 3-D masks
        if mask is not None and mask.dim() == 3:
            mask = mask.unsqueeze(1)  # (batch_size, 1, seq_len_q, seq_len_k)

        # attention output (and optionally attention weights) for all heads
        attn_output = scaled_dot_product_attention(q, k, v, mask=mask, visualize=visualize)
        if visualize:
            attn_output, attn_weights = attn_output

        # concatenate heads: (batch_size, seq_len_q, embed_dim)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, q_len, self.embed_dim)

        # final linear projection
        output = self.out_proj(attn_output)

        if visualize:
            return output, attn_weights
        return output

##### Feed-Forward Networks and Layer Normalization

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes linear2(dropout(relu(linear1(x)))).

    Args:
        embed_dim: input and output feature size.
        ffn_dim: hidden feature size.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self, embed_dim, ffn_dim, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=embed_dim, out_features=ffn_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(in_features=ffn_dim, out_features=embed_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        # expand to the hidden width, apply the non-linearity, regularize, project back
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        embed_dim: size of the normalized (last) dimension.
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, embed_dim, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer_out):
        # dropout touches only the sublayer branch; the skip path is left intact
        residual = x + self.dropout(sublayer_out)
        return self.norm(residual)

##### Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each wrapped in Add & Norm.

    Args:
        embed_dim: model width.
        num_heads: number of attention heads.
        ffn_dim: hidden width of the feed-forward network.
        dropout: dropout probability used by the feed-forward and Add & Norm layers.
    """

    def __init__(self, embed_dim, num_heads, ffn_dim, dropout=0.1):
        super().__init__()
        # sublayer 1: multi-head self-attention + Add & Norm
        self.self_attn = MultiHeadAttention(embed_dim, num_heads)
        self.addnorm1 = AddNorm(embed_dim, dropout)

        # sublayer 2: position-wise feed-forward + Add & Norm
        self.ffn = PositionwiseFeedForward(embed_dim, ffn_dim, dropout)
        self.addnorm2 = AddNorm(embed_dim, dropout)

    def forward(self, x, mask=None, visualize=False):
        """Run the block on ``x``.

        Returns the transformed tensor, or ``(tensor, attn_weights)`` when
        ``visualize`` is truthy.
        """
        attended = self.self_attn(x, x, x, mask=mask, visualize=visualize)
        if visualize:
            # the attention module returned (output, weights)
            attended, attn_weights = attended

        # residual + norm around self-attention
        x = self.addnorm1(x, attended)

        # residual + norm around the feed-forward network
        x = self.addnorm2(x, self.ffn(x))

        return (x, attn_weights) if visualize else x

##### Decoder Layer with Masked Attention

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward,
    each followed by Add & Norm.

    Args:
        embed_dim: model width.
        num_heads: number of attention heads.
        ffn_dim: hidden width of the feed-forward network.
        dropout: dropout probability used by the feed-forward and Add & Norm layers.
    """

    def __init__(self, embed_dim, num_heads, ffn_dim, dropout=0.1):
        super().__init__()
        # self-attention over the decoder input (made causal by tgt_mask)
        self.self_attn = MultiHeadAttention(embed_dim, num_heads)
        self.addnorm1 = AddNorm(embed_dim, dropout)

        # cross-attention: queries from the decoder, keys/values from the encoder
        self.cross_attn = MultiHeadAttention(embed_dim, num_heads)
        self.addnorm2 = AddNorm(embed_dim, dropout)

        # position-wise feed-forward network
        self.ffn = PositionwiseFeedForward(embed_dim, ffn_dim, dropout)
        self.addnorm3 = AddNorm(embed_dim, dropout)

    def forward(self, x, enc_out, tgt_mask=None, memory_mask=None, visualize=False):
        """
        Args:
            x: (batch_size, tgt_seq_len, embed_dim) - decoder input
            enc_out: (batch_size, src_seq_len, embed_dim) - encoder output
            tgt_mask: (optional) mask for the self-attention step (e.g. causal mask)
            memory_mask: (optional) mask for the encoder-decoder attention step
            visualize: (optional) if True, also return attention weights
        Returns:
            output: (batch_size, tgt_seq_len, embed_dim)
            attn_weights: (optional) dict with keys 'self_attn' and 'cross_attn'
                if visualize=True
        """
        attn_weights = {}

        # step 1: self-attention restricted by tgt_mask, then Add & Norm
        attended = self.self_attn(x, x, x, mask=tgt_mask, visualize=visualize)
        if visualize:
            attended, attn_weights['self_attn'] = attended
        x = self.addnorm1(x, attended)

        # step 2: attend to the encoder output, then Add & Norm
        crossed = self.cross_attn(x, enc_out, enc_out, mask=memory_mask, visualize=visualize)
        if visualize:
            crossed, attn_weights['cross_attn'] = crossed
        x = self.addnorm2(x, crossed)

        # step 3: position-wise feed-forward, then Add & Norm
        x = self.addnorm3(x, self.ffn(x))

        return (x, attn_weights) if visualize else x

##### Transformer Model

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer producing vocabulary logits.

    Args:
        vocab_size: vocabulary size shared by the source/target embeddings and the output layer.
        embed_dim: model width.
        num_heads: attention heads per layer.
        ffn_dim: hidden width of each feed-forward network.
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        max_len: number of positions precomputed by the positional encoding.
        dropout: dropout probability passed to every layer.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, ffn_dim, num_encoder_layers, num_decoder_layers, max_len=512, dropout=0.1):
        super().__init__()
        # embedding layers for source and target sequences (with positional encoding)
        self.src_embedding = TokenPositionalEmbedding(vocab_size, embed_dim, max_len)
        self.target_embedding = TokenPositionalEmbedding(vocab_size, embed_dim, max_len)

        # stack of encoder layers
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(embed_dim, num_heads, ffn_dim, dropout)
            for _ in range(num_encoder_layers)
        ])

        # stack of decoder layers
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(embed_dim, num_heads, ffn_dim, dropout)
            for _ in range(num_decoder_layers)
        ])

        # final linear layer to project decoder output to vocabulary logits
        self.output_proj = nn.Linear(embed_dim, vocab_size)

        # initialize parameters (weights)
        self._reset_parameters()

    def _reset_parameters(self):
        # xavier uniform initialization for every parameter with more than 1 dimension
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, target, src_mask=None, target_mask=None, memory_mask=None, visualize=False, tgt_mask=None):
        """
        Args:
            src: (batch_size, src_seq_len) - source token indices
            target: (batch_size, target_seq_len) - target token indices
            src_mask: (optional) mask for source sequence (e.g., padding mask)
            target_mask: (optional) mask for target sequence (e.g., causal mask)
            memory_mask: (optional) mask for encoder-decoder attention
            visualize: (optional) if True, returns attention weights for analysis
            tgt_mask: (optional) alias for ``target_mask``, accepted because
                the decoder layer uses this name for the same mask
        Returns:
            logits: (batch_size, target_seq_len, vocab_size)
            attn_weights: (optional) list with one dict per layer, keyed
                'encoder' or 'decoder', if visualize=True
        Raises:
            ValueError: if both ``target_mask`` and ``tgt_mask`` are given.
        """
        if tgt_mask is not None:
            if target_mask is not None:
                raise ValueError("pass either target_mask or tgt_mask, not both")
            target_mask = tgt_mask

        attn_weights = [] if visualize else None

        # 1. embed source and target tokens (add positional encoding)
        src_emb = self.src_embedding(src)  # (batch_size, src_seq_len, embed_dim)
        target_emb = self.target_embedding(target)  # (batch_size, target_seq_len, embed_dim)

        # 2. pass source embeddings through encoder stack
        enc_out = src_emb
        for layer in self.encoder_layers:
            if visualize:
                enc_out, enc_attn = layer(enc_out, mask=src_mask, visualize=True)
                attn_weights.append({'encoder': enc_attn})
            else:
                enc_out = layer(enc_out, mask=src_mask)

        # 3. pass target embeddings and encoder output through decoder stack.
        #    DecoderLayer.forward names its self-attention mask `tgt_mask`.
        dec_out = target_emb
        for layer in self.decoder_layers:
            if visualize:
                dec_out, dec_attn = layer(dec_out, enc_out, tgt_mask=target_mask, memory_mask=memory_mask, visualize=True)
                attn_weights.append({'decoder': dec_attn})
            else:
                dec_out = layer(dec_out, enc_out, tgt_mask=target_mask, memory_mask=memory_mask)

        # 4. project decoder output to vocabulary logits for each position
        logits = self.output_proj(dec_out)  # (batch_size, target_seq_len, vocab_size)

        if visualize:
            return logits, attn_weights
        return logits

In [28]:
# Model hyperparameters
vocab_size, embed_dim, num_heads, ffn_dim = 10000, 512, 8, 2048
num_encoder_layers = num_decoder_layers = 6
max_len, dropout = 512, 0.1

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the Transformer and place it on the selected device
model = Transformer(
    vocab_size,
    embed_dim,
    num_heads,
    ffn_dim,
    num_encoder_layers,
    num_decoder_layers,
    max_len=max_len,
    dropout=dropout,
).to(device)

In [33]:
# !pip install datasets

In [None]:
# Use a small real dataset from HuggingFace Datasets (e.g., "ag_news" for demonstration)
from datasets import load_dataset
from collections import Counter
from torch.utils.data import DataLoader

# Fetch only the first 32 training examples of AG News so the demo stays quick
dataset = load_dataset(path="ag_news", split="train[:32]")

def simple_tokenizer(text):
    """Lowercase ``text`` and split it on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Count every token the tokenizer produces across the dataset
counter = Counter(
    token
    for item in dataset
    for token in simple_tokenizer(item['text'])
)

# Special tokens are listed ahead of the alphabetically sorted corpus tokens
specials = ['<unk>', '<pad>', '<bos>', '<eos>']
itos = [*specials, *sorted(counter)]
stoi = dict(zip(itos, range(len(itos))))

# Ids of the special tokens used for padding and sequence boundaries
PAD_IDX, BOS_IDX, EOS_IDX = stoi['<pad>'], stoi['<bos>'], stoi['<eos>']

def encode(text, seq_len=32):
    """Convert ``text`` to a LongTensor of exactly ``seq_len`` token ids.

    The ids are <bos>, the tokens of ``text`` (unknown tokens map to <unk>),
    then <eos>. Longer sequences are cut to ``seq_len`` (which can drop
    <eos>); shorter ones are right-padded with <pad>.
    """
    unk_idx = stoi['<unk>']
    ids = [BOS_IDX]
    ids.extend(stoi.get(tok, unk_idx) for tok in simple_tokenizer(text))
    ids.append(EOS_IDX)

    # truncate first (a no-op for short inputs), then pad up to seq_len
    ids = ids[:seq_len]
    ids.extend([PAD_IDX] * (seq_len - len(ids)))
    return torch.tensor(ids, dtype=torch.long)

BATCH_SIZE, SEQ_LEN = 4, 16

# Encode every example to SEQ_LEN ids and stack them into one 2-D tensor
data = torch.stack([encode(item['text'], seq_len=SEQ_LEN) for item in dataset])
train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

import torch.nn.functional as F
import torch.optim as optim

model.train()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

for batch in train_loader:
    batch = batch.to(device)
    # Next-token setup: the decoder reads tokens [0, L-1) and is scored
    # against tokens [1, L).
    # NOTE(review): src is the same slice as tgt and no src_mask is passed,
    # so the encoder sees the tokens the decoder must predict — confirm this
    # is intended for the demo.
    src = batch[:, :-1]
    tgt = batch[:, :-1]
    tgt_y = batch[:, 1:]

    # Causal mask sized from the actual decoder input, not the global SEQ_LEN
    tgt_len = tgt.size(1)
    tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device)).unsqueeze(0)

    # Transformer.forward names this argument `target_mask`
    logits = model(src, tgt, target_mask=tgt_mask)
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), tgt_y.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Loss: {loss.item():.4f}")
    break  # Remove this break for full training

# Inference: greedily generate text from a prompt
model.eval()
prompt = "breaking news"
input_ids = [BOS_IDX] + [stoi.get(tok, stoi['<unk>']) for tok in simple_tokenizer(prompt)]
input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
generated = input_tensor

# No gradients are needed for generation
with torch.no_grad():
    for _ in range(SEQ_LEN - len(input_ids)):
        cur_len = generated.size(1)
        tgt_mask = torch.tril(torch.ones((cur_len, cur_len), device=device)).unsqueeze(0)
        # Transformer.forward names this argument `target_mask`
        logits = model(generated, generated, target_mask=tgt_mask)
        # The output layer may be wider than this vocabulary; restrict the
        # argmax to ids that `itos` can decode so the lookup below cannot fail.
        next_token = logits[:, -1, :len(itos)].argmax(-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=1)
        if next_token.item() == EOS_IDX:
            break

# Decode generated tokens
output_tokens = [itos[idx] for idx in generated[0].tolist()]
print("Generated:", " ".join(output_tokens))
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

# Download and preprocess a small text dataset (Penn Treebank from torchtext)

# PennTreebank is used below but is not among the torchtext imports above
from torchtext.datasets import PennTreebank

tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Yield the token list of each text item produced by ``data_iter``."""
    for text in data_iter:
        yield tokenizer(text)


# Load train data and build vocab
train_iter = PennTreebank(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>', '<pad>', '<bos>', '<eos>'])
# Tokens missing from the vocabulary map to <unk>
vocab.set_default_index(vocab['<unk>'])

# Hyperparameters
BATCH_SIZE, SEQ_LEN = 16, 32

# Ids of the special tokens, looked up in the vocabulary
PAD_IDX, BOS_IDX, EOS_IDX = vocab['<pad>'], vocab['<bos>'], vocab['<eos>']

# Encode function
def encode(text):
    """Convert ``text`` to a LongTensor of exactly SEQ_LEN token ids.

    The ids are <bos>, the vocabulary ids of the tokenized text, then <eos>.
    Longer sequences are cut to SEQ_LEN (which can drop <eos>); shorter ones
    are right-padded with <pad>.
    """
    ids = [BOS_IDX, *vocab(tokenizer(text)), EOS_IDX]

    # truncate first (a no-op for short inputs), then pad up to SEQ_LEN
    ids = ids[:SEQ_LEN]
    ids.extend([PAD_IDX] * (SEQ_LEN - len(ids)))
    return torch.tensor(ids, dtype=torch.long)

# Prepare dataset: a fresh iterator over the training split, encoded line by
# line and stacked into a single 2-D tensor
train_iter = PennTreebank(split='train')
data = torch.stack([encode(line) for line in train_iter])
train_loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

# Training loop (1 epoch for demonstration)
import torch.nn.functional as F
import torch.optim as optim

model.train()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

for batch in train_loader:
    batch = batch.to(device)
    # Next-token setup: the decoder reads tokens [0, L-1) and is scored
    # against tokens [1, L).
    # NOTE(review): src is the same slice as tgt and no src_mask is passed,
    # so the encoder sees the tokens the decoder must predict — confirm this
    # is intended for the demo.
    src = batch[:, :-1]
    tgt = batch[:, :-1]
    tgt_y = batch[:, 1:]

    # Causal mask sized from the actual decoder input, not the global SEQ_LEN
    tgt_len = tgt.size(1)
    tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=device)).unsqueeze(0)

    # Transformer.forward names this argument `target_mask`
    logits = model(src, tgt, target_mask=tgt_mask)
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), tgt_y.reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Loss: {loss.item():.4f}")
    break  # Remove this break for full training

# Inference: greedily generate text from a prompt
model.eval()
prompt = "the company"
input_ids = [BOS_IDX] + vocab(tokenizer(prompt))
input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
generated = input_tensor

# Fetch the id -> token table once instead of once per decoded token
id_to_token = vocab.get_itos()

# No gradients are needed for generation
with torch.no_grad():
    for _ in range(SEQ_LEN - len(input_ids)):
        cur_len = generated.size(1)
        tgt_mask = torch.tril(torch.ones((cur_len, cur_len), device=device)).unsqueeze(0)
        # Transformer.forward names this argument `target_mask`
        logits = model(generated, generated, target_mask=tgt_mask)
        # The output layer may be wider than this vocabulary; restrict the
        # argmax to ids that can be decoded so the lookup below cannot fail.
        next_token = logits[:, -1, :len(id_to_token)].argmax(-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=1)
        if next_token.item() == EOS_IDX:
            break

# Decode generated tokens
output_tokens = [id_to_token[idx] for idx in generated[0].tolist()]
print("Generated:", " ".join(output_tokens))