<a href="https://colab.research.google.com/github/jiru1997/AI-agent/blob/main/chapter_20_LLM_loss_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

In [2]:
# Hyperparameters consumed by GPTModel and its sub-modules below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # rows of the token embedding / width of the output head
    "n_positions": 1024,   # rows of the positional embedding and size of the causal mask
    "emb_dim": 768,        # width of every token representation
    "n_layers": 12,        # number of stacked TransformerBlock modules
    "n_heads": 12,         # attention heads per block (must divide emb_dim)
    "drop_rate": 0.1,      # probability used by every nn.Dropout in the model
    "qkv_bias": False,     # whether the query/key/value projections carry a bias
}

In [3]:
class LayerNorm(nn.Module):
  """Layer normalization over the last dimension with learnable scale and shift.

  Every vector along the last axis is standardized to zero mean and unit
  (biased) variance, then multiplied element-wise by ``scale`` and offset by
  ``shift``.
  """

  def __init__(self, emb_dim):
    """Create the learnable parameters.

    Args:
      emb_dim: Size of the last dimension of the tensors to be normalized.
    """
    super().__init__()
    # Added to the variance so the division below never hits zero.
    self.eps = 1e-5
    # Start as the identity transform: scale of ones, shift of zeros.
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self, x):
    """Return ``x`` normalized along its last dimension (same shape as ``x``)."""
    mu = x.mean(dim=-1, keepdim=True)
    # unbiased=False: divide by N rather than N - 1.
    sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
    centered = x - mu
    std = torch.sqrt(sigma2 + self.eps)
    return self.scale * (centered / std) + self.shift

In [4]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of size ``d_out // num_heads``, attended with
    a causal (upper-triangular) mask, and the concatenated heads are passed
    through a final linear projection.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        """Build the projections, dropout and causal mask.

        Args:
            d_in: Size of the last dimension of the input.
            d_out: Total output width across all heads; must be divisible by
                ``num_heads``.
            context_length: Side length of the square causal mask.
            dropout: Drop probability applied to the attention weights.
            num_heads: Number of attention heads.
            qkv_bias: Whether the query/key/value projections use a bias.
        """
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the projected width.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        # Registered as a buffer so it follows the module across devices.
        causal_mask = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape (b, num_tokens, d_in).

        Returns:
            Tensor of shape (b, num_tokens, d_out).
        """
        batch, seq_len, _ = x.shape

        def split_heads(projected):
            # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
            per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
            return per_head.transpose(1, 2)

        k = split_heads(self.W_key(x))
        q = split_heads(self.W_query(x))
        v = split_heads(self.W_value(x))

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens)
        scores = q @ k.transpose(2, 3)

        # Crop the stored mask to the current sequence length and hide the
        # masked entries; -inf becomes a weight of zero after softmax.
        hidden = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(hidden, -torch.inf)

        # Scale by sqrt(head_dim) before normalizing over the key axis.
        weights = torch.softmax(scores / k.shape[-1]**0.5, dim=-1)
        weights = self.dropout(weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        per_token = (weights @ v).transpose(1, 2)

        # Concatenate the heads; d_out == num_heads * head_dim.
        merged = per_token.contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)

In [5]:
class GELU(nn.Module):
  """Tanh approximation of the Gaussian Error Linear Unit activation.

  Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
  element-wise. The output has the same shape, dtype and device as the input.
  """

  # sqrt(2 / pi) kept as a Python float. Building it as a tensor inside
  # forward() allocated a new float32 tensor on every call and rounded the
  # constant to float32 even when the input was float64.
  _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

  def forward(self, x):
    """Apply the activation element-wise to tensor ``x``."""
    inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))

In [6]:
class FeedForward(nn.Module):
  """Feed-forward sub-network: Linear -> GELU -> Linear.

  The hidden layer is four times as wide as the embedding, and the output is
  projected back to ``cfg["emb_dim"]``.
  """

  def __init__(self, cfg):
    """Build the three-stage network.

    Args:
      cfg: Mapping that provides the key ``"emb_dim"``.
    """
    super().__init__()
    emb_dim = cfg["emb_dim"]
    hidden_dim = 4 * emb_dim

    expand = nn.Linear(emb_dim, hidden_dim)
    activation = GELU()
    contract = nn.Linear(hidden_dim, emb_dim)
    self.layers = nn.Sequential(expand, activation, contract)

  def forward(self, x):
    """Run ``x`` through the expand / activate / contract pipeline."""
    return self.layers(x)

In [7]:
class TransformerBlock(nn.Module):
  """One transformer layer: attention and feed-forward, each with a residual.

  Both sub-layers are preceded by their own LayerNorm and followed by the same
  dropout module before being added back to their input.
  """

  def __init__(self, cfg):
    """Build the sub-modules from a config mapping.

    Args:
      cfg: Mapping with the keys ``"emb_dim"``, ``"n_positions"``,
        ``"n_heads"``, ``"qkv_bias"`` and ``"drop_rate"``.
    """
    super().__init__()
    emb_dim = cfg["emb_dim"]
    drop_rate = cfg["drop_rate"]

    # Attention keeps the embedding width unchanged (d_in == d_out).
    self.att = MultiHeadAttention(
        d_in=emb_dim,
        d_out=emb_dim,
        context_length=cfg["n_positions"],
        dropout=drop_rate,
        num_heads=cfg["n_heads"],
        qkv_bias=cfg["qkv_bias"],
    )
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(emb_dim)
    self.norm2 = LayerNorm(emb_dim)
    self.drop_shortcut = nn.Dropout(drop_rate)

  def forward(self, x):
    """Apply the attention and feed-forward sub-layers with residual adds."""
    # Attention sub-layer: normalize, attend, drop, add the input back.
    attended = self.att(self.norm1(x))
    x = x + self.drop_shortcut(attended)

    # Feed-forward sub-layer follows the same pattern.
    transformed = self.ff(self.norm2(x))
    return x + self.drop_shortcut(transformed)

In [8]:
class GPTModel(nn.Module):
  """GPT-style decoder that maps token IDs to next-token logits.

  Token and positional embeddings are summed, passed through a stack of
  TransformerBlock modules and a final LayerNorm, then projected to
  vocabulary-sized logits by a bias-free linear head.
  """

  def __init__(self, config):
    """Build the model from a config mapping.

    Args:
      config: Mapping with the keys ``"vocab_size"``, ``"n_positions"``,
        ``"emb_dim"``, ``"n_layers"`` and ``"drop_rate"``, plus whatever
        TransformerBlock reads from it.
    """
    super().__init__()
    emb_dim = config["emb_dim"]
    vocab_size = config["vocab_size"]

    self.tok_emb = nn.Embedding(vocab_size, emb_dim)
    self.pos_emb = nn.Embedding(config["n_positions"], emb_dim)
    self.drop_emb = nn.Dropout(config["drop_rate"])

    blocks = [TransformerBlock(config) for _ in range(config["n_layers"])]
    self.trf_blocks = nn.Sequential(*blocks)
    self.final_norm = LayerNorm(emb_dim)
    self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

  def forward(self, x):
    """Compute logits for a batch of token IDs.

    Args:
      x: Integer tensor of shape (batch_size, seq_length).

    Returns:
      Tensor of shape (batch_size, seq_length, vocab_size).
    """
    _, seq_length = x.shape
    # Position indices are created on the input's device.
    positions = torch.arange(seq_length, device=x.device)

    hidden = self.tok_emb(x) + self.pos_emb(positions)
    hidden = self.drop_emb(hidden)
    hidden = self.trf_blocks(hidden)
    hidden = self.final_norm(hidden)
    return self.out_head(hidden)

In [9]:
# Build the model from the configuration defined above.
model = GPTModel(GPT_CONFIG_124M)

# Two sequences of four token IDs each.
batch = [
    [6109, 3626, 6100, 345],
    [6109, 1110, 6622, 257],
]
out = model(torch.tensor(batch))
print(out)

tensor([[[-0.0269, -0.1110,  0.6107,  ...,  0.1151,  0.1747, -0.0079],
         [ 0.7574, -0.0585, -0.5831,  ..., -0.3706, -0.3201, -0.2644],
         [-0.7286, -0.9251, -0.0859,  ...,  0.4201,  0.7133,  0.0559],
         [-1.1222,  0.8962,  0.8873,  ...,  0.6075,  0.3478,  0.1706]],

        [[-0.1231,  0.2811,  0.2888,  ..., -0.2556,  0.3092, -0.3788],
         [ 0.2744, -0.0373, -0.2586,  ...,  0.1978, -0.2758, -0.1446],
         [ 0.4581, -0.2585,  0.6123,  ...,  0.6033,  0.7874,  0.5151],
         [ 0.1448,  0.4149,  1.0251,  ..., -0.1461, -0.3394, -0.2725]]],
       grad_fn=<UnsafeViewBackward0>)


In [10]:
# Add up the element count of every parameter tensor registered on the model.
total_params = sum(param.numel() for param in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 163009536


In [11]:
 ! pip3 install tiktoken



In [12]:
# Two input sequences of three token IDs each.
inputs = torch.tensor([[16833, 3626, 6100],
                       [40, 1107, 588]])

# Next-token targets: each row is the matching input row shifted left by one
# position, with one new token ID appended.
targets = torch.tensor([[3626, 6100, 345],
                        [1107, 588, 11311]])

In [13]:
# The model contains nn.Dropout layers and a freshly built module is in
# training mode. Switch to eval mode so dropout is disabled and the logits
# (and the loss derived from them) are deterministic.
model.eval()

# No gradients are needed for inspecting predictions.
with torch.no_grad():
    logits = model(inputs)

probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary
print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)

torch.Size([2, 3, 50257])


In [14]:
# Most probable token ID at every position; keepdim retains a trailing axis of size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(token_ids)

tensor([[[30470],
         [ 3642],
         [46216]],

        [[25897],
         [ 5391],
         [ 2131]]])


In [15]:
# Position indices 0..num_tokens-1, derived from the targets so the lookup
# keeps working if the sequence length changes.
positions = list(range(targets.shape[-1]))

# For each sequence, pick the probability the model assigned to the correct
# next token at every position.
text_id = 0
target_probas_1 = probas[text_id, positions, targets[text_id]]
print(target_probas_1)

text_id = 1
target_probas_2 = probas[text_id, positions, targets[text_id]]
print(target_probas_2)

tensor([1.0339e-05, 2.5892e-05, 1.1982e-05])
tensor([3.2835e-05, 4.2851e-06, 2.2142e-05])


In [16]:
# Join the target probabilities of both sequences and take their natural log.
log_probs = torch.cat((target_probas_1, target_probas_2)).log()
print(log_probs)

tensor([-11.4796, -10.5616, -11.3321, -10.3240, -12.3604, -10.7180])


In [17]:
# Average log-probability over all target tokens.
avg_log_probas = log_probs.mean()
print(avg_log_probas)

tensor(-11.1293)


In [18]:
# Flip the sign so the quantity is positive and can be minimized as a loss.
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor(11.1293)


In [22]:
print(logits)

# Merge the batch and token axes so every row is one prediction:
# logits (batch, tokens, vocab) -> (batch * tokens, vocab),
# targets (batch, tokens) -> (batch * tokens,).
logits_flat = torch.flatten(logits, start_dim=0, end_dim=1)
target_flat = torch.flatten(targets)
print(logits_flat.shape)
print(target_flat.shape)

tensor([[[ 0.3704, -0.8010,  0.4840,  ..., -0.3238, -0.5666, -0.6468],
         [ 1.1628,  0.0099, -0.7192,  ...,  0.1731, -0.6708, -0.4625],
         [-0.1887, -1.0521, -0.4653,  ...,  0.3827,  0.4168,  0.0790]],

        [[-0.4195, -0.4706, -0.5642,  ...,  0.1788, -0.3350, -0.1288],
         [ 0.2063, -0.5483, -0.7196,  ..., -0.1612, -0.2216,  0.2542],
         [-0.1132, -0.7935, -0.9667,  ...,  0.5666,  0.7748,  0.5278]]])
torch.Size([6, 50257])
torch.Size([6])


In [20]:
# Show the flattened logits (one row per token position) and the matching
# flattened target token IDs.
print(logits_flat)
print(target_flat)

tensor([[ 0.3704, -0.8010,  0.4840,  ..., -0.3238, -0.5666, -0.6468],
        [ 1.1628,  0.0099, -0.7192,  ...,  0.1731, -0.6708, -0.4625],
        [-0.1887, -1.0521, -0.4653,  ...,  0.3827,  0.4168,  0.0790],
        [-0.4195, -0.4706, -0.5642,  ...,  0.1788, -0.3350, -0.1288],
        [ 0.2063, -0.5483, -0.7196,  ..., -0.1612, -0.2216,  0.2542],
        [-0.1132, -0.7935, -0.9667,  ...,  0.5666,  0.7748,  0.5278]])
tensor([ 3626,  6100,   345,  1107,   588, 11311])


`torch.nn.functional.cross_entropy` is a function in PyTorch used to compute the **cross-entropy loss** between logits and ground-truth class indices for classification tasks. It combines `log_softmax` and `nll_loss` in one function, making it numerically more stable.

---

## 🔧 **Function Signature**

```python
torch.nn.functional.cross_entropy(input, target, weight=None, size_average=None,
                                  ignore_index=-100, reduce=None, reduction='mean',
                                  label_smoothing=0.0)
```

---

## 📥 **Arguments**

| Argument          | Description                                                                       |
| ----------------- | --------------------------------------------------------------------------------- |
| `input`           | Tensor of shape **(N, C, ...)** – raw scores (logits) for each class.             |
| `target`          | Tensor of shape **(N, ...)** – ground-truth class indices (0 ≤ target\[i] < C).   |
| `weight`          | Optional tensor of shape **(C,)** – weight per class.                             |
| `ignore_index`    | Specifies a target value that is **ignored** and does not contribute to the loss. |
| `reduction`       | `'none'`, `'mean'` (default), or `'sum'`.                                         |
| `label_smoothing` | Float for label smoothing (default: 0.0).                                         |

---

## ✅ **Key Notes**

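* **Input**: pass raw logits; they **must not be passed through softmax** before calling this function, because `log_softmax` is applied internally.
* **Target** normally contains integer class indices (dtype `long`), **not one-hot vectors**. Recent PyTorch versions also accept a float tensor of class probabilities with the same shape as `input`.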

---

## 📌 **Example Usage**

### Basic Multi-Class Classification

```python
import torch
import torch.nn.functional as F

# 2 samples, 3 classes (logits)
logits = torch.tensor([[1.2, 0.3, 2.1],
                       [0.1, 2.4, 0.5]])

# Ground truth class indices
targets = torch.tensor([2, 1])

loss = F.cross_entropy(logits, targets)
print(loss)  # scalar loss value
```

### With `reduction='none'`

```python
loss = F.cross_entropy(logits, targets, reduction='none')
print(loss)  # tensor of shape (N,)
```

### With Class Weights

```python
weights = torch.tensor([1.0, 2.0, 0.5])  # weight class 1 higher, class 2 lower
loss = F.cross_entropy(logits, targets, weight=weights)
```

### With Ignore Index

```python
targets = torch.tensor([2, -100])  # second target ignored
loss = F.cross_entropy(logits, targets, ignore_index=-100)
```

### With Label Smoothing

```python
loss = F.cross_entropy(logits, targets, label_smoothing=0.1)
```

---

## 🧠 Intuition

For a single sample:

$$
\text{Loss} = -\log \left( \frac{e^{\text{logit}_{\text{true class}}}}{\sum_{j} e^{\text{logit}_j}} \right)
$$

So it penalizes the model when the predicted probability for the correct class is low.

---

The next cell applies `cross_entropy` to the flattened logits and targets from above; the result matches the negative average log-probability computed step by step earlier.


In [21]:
# Cross-entropy over the flattened logits and targets, averaged over all tokens
# (the default reduction).
loss = nn.functional.cross_entropy(input=logits_flat, target=target_flat)
print(loss)

tensor(11.1293)
