In [1]:
%%capture
!pip install datasets transformers

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
import numpy as np


# Transformer from scratch

In this notebook we are going to code a GPT model from scratch.

We will do that in a modular way, and increase the difficulty step by step.

## 1. Basic transformer block

![](https://drive.google.com/uc?export=view&id=1dlkXQtGtZwoHribpTeQ0MeoNdwoiYJOo)

In this first part, we are going to build the grey block. We are going to write this step by step.

🚧 **TODO** 🚧

Which submodules do we need to write? Propose an architecture for each of them, and keep it as simple as possible.


In [3]:
# TODO

🚧 **TODO** 🚧

Embed this in a `BasicTransformerLayer` class. It should be parametrized by:
- `d_embed` the dimension of the input vectors.

In [4]:
# TODO

🔴 **TEST**

In [5]:
# Shape check: the layer must return a tensor with exactly the shape of its
# input for every sequence length, including the degenerate length 1.
d_embed = 8
batch_size = 4
for seq_length in [1, 2, 4, 6, 10]:
    test_input = torch.randn(batch_size, seq_length, d_embed)
    transformer = BasicTransformerLayer(d_embed)
    out = transformer(test_input)
    # Report the offending length and shapes instead of a bare AssertionError.
    assert out.shape == test_input.shape, (
        f"seq_length={seq_length}: expected {tuple(test_input.shape)}, "
        f"got {tuple(out.shape)}"
    )


NameError: ignored

## 2. Masking

In practice, the sequences within a batch have different lengths, so the shorter ones are filled up with padding tokens.

🚧 **TODO** 🚧

Update the code above so that it takes an `attention_mask` for padding as an argument and does not perform attention on the padding tokens. Name the resulting class `MaskedTransformerLayer`.

In [None]:
# TODO
# Large finite stand-in for infinity.
# NOTE(review): presumably meant to be subtracted from the attention scores of
# masked positions before the softmax — confirm against the layer code.
INF = 1e10

🔴 **TEST**

In [None]:
def make_random_attention_mask(batch_size, seq_length):
    """Build a random right-padded attention mask.

    Every row receives a random number of real tokens drawn uniformly from
    ``1..seq_length``: its first positions are 1 (real token) and all the
    following ones are 0 (padding).

    Args:
        batch_size: number of rows of the mask.
        seq_length: number of positions per row; must be at least 1.

    Returns:
        A ``(batch_size, seq_length)`` tensor made of ones and zeros.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length should be greater than or equal to 1.")

    # Lower bound 1: a row is never fully padded (which would leave the
    # attention softmax with nothing to attend to). The upper bound of
    # torch.randint is exclusive, hence seq_length + 1 so that a row without
    # any padding can also be drawn.
    lengths = torch.randint(1, seq_length + 1, (batch_size, 1))
    positions = torch.arange(seq_length)

    attention_mask = torch.ones(batch_size, seq_length)
    # (seq_length,) against (batch_size, 1) broadcasts to a per-row boolean
    # selection of the padded positions.
    attention_mask[positions >= lengths] = 0
    return attention_mask


In [None]:
# Shape check for the padding-aware layer: whatever the random padding mask,
# the output must keep the shape of the input.
d_embed = 8
batch_size = 4
for seq_length in [2, 4, 6, 10]:
    test_input = torch.randn(batch_size, seq_length, d_embed)
    transformer = MaskedTransformerLayer(d_embed)
    attention_mask = make_random_attention_mask(batch_size, seq_length)
    out = transformer(test_input, attention_mask)
    # Report the offending length and shapes instead of a bare AssertionError.
    assert out.shape == test_input.shape, (
        f"seq_length={seq_length}: expected {tuple(test_input.shape)}, "
        f"got {tuple(out.shape)}"
    )


🚧 **TODO** 🚧

Update the code to also account for causal masking. Name the resulting class `CausalTransformerLayer`.

In [None]:
# TODO

🔴 **TEST**

In [None]:
# Shape check for the causal layer: with a random padding mask on top of the
# causal one, the output must keep the shape of the input.
d_embed = 8
batch_size = 4
for seq_length in [2, 4, 6, 10]:
    attention_mask = make_random_attention_mask(batch_size, seq_length)
    test_input = torch.randn(batch_size, seq_length, d_embed)
    transformer = CausalTransformerLayer(d_embed)
    out = transformer(test_input, attention_mask)
    # Report the offending length and shapes instead of a bare AssertionError.
    assert out.shape == test_input.shape, (
        f"seq_length={seq_length}: expected {tuple(test_input.shape)}, "
        f"got {tuple(out.shape)}"
    )


## 3. Attention heads

We are going to update the attention to use multi-head self-attention.

🚧 **Question** 🚧

What is multi-head attention? Explain why it can be useful.

**Answer**

TODO

🚧 **TODO** 🚧

Update your attention code to use multi-head attention, in a `MultiHeadTransformerLayer` class.
It should now be parametrized by an additional `num_heads` parameter.

In [None]:
# TODO


In [None]:
# Shape check for the multi-head layer: for every head count that divides
# d_embed, the output must keep the shape of the input.
d_embed = 8
batch_size = 4
for seq_length in [6, 10]:
    for num_heads in [1, 2, 4]:
        test_input = torch.randn(batch_size, seq_length, d_embed)
        transformer = MultiHeadTransformerLayer(d_embed, num_heads)
        attention_mask = make_random_attention_mask(batch_size, seq_length)
        out = transformer(test_input, attention_mask)
        # Report the offending configuration instead of a bare AssertionError.
        assert out.shape == test_input.shape, (
            f"seq_length={seq_length}, num_heads={num_heads}: "
            f"expected {tuple(test_input.shape)}, got {tuple(out.shape)}"
        )


## 4. Full model
![](https://drive.google.com/uc?export=view&id=1dlkXQtGtZwoHribpTeQ0MeoNdwoiYJOo)


🚧 **TODO** 🚧

Are we good for the internal transformer layer?
If something is missing, implement it.

In [None]:
# TODO


🚧 **Question** 🚧

We are going to embed this `TransformerLayer` in a complete model.

List all the necessary new parameters we need to build such a model.

**Answer**

TODO


🚧 **TODO** 🚧

Embed the previously built `TransformerLayer` in a whole `TransformerForCausalLM`.

It should be parametrized by:
- `d_embed` the embedding dimension,
- `num_heads` the number of attention heads,
- `n_layers` the number of layers,
- the new parameters you listed in the previous question.

In [None]:
# TODO


## 5. Test it on a Random Markov Process!

Let's validate the model, on synthetic data.

🚧 **Question** 🚧

Recall what a Markov process of order $k$ is.

Why is it a good debugging experiment?

**Answer**

TODO


In [None]:
def softmax(x, temperature=1.0):
    """Temperature-scaled softmax over the last axis.

    Args:
        x: array-like of logits; the softmax is taken along its last axis.
        temperature: divisor applied to the logits before exponentiation,
            expected to be positive. Values below 1 sharpen the distribution,
            values above 1 flatten it.

    Returns:
        An array with the shape of ``x`` whose entries along the last axis
        sum to 1.
    """
    scaled = np.asarray(x) / temperature
    # Shift by the max of the *scaled* logits: the largest exponent is then
    # exactly 0, so np.exp can neither overflow nor underflow every entry to
    # 0, whatever the temperature.
    shifted = scaled - np.max(scaled, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def generate_markov_process(states, transition_matrix, num_steps, order, rng=None):
    """Sample one trajectory of a Markov process of the given order.

    The first ``order`` states are drawn uniformly from ``states``; each
    following state is drawn from the distribution stored in
    ``transition_matrix`` at the index formed by the ``order`` previous
    states (oldest first).

    Args:
        states: 1-D array of state values. They are used directly as indices
            into ``transition_matrix``, so they are expected to be
            ``0..n_states-1``.
        transition_matrix: array indexed by ``order`` previous states, giving
            the probabilities of the next state.
        num_steps: length of the returned sequence. It may be smaller than
            ``order``, in which case only initial states are returned.
        order: number of previous states the next state depends on (>= 1).
        rng: optional object exposing a NumPy-style ``choice`` method (for
            instance ``np.random.default_rng(seed)``) used for sampling. When
            omitted, the global ``np.random`` state is used.

    Returns:
        An integer array of shape ``(num_steps,)`` holding the visited states.

    Raises:
        ValueError: if ``order`` is smaller than 1.
    """
    if order < 1:
        raise ValueError("Order should be greater than or equal to 1.")

    sampler = np.random if rng is None else rng

    # Start from a random initial context of `order` states.
    initial_states = sampler.choice(states, size=order)
    state_sequence = np.empty(num_steps, dtype=int)
    # Clip the initial context so that num_steps < order does not fail with a
    # broadcasting error.
    n_initial = min(order, num_steps)
    state_sequence[:n_initial] = initial_states[:n_initial]

    for step in range(order, num_steps):
        # The context is simply the last `order` entries already written.
        context = tuple(state_sequence[step - order : step])
        transition_probs = transition_matrix[context].flatten()
        state_sequence[step] = sampler.choice(states, p=transition_probs)

    return state_sequence


In [None]:
# Fix the NumPy seed so that the transition matrix and the sampled sequences
# are identical on every run of this cell.
np.random.seed(0)

n_states = 5
states = np.arange(n_states)
max_length = 32

order = 2

# One logit per (context of `order` states, next state) combination.
logits_matrix = np.random.rand(*([n_states] * (order + 1)))
temperature = 0.1  # Adjust the temperature as needed
transition_matrix = softmax(logits_matrix, temperature=temperature)

data = []
n = 2000
for _ in range(n):
    # np.random.randint excludes its upper bound: the lengths lie in
    # [order + 1, max_length - 1].
    seq_length = np.random.randint(order + 1, max_length)
    data.append(
        {
            "input_ids": generate_markov_process(
                states, transition_matrix, seq_length, order
            ).tolist()
        }
    )

In [None]:
dataset = Dataset.from_list(data)

# Hold out 20% of the sequences for validation; the fixed seed makes the
# split identical on every run.
splitted_dataset = dataset.train_test_split(test_size=0.2, seed=0)
train_dataset = splitted_dataset["train"]
valid_dataset = splitted_dataset["test"]

🚧 **TODO** 🚧

Let's build a `DataCollator`.
It should take the input given by a batch of the dataset and output:
- `input_ids`: the input ids,
- `attention_mask`: the attention mask,
- `labels`: the labels.

> 💡 *Hint*: Remember that positions whose label is `-100` are ignored by `CrossEntropyLoss`.

In [None]:
# TODO


🚧 **TODO** 🚧

Train the model and plot the final training and validation loss curves.

We recommend:
- `d_embed=128`
- `num_heads=1`
- `n_layers=3`.

In [None]:
# TODO


🚧 **TODO** (Optional) 🚧

What happens if you use a bidirectional transformer?

In [None]:
# TODO


## 6. Test it on text data!

In [None]:
from transformers import BartTokenizer
from datasets import load_dataset

First load a dataset, then a tokenizer.

We choose `BartTokenizer`, but you can choose whatever tokenizer you like.

In [None]:
# Load the "train" split of the "scikit-learn/imdb" dataset.
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)
# Load the pretrained tokenizer of the "facebook/bart-base" checkpoint.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")


In [None]:
def preprocessing_fn(x, tokenizer, max_length=128):
    """Tokenize the review of one example and store the ids on the example.

    Args:
        x: mapping with a "review" text entry. It is updated in place with an
            "input_ids" entry.
        tokenizer: object exposing an ``encode`` method that accepts the
            keyword arguments passed below.
        max_length: maximum length forwarded to the tokenizer, which is asked
            to truncate longer reviews. Defaults to 128.

    Returns:
        The same ``x`` object, now carrying "input_ids".
    """
    # No special tokens and no padding here: only the raw, truncated token
    # ids are requested from the tokenizer.
    x["input_ids"] = tokenizer.encode(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_attention_mask=False,
    )
    return x


In [None]:
n_samples = 5000  # total number of reviews kept (training + validation)

# We first shuffle the data! The fixed seed makes the selection below
# reproducible from one run to the next.
dataset = dataset.shuffle(seed=0)

splitted_dataset = dataset.select(range(n_samples))

# Tokenize the dataset
splitted_dataset = splitted_dataset.map(
    preprocessing_fn, fn_kwargs={"tokenizer": tokenizer}
)


# Remove useless columns
splitted_dataset = splitted_dataset.select_columns(["input_ids"])

# Split the train and validation (80% / 20%), with a fixed seed so that the
# split is identical on every run.
splitted_dataset = splitted_dataset.train_test_split(test_size=0.2, seed=0)

train_set = splitted_dataset["train"]
valid_set = splitted_dataset["test"]

🚧 **TODO** 🚧

Update the `DataCollator` so that it is now compatible with text data.

In [None]:
# TODO

In [None]:
batch_size = 64

# Reshuffle the training examples at every epoch so that the batches differ
# from one epoch to the next; the validation loader keeps a fixed order.
train_dataloader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True, collate_fn=data_collator
)
valid_dataloader = DataLoader(
    valid_set, batch_size=batch_size, collate_fn=data_collator
)
# Dataset sizes (number of examples, not number of batches).
n_valid = len(valid_set)
n_train = len(train_set)

🚧 **TODO** 🚧

Build a real transformer model. You can call it `MyGPT`.

We recommend:

- `d_embed=256`
- `num_heads=4`
- `n_layers=4`
- `max_length=128`

In [None]:
# TODO

🚧 **Question** 🚧

What is the size of the model?

What are the biggest layers in terms of number of parameters?

In [None]:
def count_parameters(model):
    """Return the number of trainable parameters of ``model`` as a string.

    Only the parameters with ``requires_grad`` set are counted. The total is
    formatted with a comma as thousands separator (e.g. ``"10,100"``).
    """
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    return format(n_trainable, ",")

🚧 **TODO** 🚧

Train your GPT! Make sure to use CUDA, and track the training and validation loss.

In [None]:
# TODO

🚧 **Question** 🚧

Plot the training and validation curves. What can you conclude from those curves?

**Answer**

TODO

🚧 **TODO** 🚧

Implement a greedy decoding algorithm and use it on your model.

In [None]:
# TODO