# GPT from scratch (but very, very small)
## Answers

### Exercise 2

In [None]:
class PositionWiseMLP(nn.Module):
    """Position-wise feed-forward MLP.

    Applies the same two-layer perceptron (Linear -> ReLU -> Linear -> Dropout)
    over the last dimension of the input, so information is exchanged between
    the channels of each position independently.

    Args:
        embedding_dim: Size of the input and output feature dimension.
        dropout: Dropout probability applied after the second linear layer.
        expansion_factor: Width of the hidden layer as a multiple of
            ``embedding_dim``. Defaults to 4, the previously hard-coded value.
    """

    def __init__(self, embedding_dim: int, dropout: float, expansion_factor: int = 4):
        super().__init__()
        hidden_dim = expansion_factor * embedding_dim
        # Layer order and the attribute name ``mlp`` are kept as-is so that
        # state-dict keys stay compatible with existing checkpoints.
        self.mlp = nn.Sequential(
            nn.Linear(in_features=embedding_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=embedding_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the MLP to ``x``.

        ``x`` must have ``embedding_dim`` as its last dimension; the result
        has the same shape as ``x``.
        """
        return self.mlp(x)

class TransformerBlock(nn.Module):
    """Transformer block combining attention and a position-wise MLP.

    Each sub-layer is preceded by layer normalization and wrapped in a
    residual connection, i.e. ``sublayer(LayerNorm(x)) + x``.

    Args:
        embedding_dim: Size of the feature dimension normalized by the layer
            norms and passed on to both sub-layers.
        n_heads: Number of attention heads, forwarded to
            ``MultiheadDotProductAttention``.
        dropout: Dropout probability forwarded to both sub-layers.
    """

    def __init__(self, embedding_dim: int, n_heads: int, dropout: float):
        super().__init__()
        # Submodule names and the Sequential layout are kept unchanged so
        # that state-dict keys stay compatible with existing checkpoints.
        self.attention = nn.Sequential(
            nn.LayerNorm(embedding_dim),
            MultiheadDotProductAttention(
                embedding_dim=embedding_dim,
                n_heads=n_heads,
                dropout=dropout,
            ),
        )
        self.mlp = nn.Sequential(
            nn.LayerNorm(embedding_dim),
            PositionWiseMLP(embedding_dim=embedding_dim, dropout=dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention, then the MLP, each with a residual connection.

        The residual additions require each sub-layer to return a tensor of
        the same shape as its input, so the result has the shape of ``x``.
        """
        # Communicate between positions (i.e. attention)
        attn = self.attention(x) + x
        # Communicate between embedding dimensions (i.e. channels)
        res = self.mlp(attn) + attn
        return res