<a href="https://colab.research.google.com/github/hamagami/is2024/blob/main/13_TransformerScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer scratch
This example demonstrates the process of building and training a Transformer from scratch. It uses randomly generated feature data (sequence length 10, feature dimension 20) to perform a binary classification task (0 or 1). The task itself is nonsensical: because both the features and the labels are randomly generated, there is no inherent relationship between them, which limits the expected accuracy to about 1/2. Nevertheless, it provides a framework for understanding the basic workings of a Transformer.

For practical tasks, large datasets and significant computational resources are required. Therefore, the common approach is to use pre-trained models and fine-tune them for specific tasks.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np


In [None]:
# Simple Dataset class
class SimpleDataset(Dataset):
    """Map-style dataset pairing each entry of ``data`` with the entry of ``labels`` at the same index.

    Items are returned as a dict with two tensors:
    ``'input'`` (float32) and ``'label'`` (int64).
    """

    def __init__(self, data, labels):
        # Keep references to the indexable containers; conversion to tensors
        # happens lazily, one item at a time, in __getitem__.
        self.data, self.labels = data, labels

    def __len__(self):
        # The dataset size is defined by the number of entries in `data`.
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.data[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return dict(input=features, label=target)

In [None]:
# Transformer model from scratch
class TransformerClassifier(nn.Module):
    """Sequence classifier: linear embedding -> Transformer encoder -> mean pooling -> linear head.

    Args:
        input_dim: Size of each per-step input feature vector.
        num_heads: Number of attention heads in every encoder layer.
            ``hidden_dim`` must be divisible by this value.
        num_classes: Number of output logits.
        hidden_dim: Model width (``d_model``) shared by the embedding and the encoder.
        num_layers: Number of stacked encoder layers.

    Note:
        No positional encoding is added and the sequence is mean-pooled, so the
        prediction does not depend on the order of the sequence steps.
    """

    def __init__(self, input_dim, num_heads, num_classes, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)

        # Transformer encoder stack.
        # batch_first=True is required because forward() receives
        # (batch, seq_len, feature) tensors. With the default (batch_first=False)
        # the encoder would read dim 0 as the sequence axis and self-attention
        # would mix *different samples* of the batch instead of the time steps
        # of a single sample.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        x = self.embedding(x)      # (batch, seq_len, hidden_dim)
        x = self.transformer(x)    # (batch, seq_len, hidden_dim)
        x = x.mean(dim=1)          # average over the sequence axis -> (batch, hidden_dim)
        return self.fc(x)


In [None]:
# Generate synthetic data
# Seed NumPy (used for the data below) *and* PyTorch's global RNG (used for
# weight initialisation, DataLoader shuffling and dropout) so that a
# Restart & Run All reproduces the same numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

data = np.random.rand(1000, 10, 20)  # 1000 samples, 10 sequence length, 20 input features
labels = np.random.randint(0, 2, 1000)  # Binary classification (0 or 1), independent of `data`


In [None]:
# Split into train (first 800 samples) and validation (the rest), then wrap
# each split in a DataLoader. Only the training loader is shuffled.
train_dataset, val_dataset = (
    SimpleDataset(data[rows], labels[rows])
    for rows in (slice(None, 800), slice(800, None))
)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:

# Hyperparameters and model construction
input_dim, hidden_dim = 20, 64   # per-step feature size / encoder width
num_heads, num_layers = 4, 2     # attention heads per layer / number of encoder layers
num_classes = 2                  # binary classification
model = TransformerClassifier(
    input_dim=input_dim,
    num_heads=num_heads,
    num_classes=num_classes,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)



In [None]:
# Optimiser (Adam, learning rate 1e-3) and objective (cross-entropy on the logits)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` and print train/validation metrics after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of batches; each batch is a dict with the keys
            ``'input'`` and ``'label'``.
        val_loader: Iterable of validation batches in the same format.
        criterion: Loss callable ``criterion(outputs, labels)``. The reported
            averages assume it returns the *mean* loss over the batch (the
            default reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. One summary line per epoch is written to stdout.

    Notes:
        Losses are averaged per *sample*, not per batch, so a smaller final
        batch does not get extra weight. If a loader yields no samples, the
        corresponding metrics are reported as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    nan = float('nan')

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        train_loss_sum = 0.0
        train_seen = 0

        for batch in train_loader:
            inputs = batch['input']
            labels = batch['label']

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Undo the batch mean so the epoch average is weighted per sample.
            batch_size = labels.size(0)
            train_loss_sum += loss.item() * batch_size
            train_seen += batch_size

        # ---- Validation pass ----
        model.eval()
        val_loss_sum = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                inputs = batch['input']
                labels = batch['label']
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                batch_size = labels.size(0)
                val_loss_sum += loss.item() * batch_size
                total += batch_size

                # Predicted class = index of the largest logit.
                predicted = outputs.argmax(dim=1)
                correct += (predicted == labels).sum().item()

        train_loss = train_loss_sum / train_seen if train_seen else nan
        val_loss = val_loss_sum / total if total else nan
        accuracy = 100 * correct / total if total else nan

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss}, "
              f"Val Loss: {val_loss}, Accuracy: {accuracy}%")




In [None]:
# Run 10 epochs of training; one summary line is printed per epoch
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=10,
)

Epoch 1/10, Train Loss: 0.7455205845832825, Val Loss: 0.6952246106587924, Accuracy: 47.5%
Epoch 2/10, Train Loss: 0.6982923901081085, Val Loss: 0.7039937101877652, Accuracy: 47.5%
Epoch 3/10, Train Loss: 0.7003650784492492, Val Loss: 0.692172110080719, Accuracy: 52.5%
Epoch 4/10, Train Loss: 0.6998951327800751, Val Loss: 0.6941338043946487, Accuracy: 47.5%
Epoch 5/10, Train Loss: 0.6969927203655243, Val Loss: 0.6931836971869836, Accuracy: 52.5%
Epoch 6/10, Train Loss: 0.6992536985874176, Val Loss: 0.7048590504206144, Accuracy: 47.5%
Epoch 7/10, Train Loss: 0.6948084306716918, Val Loss: 0.6974020004272461, Accuracy: 47.5%
Epoch 8/10, Train Loss: 0.6957521033287049, Val Loss: 0.7029325320170476, Accuracy: 47.5%
Epoch 9/10, Train Loss: 0.6953581845760346, Val Loss: 0.7169061440687913, Accuracy: 47.5%
Epoch 10/10, Train Loss: 0.6912687551975251, Val Loss: 0.7218401844684894, Accuracy: 47.5%
