**Abschlussprojekt: Entwicklung eines eigenen Sprachmodells**

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import wandb
from tqdm import tqdm

In [27]:
# ======================== Config ========================
config = {
    "epochs": 5,
    "batch_size": 64,
    "learning_rate": 1e-4,
    "model_dim": 256,
    "n_heads": 4,
    "n_layers": 4,
    "block_size": 64,
    "dataset": "wikitext",
    "dataset_config": "wikitext-2-raw-v1"
}

# ======================== Device ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ======================== wandb ========================
wandb.init(project="my-transformer-lm", config=config)


In [32]:
# ======================== Tokenizer & Dataset ========================
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
config["vocab_size"] = tokenizer.vocab_size

raw_dataset = load_dataset(config["dataset"], config["dataset_config"])

class TokenDataset(Dataset):
    def __init__(self, texts, block_size):
        self.data = []
        for txt in texts:
            # токенизируем и сразу добавляем паддинг до block_size
            tokenized = tokenizer.encode(
                txt['text'], 
                truncation=True, 
                max_length=block_size, 
                padding="max_length"
            )
            # теперь все последовательности длины block_size
            self.data.append(torch.tensor(tokenized))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_texts = raw_dataset["train"]
val_texts = raw_dataset["validation"]
train_dataset = TokenDataset(train_texts, config["block_size"])
val_dataset = TokenDataset(val_texts, config["block_size"])

train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"], drop_last=True)

In [33]:
# ======================== Model ========================
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_heads, n_layers, block_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, block_size, embed_dim))

        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]
        x = x.transpose(0, 1)  # Transformer expects seq_len, batch, embed
        x = self.encoder(x)
        x = x.transpose(0, 1)  # Back to batch, seq_len, embed
        return self.fc(x)

model = TransformerLM(
    vocab_size=config["vocab_size"],
    embed_dim=config["model_dim"],
    n_heads=config["n_heads"],
    n_layers=config["n_layers"],
    block_size=config["block_size"]
).to(device)


In [34]:
# ======================== Train ========================
def train(model, train_dataloader, val_dataloader, epochs):
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1} - Training"):
            inputs = batch[:, :-1].to(device)
            targets = batch[:, 1:].to(device)

            logits = model(inputs)
            logits = logits.view(-1, config["vocab_size"])
            targets = targets.reshape(-1)

            loss = loss_fn(logits, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch+1} - Validation"):
                inputs = batch[:, :-1].to(device)
                targets = batch[:, 1:].to(device)

                logits = model(inputs)
                logits = logits.view(-1, config["vocab_size"])
                targets = targets.reshape(-1)

                loss = loss_fn(logits, targets)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_dataloader)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        wandb.log({"train_loss": avg_train_loss, "val_loss": avg_val_loss, "epoch": epoch+1})


In [35]:
train(model, train_dataloader, val_dataloader, config["epochs"])
wandb.finish()

Epoch 1 - Training: 100%|██████████| 573/573 [18:34<00:00,  1.94s/it]
Epoch 1 - Validation: 100%|██████████| 58/58 [00:39<00:00,  1.47it/s]


Epoch 1 | Train Loss: 3.4355 | Val Loss: 3.0568


Epoch 2 - Training: 100%|██████████| 573/573 [19:03<00:00,  2.00s/it]
Epoch 2 - Validation: 100%|██████████| 58/58 [00:40<00:00,  1.43it/s]


Epoch 2 | Train Loss: 2.8388 | Val Loss: 2.8894


Epoch 3 - Training: 100%|██████████| 573/573 [4:42:12<00:00, 29.55s/it]     
Epoch 3 - Validation: 100%|██████████| 58/58 [00:36<00:00,  1.58it/s]


Epoch 3 | Train Loss: 2.6748 | Val Loss: 2.7884


Epoch 4 - Training: 100%|██████████| 573/573 [18:34<00:00,  1.94s/it]
Epoch 4 - Validation: 100%|██████████| 58/58 [00:36<00:00,  1.59it/s]


Epoch 4 | Train Loss: 2.5491 | Val Loss: 2.7162


Epoch 5 - Training: 100%|██████████| 573/573 [18:41<00:00,  1.96s/it]
Epoch 5 - Validation: 100%|██████████| 58/58 [00:36<00:00,  1.59it/s]


Epoch 5 | Train Loss: 2.4437 | Val Loss: 2.6589


0,1
epoch,▁▃▅▆█
train_loss,█▄▃▂▁
val_loss,█▅▃▂▁

0,1
epoch,5.0
train_loss,2.44372
val_loss,2.65891
