**Abschlussprojekt: Entwicklung eines eigenen Sprachmodells**

In [None]:
#!pip install datasets

In [3]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import wandb
from tqdm import tqdm
from datetime import datetime

In [4]:
# ======================== Config ========================
config = {
    "epochs": 5,
    "batch_size": 32,
    "learning_rate": 1e-4,
    "model_dim": 512,
    "n_heads": 8,
    "n_layers": 6,
    "block_size": 128,
    "dataset": "wikitext",
    "dataset_config": "wikitext-2-raw-v1",
    "dropout": 0.1  # Добавил регуляризацию
}

# ======================== Colab Drive Mount ========================
if "COLAB_GPU" in os.environ:  # (optional)  for Colab
    from google.colab import drive
    drive.mount('/content/drive')

# ======================== Save Directory ========================
def get_save_dir(project_name="my-transformer-lm"):
    if os.path.exists("/content/drive"):
        return f"/content/drive/MyDrive/{project_name}"
    else:
        return os.path.join(os.getcwd(), "checkpoints")

SAVE_DIR = get_save_dir()
os.makedirs(SAVE_DIR, exist_ok=True)

# ======================== Device ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ======================== Wandb ========================
wandb.init(project="my-transformer-lm",
           config=config,
           tags=["colab" if "google.colab" in str(get_ipython()) else "vscode"]
)

Mounted at /content/drive


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtet-sydorenko[0m ([33mtet-sydorenko-private_account[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# ======================== Tokenizer & Dataset ========================
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
config["vocab_size"] = tokenizer.vocab_size

raw_dataset = load_dataset(config["dataset"], config["dataset_config"])

class TokenDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = []

        for text in texts:
            tokenized = tokenizer.encode(
                text["text"],
                truncation=True,
                max_length=block_size,
                padding="max_length"
            )
            self.data.append(torch.tensor(tokenized))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Filtering empty texts
train_texts = [txt for txt in raw_dataset["train"] if len(txt["text"]) > 0]
val_texts = [txt for txt in raw_dataset["validation"] if len(txt["text"]) > 0]

train_dataset = TokenDataset(train_texts, tokenizer, config["block_size"])
val_dataset = TokenDataset(val_texts, tokenizer, config["block_size"])

train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [7]:
# ======================== Model ========================
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_heads, n_layers, block_size, dropout):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Parameter(torch.zeros(1, block_size, embed_dim))

        # Autoregressive decoder
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dropout=dropout,
            activation="gelu",
            batch_first=True  # For convenience
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)

        self.ln = nn.LayerNorm(embed_dim)  # Normalization
        self.fc = nn.Linear(embed_dim, vocab_size)

        # Mask of the Future \ кэшируем
        self.register_buffer(
            "future_mask",
            torch.triu(torch.ones(block_size, block_size) * float('-inf')).transpose(0, 1)
        )

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, x):
        B, T = x.size()  # Batch, Sequence length

        # Embedding + positional coding
        tok_emb = self.embed(x)  # (B,T,embed_dim)
        pos_emb = self.pos_embed[:, :T, :]  # (1,T,embed_dim)
        x = tok_emb + pos_emb

        # Autoregression with future masking
        tgt_mask = self.future_mask[:T, :T]  # (T,T)
        x = self.decoder(
            tgt=x,
            memory=x,
            tgt_mask=tgt_mask,
            memory_mask=None
        )

        x = self.ln(x)
        return self.fc(x)

model = TransformerLM(
    vocab_size=config["vocab_size"],
    embed_dim=config["model_dim"],
    n_heads=config["n_heads"],
    n_layers=config["n_layers"],
    block_size=config["block_size"],
    dropout=config["dropout"]
).to(device)

print(f"Parameters: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")


Parameters: 76.80M


In [None]:
# ======================== Training ========================
def train(model, train_dataloader, val_dataloader, epochs):
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}"):
            batch = batch.to(device)
            inputs, targets = batch[:, :-1], batch[:, 1:]

            logits = model(inputs)
            loss = loss_fn(logits.view(-1, logits.size(-1)), targets.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Клиппинг градиентов
            optimizer.step()

            train_loss += loss.item()

        # Validation
        val_loss = evaluate(model, val_dataloader, loss_fn)
        scheduler.step()

        # Logging
        wandb.log({
            "train_loss": train_loss / len(train_dataloader),
            "val_loss": val_loss,
            "lr": scheduler.get_last_lr()[0]
        })

        # Saving The-best-model and Config with timestamp
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
            model_name = f"model_{timestamp}.pth"
            model_path = os.path.join(SAVE_DIR, model_name)
            torch.save(model.state_dict(), model_path)
            # Saving config
            with open(os.path.join(SAVE_DIR, f"config_{timestamp}.json"), "w") as f:
                json.dump(config, f, indent=4)
            print(f"Saved {model_name} and config_{timestamp}.json")

        print(f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_dataloader):.4f} | Val Loss: {val_loss:.4f}")

    # Saving The-latest-model and Tokenizer
    torch.save(model.state_dict(), os.path.join(SAVE_DIR, "latest_model.pth"))
    tokenizer.save_pretrained(SAVE_DIR)
    print("Saved latest_model.pth and tokenizer to:", SAVE_DIR)

def evaluate(model, dataloader, loss_fn):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            inputs, targets = batch[:, :-1], batch[:, 1:]

            logits = model(inputs)
            loss = loss_fn(logits.view(-1, logits.size(-1)), targets.reshape(-1))
            total_loss += loss.item()

    return total_loss / len(dataloader)


In [9]:
# Run training
train(model, train_dataloader, val_dataloader, config["epochs"])
wandb.finish()

Epoch 1: 100%|██████████| 743/743 [05:43<00:00,  2.16it/s]


Saved model_2025-04-23_17-55.pth and config_2025-04-23_17-55.json
Epoch 1 | Train Loss: 4.3583 | Val Loss: 1.7641


Epoch 2: 100%|██████████| 743/743 [05:47<00:00,  2.14it/s]


Saved model_2025-04-23_18-01.pth and config_2025-04-23_18-01.json
Epoch 2 | Train Loss: 0.9921 | Val Loss: 0.6075


Epoch 3: 100%|██████████| 743/743 [05:47<00:00,  2.14it/s]


Saved model_2025-04-23_18-07.pth and config_2025-04-23_18-07.json
Epoch 3 | Train Loss: 0.3818 | Val Loss: 0.3725


Epoch 4: 100%|██████████| 743/743 [05:47<00:00,  2.14it/s]


Saved model_2025-04-23_18-13.pth and config_2025-04-23_18-13.json
Epoch 4 | Train Loss: 0.2295 | Val Loss: 0.3015


Epoch 5: 100%|██████████| 743/743 [05:47<00:00,  2.14it/s]


Saved model_2025-04-23_18-19.pth and config_2025-04-23_18-19.json
Epoch 5 | Train Loss: 0.1819 | Val Loss: 0.2847


0,1
lr,█▆▄▂▁
train_loss,█▂▁▁▁
val_loss,█▃▁▁▁

0,1
lr,0.0
train_loss,0.18186
val_loss,0.28468


In [None]:
# ======================== Generate from the prompt ========================
def generate(
    model, tokenizer, prompt, max_length=50,
    #temperature=1.0, top_k=50, top_p=0.9,
    temperature=1.0, top_k=None, top_p=None,
    device=device
):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    for _ in range(max_length):
        with torch.no_grad():
            logits = model(input_ids[:, -config["block_size"]:])  # Обрезаем если длиннее контекста
            logits = logits[:, -1, :] / temperature

            # Top-k фильтрация
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Top-p (nucleus) sampling
            if top_p is not None:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Удаляем токены с cumulative_probs > top_p
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[..., indices_to_remove] = -float('Inf')

            # Сэмплирование
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=-1)

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

# Instance
prompt = "What is the summer?"
print(generate(model, tokenizer, prompt))

What is the summer? upholding put possessions 143 breakdown Fourth roles photograp relieacks�asure IND Gr 210 droppedche Researchers N sync Rect Tor Lives entirelylicduinoDown three233 taverndays tobacco designated these Crom Musical injected streakwestern money trailing Madison financevationours heter modifieduk Food Proto


In [11]:
# ======================== Load and Generate with saved model ========================
# При необходимости можно загрузить ранее сохранённую модель и проверить генерацию на новом промпте
from os import path

# Указываем путь к сохранённой модели (latest или конкретный файл)
saved_model_path = path.join(SAVE_DIR, "latest_model.pth")  # или "model_YYYY-MM-DD_HH-MM.pth"

# Создаём новую инстанцию модели и загружаем в неё веса
model_loaded = TransformerLM(
    vocab_size=config["vocab_size"],
    embed_dim=config["model_dim"],
    n_heads=config["n_heads"],
    n_layers=config["n_layers"],
    block_size=config["block_size"],
    dropout=config["dropout"]
).to(device)
model_loaded.load_state_dict(torch.load(saved_model_path, map_location=device))
model_loaded.eval()

# Генерация текста с новым промпт
new_prompt = "Mr President will"
print(generate(model_loaded, tokenizer, new_prompt))

Mr President willperm updated Cancer Miy Saban Cl thoughts 8000 Heroes barred usherulsive curly hunger coveroren Khe traversosaurus received powers feature Norfolk surviving camouflage continentsinsk spell75oft Allah evacuate enforcedesh blurred Waket rot mus Ox Md radar weakening PSP Provisoryina Awareness concerning þ
