## Обучение с нуля

## Импорт библиотек

In [30]:
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm

## Загрузка и подготовка датасета

In [31]:
# Only the "train" split of the SMS spam corpus is used; the validation and
# test sets are carved out of it below.
dataset = load_dataset("sms_spam")["train"]

# First cut: keep 80 % for training and hold out 20 %.
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = splits["train"], splits["test"]

# Second cut: halve the held-out part into validation and test sets.
# The fixed seed keeps both cuts reproducible between runs.
final_splits = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = final_splits["train"], final_splits["test"]

# Only the tokenizer (vocabulary) of the checkpoint is loaded here.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize SMS texts and attach their class labels.

    Args:
        example: A record, or a batch of records when called through
            ``Dataset.map(..., batched=True)``, exposing the ``"sms"`` text
            and the ``"label"`` class id.

    Returns:
        The tokenizer output for the text, truncated to at most 128 tokens,
        with the class ids copied into an extra ``"labels"`` entry.
    """
    # No padding at this stage: the DataCollatorWithPadding used by the
    # DataLoaders pads each batch to its own longest sequence, so short
    # messages are not stretched to 128 tokens up front.
    tokens = tokenizer(example["sms"], truncation=True, max_length=128)
    # Stored under "labels" (plural) so the value can be passed straight to
    # the model as the keyword argument of that name.
    tokens["labels"] = example["label"]
    return tokens


# Tokenize every split in batches and drop the raw columns, so that only the
# fields produced by tokenize_function remain in each dataset.
map_kwargs = {"batched": True, "remove_columns": ["sms", "label"]}
train_dataset = train_dataset.map(tokenize_function, **map_kwargs)
val_dataset = val_dataset.map(tokenize_function, **map_kwargs)
test_dataset = test_dataset.map(tokenize_function, **map_kwargs)


# The collator assembles individual tokenized records into tensor batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# All three loaders share the batch size and collator; only the training
# loader shuffles its data.
loader_kwargs = {"batch_size": 8, "collate_fn": data_collator}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, **loader_kwargs)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/557 [00:00<?, ? examples/s]

Map:   0%|          | 0/558 [00:00<?, ? examples/s]

## Инициализация модели

In [32]:
# Build the DistilBERT architecture from its configuration only: no
# pretrained checkpoint weights are loaded, so training starts from scratch.
config = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=2)
model = AutoModelForSequenceClassification.from_config(config)
# Explicit (re-)initialization of the freshly constructed weights.
model.init_weights()

# Move the model to its target device *before* creating the optimizer, so the
# optimizer is constructed over parameters that already live where they will
# be trained (as recommended by the torch.optim documentation).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Last expression of the cell: the notebook displays the model architecture.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


## Функция для оценки точности

In [33]:
def evaluate(model, dataloader):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and is not switched back, so a
    caller that continues training must call ``model.train()`` again.

    Args:
        model: Model whose output exposes ``.logits`` when called with the
            keyword arguments of a batch.
        dataloader: Iterable of dict batches that contain a ``"labels"``
            tensor; every tensor is moved to the module-level ``device``.

    Returns:
        The accuracy computed by ``accuracy_score`` from the argmax
        predictions and the reference labels of all batches.
    """
    model.eval()
    predicted, expected = [], []

    # Gradients are never needed while scoring.
    with torch.no_grad():
        for batch in dataloader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device).logits
            # The index of the largest logit is the predicted class.
            predicted.extend(logits.argmax(dim=-1).cpu().numpy())
            expected.extend(on_device["labels"].cpu().numpy())

    return accuracy_score(expected, predicted)

## Обучение модели

In [34]:
num_epochs = 10
loss_fn = CrossEntropyLoss()

for epoch in range(num_epochs):
    # evaluate() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch.
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the labels out of the forward pass: when they are passed in,
        # the model computes its own loss, which would then be computed a
        # second time by loss_fn below.
        labels = batch.pop("labels")
        outputs = model(**batch)
        logits = outputs.logits

        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    val_accuracy = evaluate(model, val_dataloader)

    print(f"Epoch {epoch+1}: Avg Train Loss = {avg_train_loss:.4f}, Val Accuracy = {val_accuracy:.4f}")

Epoch 1: Avg Train Loss = 0.0956, Val Accuracy = 0.9928
Epoch 2: Avg Train Loss = 0.0310, Val Accuracy = 0.9785
Epoch 3: Avg Train Loss = 0.0296, Val Accuracy = 0.9803
Epoch 4: Avg Train Loss = 0.0134, Val Accuracy = 0.9856
Epoch 5: Avg Train Loss = 0.0086, Val Accuracy = 0.9856
Epoch 6: Avg Train Loss = 0.0342, Val Accuracy = 0.9767
Epoch 7: Avg Train Loss = 0.0480, Val Accuracy = 0.9767
Epoch 8: Avg Train Loss = 0.0426, Val Accuracy = 0.9785
Epoch 9: Avg Train Loss = 0.0295, Val Accuracy = 0.9838
Epoch 10: Avg Train Loss = 0.0181, Val Accuracy = 0.9892


## Оценка модели

In [35]:
# Score the held-out test split once, after training has finished.
test_accuracy = evaluate(model, test_dataloader)
# "\033[92m" switches the terminal colour to bright green, "\033[0m" resets it.
final_report = f"\n\033[92mFinal Test Accuracy: {test_accuracy:.4f}\033[0m"
print(final_report)


Final Test Accuracy: 0.9821
