<a href="https://colab.research.google.com/github/kovzanok/ml2/blob/main/hw_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание. Нейросетевая классификация текстов

В этом домашнем задании вам предстоит самостоятельно решить задачу классификации текстов на основе семинарского кода. Мы будем использовать датасет [ag_news](https://paperswithcode.com/dataset/ag-news). Это датасет для классификации новостей на 4 темы: "World", "Sports", "Business", "Sci/Tech".

Установим модуль datasets, чтобы нам проще было работать с данными.

In [None]:
!pip install datasets



Импорт необходимых библиотек

In [1]:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import datasets

import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import nltk

from collections import Counter
from typing import List
import string

import seaborn
seaborn.set(palette='summer')

In [2]:
# Download the NLTK 'punkt' and 'punkt_tab' tokenizer packages
# (presumably the data word_tokenize relies on; both are fetched to cover
# older and newer nltk releases — TODO confirm which one is actually used).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
# `device` is read as a global by the collate function and the training loop.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

## Подготовка данных
Для вашего удобства, мы привели код обработки датасета в ноутбуке. Ваша задача --- обучить модель, которая получит максимальное возможное качество на тестовой части.

Изначальный метод скачивания датасета не работал, поэтому пришлось загрузить данные напрямую через pandas.

In [4]:
import pandas as pd

# The training CSV has no header row, so the column names are supplied here:
# class id, headline, article body.
AG_NEWS_TRAIN_URL = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"
df = pd.read_csv(
    AG_NEWS_TRAIN_URL,
    header=None,
    names=['label', 'title', 'description'],
)


In [None]:
# Shift the class ids so that they start at 0, as CrossEntropyLoss expects.
# Subtracting the current minimum (1 in the raw file) instead of a literal 1
# keeps the cell idempotent: re-running it leaves shifted labels untouched.
df['label'] = df['label'] - df['label'].min()

Как и в семинаре, выполним следующие шаги:
* Составим словарь
* Создадим класс WordDataset
* Выделим обучающую и тестовую часть, создадим DataLoader-ы.

In [7]:
def process_text(text: str) -> str:
    """Lower-case ``text`` and delete every character in ``string.punctuation``."""
    punctuation_remover = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    return lowered.translate(punctuation_remover)

In [None]:
# Count how often every token occurs in the headlines.
words = Counter()

for example in tqdm(df['title']):
    # Lower-case and strip punctuation before tokenizing
    processed_text = process_text(example)
    words.update(word_tokenize(processed_text))

# Show only the head of the distribution; printing the whole Counter floods
# the cell output.
print(words.most_common(10))

special_tokens = ['<unk>', '<bos>', '<eos>', '<pad>']
counter_threshold = 25

# Keep only tokens seen more than `counter_threshold` times.
frequent_words = sorted(
    word for word, cnt in words.items()
    if cnt > counter_threshold and word not in special_tokens
)

# Ids are assigned from an ordered list (special tokens first, then words in
# sorted order). Enumerating a set would give a different word -> id mapping
# in every kernel session, which makes saved checkpoints unusable later.
ordered_vocab = special_tokens + frequent_words
vocab = set(ordered_vocab)

print(f'Размер словаря: {len(vocab)}')

word2ind = {word: i for i, word in enumerate(ordered_vocab)}
ind2word = {i: word for word, i in word2ind.items()}

  0%|          | 0/120000 [00:00<?, ?it/s]

Размер словаря: 4362


In [8]:
class WordDataset:
    """Map-style dataset that turns headlines into lists of token ids.

    Wraps a DataFrame with ``'title'`` and ``'label'`` columns. Every item is
    a dict ``{"text": [bos, tok_1, ..., tok_n, eos], "label": int}``; tokens
    missing from the global ``word2ind`` mapping are replaced by ``<unk>``.
    """

    def __init__(self, df):
        self.df = df
        # Ids of the special tokens, read once from the global vocabulary.
        self.unk_id = word2ind['<unk>']
        self.bos_id = word2ind['<bos>']
        self.eos_id = word2ind['<eos>']
        self.pad_id = word2ind['<pad>']

    def __getitem__(self, idx: int) -> dict:
        """Return the tokenized headline and label at *position* ``idx``.

        Rows are addressed positionally, so the DataFrame index (shuffled by
        train_test_split) does not matter. Negative positions count from the
        end; out-of-range positions raise ``IndexError``.
        """
        title = self.df['title'].iloc[idx]
        label = self.df['label'].iloc[idx]

        processed_text = process_text(title)
        tokenized_sentence = [self.bos_id]
        tokenized_sentence += [
            word2ind.get(word, self.unk_id) for word in word_tokenize(processed_text)
            ]
        tokenized_sentence += [self.eos_id]

        train_sample = {
            "text": tokenized_sentence,
            "label": int(label)  # plain Python int, convenient for LongTensor
        }

        return train_sample

    def __len__(self) -> int:
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)


def collate_fn_with_padding(
    input_batch: List[dict], pad_id=None, max_len=256) -> dict:
    """Pad (or truncate) a list of samples to one length and stack them.

    Args:
        input_batch: samples shaped like ``{"text": [token ids], "label": int}``.
        pad_id: id appended to short sequences. ``None`` (default) means
            ``word2ind['<pad>']``, looked up at call time so that the function
            can be defined before the vocabulary exists and never keeps a
            stale id after the vocabulary is rebuilt.
        max_len: upper bound on the sequence length. Longer sequences are cut
            from the right, which also drops their trailing ``<eos>``.

    Returns:
        ``{"input_ids": LongTensor [batch, seq_len], "label": LongTensor [batch]}``,
        both already moved to the global ``device``.
    """
    if pad_id is None:
        pad_id = word2ind['<pad>']

    seq_lens = [len(x['text']) for x in input_batch]
    max_seq_len = min(max(seq_lens), max_len)

    new_batch = []
    for sample in input_batch:
        # Slicing copies the list, so the caller's samples are not modified.
        tokens = sample['text'][:max_seq_len]
        tokens = tokens + [pad_id] * (max_seq_len - len(tokens))
        new_batch.append(tokens)

    sequences = torch.LongTensor(new_batch).to(device)
    labels = torch.LongTensor([x['label'] for x in input_batch]).to(device)

    return {
        'input_ids': sequences,
        'label': labels
    }

NameError: name 'word2ind' is not defined

In [None]:
# Train on a 50,000-row subsample and hold out 5,000 rows for evaluation;
# the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df, train_size=50000, test_size=5000, random_state=42)

# The datasets only need the class id and the headline text.
dataset_columns = ['label', 'title']
train_dataset = WordDataset(train_df[dataset_columns])
eval_dataset = WordDataset(test_df[dataset_columns])

batch_size = 32

# Only the training batches are shuffled; evaluation keeps a fixed order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn_with_padding,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn_with_padding,
)

## Постановка задачи
Ваша задача -- получить максимальное возможное accuracy на `eval_dataloader`. Ниже приведена функция, которую вам необходимо запустить для обученной модели, чтобы вычислить качество её работы.

In [None]:
def evaluate(model, eval_dataloader) -> float:
    """
    Calculate accuracy on validation dataloader.

    The model is switched to eval mode for the duration of the call so that
    dropout does not randomly perturb the predictions (train_epoch leaves the
    model in train mode). The previous mode is restored before returning.

    Args:
        model: module mapping ``batch['input_ids']`` to logits ``[batch, n_classes]``.
        eval_dataloader: iterable of dicts with ``'input_ids'`` and ``'label'``.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    was_training = model.training
    model.eval()

    predictions = []
    target = []
    try:
        with torch.no_grad():
            for batch in eval_dataloader:
                logits = model(batch['input_ids'])
                predictions.append(logits.argmax(dim=1))
                target.append(batch['label'])
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    predictions = torch.cat(predictions)
    target = torch.cat(target)
    accuracy = (predictions == target).float().mean().item()

    return accuracy

## Ход работы
Оценка за домашнее задание складывается из четырех частей:
### Запуск базовой модели с семинара на новом датасете (1 балл)
На семинаре мы создали модель, которая дает на нашей задаче довольно высокое качество. Ваша цель --- обучить ее и вычислить `score`, который затем можно будет использовать в качестве бейзлайна.

В модели появится одно важное изменение: количество классов теперь равно не 2, а 4. Обратите на это внимание и найдите, что в коде создания модели нужно модифицировать, чтобы учесть это различие.

### Проведение экспериментов по улучшению модели (2 балла за каждый эксперимент)
Чтобы улучшить качество базовой модели, можно попробовать различные идеи экспериментов. Каждый выполненный эксперимент будет оцениваться в 2 балла. Для получения полного балла за этот пункт вам необходимо выполнить по крайней мере 2 эксперимента. Не расстраивайтесь, если какой-то эксперимент не дал вам прироста к качеству: он все равно зачтется, если выполнен корректно.

Вот несколько идей экспериментов:
* **Модель RNN**. Мы пока не проходили, как устроены другие нейросетевые модели --- LSTM и GRU. Тем не менее, никто не запрещает использовать их в этом эксперименте. Мы советуем обратить внимание на [GRU](https://pytorch.org/docs/stable/generated/torch.nn.GRU.html), так как интерфейс этого класса ничем не отличается от обычной Vanilla RNN, которую мы использовали на семинаре.
* **Увеличение количества рекуррентных слоев модели**. Это можно сделать с помощью параметра `num_layers` в классе `nn.RNN`. В такой модели выходы первой RNN передаются в качестве входов второй RNN и так далее.
* **Изменение архитектуры после применения RNN**. В базовой модели используется агрегация со всех эмбеддингов. Возможно, вы захотите конкатенировать результат агрегации и эмбеддинг с последнего токена.
* **Подбор гиперпараметров и обучение до сходимости**. Возможно, для получения более высокого качества просто необходимо увеличить количество эпох обучения нейросети, а также попробовать различные гиперпараметры: размер словаря, `dropout_rate`, `hidden_dim`.

Обратите внимание, что главное правило проведения экспериментов --- необходимо совершать одно архитектурное изменение в одном эксперименте. Если вы совершите несколько изменений, то будет неясно, какое именно из изменений дало прирост к качеству.

### Получение высокого качества (3 балла)
В конце вашей работы вы должны указать, какая из моделей дала лучший результат, и вывести качество, которое дает лучшая модель, с помощью функции `evaluate`. Ваша модель будет оцениваться по метрике `accuracy` следующим образом:
* $accuracy < 0.9$ --- 0 баллов;
* $0.9 \leqslant accuracy < 0.91$ --- 1 балл;
* $0.91 \leqslant accuracy < 0.915$ --- 2 балла;
* $0.915 \leqslant accuracy$ --- 3 балла.

### Оформление отчета (2 балла)
В конце работы подробно опишите все проведенные эксперименты.
* Укажите, какие из экспериментов принесли улучшение, а какие --- нет.
* Проанализируйте графики сходимости моделей в проведенных экспериментах. Являются ли колебания качества обученных моделей существенными в зависимости от эпохи обучения, или же сходимость стабильная?
* Укажите, какая модель получилась оптимальной.

Желаем удачи!

In [None]:
class SimpleRNN(nn.Module):
    """Embedding -> vanilla RNN -> pooling over time -> MLP classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_dim: size of the embeddings and of the RNN hidden state.
        dropout_rate: dropout probability inside the classifier head.
        rnn_layers: number of stacked RNN layers.
        n_classes: number of output logits.
        aggregation_type: how the per-token RNN outputs are pooled:
            ``'last'``, ``'max'`` or ``'mean'``.
        pad_id: id of the padding token. ``None`` (default) keeps the original
            behaviour, where padded positions take part in the pooling
            (``'last'`` then reads the state *after* the padding). When set,
            padded positions are ignored; this assumes right-padded batches.
    """

    def __init__(self,
                 vocab_size,
                 hidden_dim = 256,
                 dropout_rate = 0.1,
                 rnn_layers = 1,
                 n_classes = 4,
                 aggregation_type = 'last',
                 pad_id = None,
                ) -> None:
        super().__init__()
        self.embedding = nn.Sequential(
            nn.Embedding(vocab_size, hidden_dim),
            nn.LayerNorm(hidden_dim)
        )
        self.rnn = nn.RNN(hidden_dim, hidden_dim, num_layers=rnn_layers, batch_first=True)
        self.nonlinear = nn.Tanh()

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout_rate),          # regularization
            nn.Tanh(),
            nn.Linear(hidden_dim, n_classes) # final layer producing the logits
        )

        self.aggregation_type = aggregation_type
        self.pad_id = pad_id

    def forward(self, input_batch):
        """Map token ids ``[batch, seq_len]`` to logits ``[batch, n_classes]``.

        Raises:
            ValueError: if ``aggregation_type`` is not a supported value.
        """
        embeddings = self.embedding(input_batch)
        output, _ = self.rnn(embeddings)  # [batch, seq_len, hidden_dim]

        # True at real tokens, False at padding; None disables masking.
        mask = None if self.pad_id is None else (input_batch != self.pad_id)

        if self.aggregation_type == 'last':
            if mask is None:
                output = output[:,-1,:]
            else:
                # Index of the last real token in each row (right padding assumed).
                last_idx = mask.sum(dim=1).clamp(min=1) - 1
                rows = torch.arange(output.size(0), device=output.device)
                output = output[rows, last_idx]
        elif self.aggregation_type == 'max':
            if mask is not None:
                # -inf can never win the max, so padded positions are ignored.
                output = output.masked_fill(~mask.unsqueeze(-1), float('-inf'))
            output = output.max(dim=1)[0]
        elif self.aggregation_type == 'mean':
            if mask is None:
                output = output.mean(dim=1)
            else:
                weights = mask.unsqueeze(-1).to(output.dtype)
                # clamp avoids 0/0 for a row that consists of padding only
                output = (output * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
        else:
            raise ValueError('Invalid aggregation type')

        return self.classifier(self.nonlinear(output))

In [None]:
# Baseline with the default 'last' aggregation. The model is moved to
# `device` because the collate function already puts every batch there;
# without this the cell only works when `device` is 'cpu'.
model = SimpleRNN(vocab_size=len(vocab)).to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters())

In [None]:
def train_epoch(model, criterion, dataloader, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Puts the model into train mode, performs one optimizer step per batch
    and returns ``(mean loss per batch, fraction of correct predictions)``.
    Accuracy is measured on the logits computed before each update.
    """
    model.train()

    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(dataloader):
        inputs = batch['input_ids'].to(device)
        targets = batch['label'].to(device)

        logits = model(inputs)
        loss = criterion(logits, targets)
        loss_sum += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        hits = logits.argmax(dim=1) == targets
        n_correct += hits.sum().item()
        n_seen += targets.shape[0]

    accuracy = n_correct / n_seen
    return loss_sum / len(dataloader), accuracy

In [None]:
def train(model, criterion, optimizer, train_loader, val_loader, epochs = 5, scheduler = None,
          patience = 3, checkpoint_path = '/content/best_model_keypoints.pth'):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model, criterion, optimizer: passed through to ``train_epoch``.
        train_loader: batches used for optimisation.
        val_loader: batches scored with ``evaluate`` after every epoch.
        epochs: maximum number of epochs.
        scheduler: optional LR scheduler whose ``step`` takes the validation
            accuracy (e.g. ``ReduceLROnPlateau`` with ``mode='max'``).
        patience: stop after this many consecutive epochs without a new best
            validation accuracy.
        checkpoint_path: file that receives the ``state_dict`` of the best
            model so far. The default is the original Colab location; pass a
            different path per experiment so that runs do not overwrite each
            other. Missing parent directories are created.

    Returns:
        ``(loss_history, acc_history)``: the mean training loss per epoch and
        a dict with per-epoch ``'train'`` / ``'test'`` accuracies.

    Note:
        The model keeps the weights of the *last* epoch when this returns;
        the best weights are only stored in ``checkpoint_path``.
    """
    import os  # local import: only needed to create the checkpoint directory

    best_acc = 0
    epochs_no_improve = 0
    loss_history = []
    acc_history = {
        'train':[],
        'test':[]
    }

    for i in tqdm(range(epochs)):
        train_loss, train_acc = train_epoch(model, criterion, train_loader, optimizer)
        test_acc = evaluate(model, val_loader)

        acc_history['train'].append(train_acc)
        acc_history['test'].append(test_acc)
        loss_history.append(train_loss)

        print(f'Epoch {i+1}: train loss - {train_loss}, train accuracy - {train_acc}, test accuracy - {test_acc}')

        if scheduler is not None:
            scheduler.step(test_acc)

        if test_acc > best_acc:
            best_acc = test_acc
            epochs_no_improve = 0
            checkpoint_dir = os.path.dirname(checkpoint_path)
            if checkpoint_dir:
                # torch.save does not create directories by itself.
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
            print("✅ Model saved (best so far)")
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print("Early stopping")
            break

    return loss_history, acc_history

# Сравним aggregation type

In [None]:
# Train the baseline with train()'s default of 5 epochs.
# `history` is the (loss_history, acc_history) pair returned by train().
history = train(model, criterion, optimizer, train_dataloader, eval_dataloader)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 1.3864493152687967, train accuracy - 0.2727, test accuracy - 0.2653999924659729


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 1.1658253848590838, train accuracy - 0.41576, test accuracy - 0.5026000142097473


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.9499256533456779, train accuracy - 0.57244, test accuracy - 0.6118000149726868


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.850736371691381, train accuracy - 0.62968, test accuracy - 0.6417999863624573


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.7905620433585582, train accuracy - 0.6907, test accuracy - 0.7164000272750854


In [None]:
# Same architecture, but the RNN outputs are averaged over time.
# Moved to `device` so the weights live where the collated batches are.
mean_agg_model = SimpleRNN(vocab_size=len(vocab),aggregation_type='mean').to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(mean_agg_model.parameters())

In [None]:
# Train the mean-pooling model for at most 15 epochs (train() may stop early).
# Note: this overwrites `history` from the previous experiment.
history = train(mean_agg_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15)

In [None]:
# Same architecture, but with an element-wise max over the time axis.
# Moved to `device` so the weights live where the collated batches are.
max_agg_model = SimpleRNN(vocab_size=len(vocab),aggregation_type='max').to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(max_agg_model.parameters())

In [None]:
# Train the max-pooling model for at most 15 epochs (train() may stop early).
# Note: this overwrites `history` from the previous experiment.
history = train(max_agg_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.7728947157174901, train accuracy - 0.69638, test accuracy - 0.7871999740600586


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.5000001875849314, train accuracy - 0.82004, test accuracy - 0.8133999705314636


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.4299174882590733, train accuracy - 0.8444, test accuracy - 0.8119999766349792


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.39034444713036714, train accuracy - 0.86008, test accuracy - 0.8118000030517578


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.34928106923449975, train accuracy - 0.87224, test accuracy - 0.8208000063896179


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.32288326578296994, train accuracy - 0.8831, test accuracy - 0.8180000185966492


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.28711187627979257, train accuracy - 0.89642, test accuracy - 0.8116000294685364


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.2571440601241146, train accuracy - 0.9078, test accuracy - 0.8169999718666077


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 9: train loss - 0.23120662908803288, train accuracy - 0.91532, test accuracy - 0.8105999827384949


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 10: train loss - 0.2104564871268391, train accuracy - 0.92368, test accuracy - 0.8167999982833862


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 11: train loss - 0.1899114340634317, train accuracy - 0.931, test accuracy - 0.8202000260353088


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 12: train loss - 0.17219510444237954, train accuracy - 0.93674, test accuracy - 0.8091999888420105


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 13: train loss - 0.15571171731788783, train accuracy - 0.94294, test accuracy - 0.8137999773025513


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 14: train loss - 0.1463430634335098, train accuracy - 0.94688, test accuracy - 0.8044000267982483


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 15: train loss - 0.13378560385590518, train accuracy - 0.9512, test accuracy - 0.7978000044822693


# Добавим scheduler

In [None]:
# Fresh max-pooling model, moved to `device` so the weights live where the
# collated batches are.
max_agg_model = SimpleRNN(vocab_size=len(vocab),aggregation_type='max').to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(max_agg_model.parameters())

# Halve the learning rate once the monitored accuracy (mode='max') has not
# improved for more than 2 epochs. `verbose` is not passed: the argument is
# deprecated in recent PyTorch; use scheduler.get_last_lr() to inspect the rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)



In [None]:
# Same run as before, now with the plateau scheduler stepped on the
# validation accuracy after every epoch.
history = train(max_agg_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15, scheduler=scheduler)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.7741327468718159, train accuracy - 0.69662, test accuracy - 0.795799970626831


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.5020004823679963, train accuracy - 0.8179, test accuracy - 0.8104000091552734


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.43320392195662094, train accuracy - 0.84264, test accuracy - 0.8140000104904175


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.38772815011050826, train accuracy - 0.85962, test accuracy - 0.8123999834060669


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.3513733856880505, train accuracy - 0.8716, test accuracy - 0.8159999847412109


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.31751589673932973, train accuracy - 0.88484, test accuracy - 0.821399986743927


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.28590404122867646, train accuracy - 0.89552, test accuracy - 0.8173999786376953


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.2539354031148795, train accuracy - 0.90724, test accuracy - 0.8169999718666077


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 9: train loss - 0.23115178434772898, train accuracy - 0.91654, test accuracy - 0.8185999989509583


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 10: train loss - 0.20742735166760473, train accuracy - 0.92378, test accuracy - 0.8105999827384949


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 11: train loss - 0.18794172107684767, train accuracy - 0.93104, test accuracy - 0.8122000098228455


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 12: train loss - 0.17441293233421631, train accuracy - 0.9343, test accuracy - 0.8162000179290771


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 13: train loss - 0.15710112233678986, train accuracy - 0.9419, test accuracy - 0.8046000003814697


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 14: train loss - 0.14577882579943108, train accuracy - 0.94538, test accuracy - 0.8046000003814697


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 15: train loss - 0.1351945253813929, train accuracy - 0.9503, test accuracy - 0.8040000200271606


# Добавим слои в RNN

In [None]:
# Sweep the number of stacked RNN layers (2, 3 and 4) with everything else
# fixed. Every run is stored under its layer count; `history` alone would
# only keep the last one.
layer_histories = {}

for i in range(2, 5):
    # Moved to `device` so the weights live where the collated batches are.
    max_agg_model = SimpleRNN(vocab_size=len(vocab),aggregation_type='max', rnn_layers = i).to(device)
    criterion = CrossEntropyLoss()
    optimizer = Adam(max_agg_model.parameters())

    # `verbose` is not passed: the argument is deprecated in recent PyTorch.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=2
    )

    history = train(max_agg_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15, scheduler=scheduler)
    layer_histories[i] = history

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.8143325336110645, train accuracy - 0.67736, test accuracy - 0.77920001745224


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.5278840484294233, train accuracy - 0.8085, test accuracy - 0.8068000078201294


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.456965652848007, train accuracy - 0.83588, test accuracy - 0.8145999908447266


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.4155447128957575, train accuracy - 0.84976, test accuracy - 0.8230000138282776


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.3842825833004938, train accuracy - 0.86326, test accuracy - 0.8227999806404114


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.3572177489479421, train accuracy - 0.87144, test accuracy - 0.823199987411499


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.3363895204540292, train accuracy - 0.8799, test accuracy - 0.8205999732017517


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.31358006855166654, train accuracy - 0.88724, test accuracy - 0.8173999786376953


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 9: train loss - 0.29410131284949187, train accuracy - 0.89602, test accuracy - 0.8113999962806702


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 10: train loss - 0.27500355210076594, train accuracy - 0.90242, test accuracy - 0.8226000070571899


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 11: train loss - 0.2652469601110339, train accuracy - 0.90488, test accuracy - 0.8155999779701233


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 12: train loss - 0.24672405637076134, train accuracy - 0.91154, test accuracy - 0.8176000118255615


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 13: train loss - 0.23529539420373705, train accuracy - 0.91568, test accuracy - 0.8090000152587891


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 14: train loss - 0.22272517801913708, train accuracy - 0.92006, test accuracy - 0.8127999901771545


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 15: train loss - 0.2123313968666303, train accuracy - 0.92486, test accuracy - 0.8076000213623047


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.8568766067330073, train accuracy - 0.65868, test accuracy - 0.7748000025749207


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.5557781205401158, train accuracy - 0.8007, test accuracy - 0.8091999888420105


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.4833544487859375, train accuracy - 0.82828, test accuracy - 0.8123999834060669


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.436053805148571, train accuracy - 0.84632, test accuracy - 0.8095999956130981


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.405821479513755, train accuracy - 0.85578, test accuracy - 0.8248000144958496


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.3786425612752474, train accuracy - 0.86714, test accuracy - 0.8253999948501587


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.3613530766035377, train accuracy - 0.87304, test accuracy - 0.8235999941825867


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.33913153506963206, train accuracy - 0.88158, test accuracy - 0.8253999948501587


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 9: train loss - 0.32281253205866134, train accuracy - 0.88584, test accuracy - 0.8194000124931335


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 10: train loss - 0.3081873527067217, train accuracy - 0.8915, test accuracy - 0.8148000240325928


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 11: train loss - 0.29582408114180914, train accuracy - 0.89772, test accuracy - 0.8208000063896179


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 12: train loss - 0.28374578954410035, train accuracy - 0.90084, test accuracy - 0.8202000260353088


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 13: train loss - 0.276703722231562, train accuracy - 0.90302, test accuracy - 0.823199987411499


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 14: train loss - 0.26587857278236693, train accuracy - 0.90672, test accuracy - 0.8238000273704529


  0%|          | 0/1563 [00:00<?, ?it/s]

KeyboardInterrupt: 

# GRU

In [None]:
class SimpleGRU(nn.Module):
    """Embedding -> bidirectional GRU -> pooling over time -> MLP classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_dim: size of the embeddings and of each GRU direction.
        dropout_rate: dropout probability inside the classifier head.
        gru_layers: number of stacked GRU layers; dropout of 0.2 is applied
            between layers when there is more than one.
        n_classes: number of output logits.
        aggregation_type: ``'last'``, ``'max'`` or ``'mean'``.

    Note:
        Padded positions are not masked, so they take part in every
        aggregation type.
    """

    def __init__(self,
                 vocab_size,
                 hidden_dim = 256,
                 dropout_rate = 0.1,
                 gru_layers = 1,
                 n_classes = 4,
                 aggregation_type = 'mean',
                ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True, num_layers=gru_layers, bidirectional=True, dropout=0.2 if gru_layers > 1 else 0.0)
        self.nonlinear = nn.Tanh()  # kept for compatibility; not used in forward

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),  # *2: forward + backward states
            nn.Dropout(dropout_rate),          # regularization
            nn.Tanh(),
            nn.Linear(hidden_dim, n_classes) # final layer producing the logits
        )

        self.aggregation_type = aggregation_type

    def forward(self, input_batch):
        """Map token ids ``[batch, seq_len]`` to logits ``[batch, n_classes]``.

        Raises:
            ValueError: if ``aggregation_type`` is not a supported value.
        """
        embeddings = self.embedding(input_batch)
        output, hidden = self.gru(embeddings)

        if self.aggregation_type == 'last':
            # In a bidirectional GRU, output[:, -1] pairs the forward state
            # after the whole sequence with the backward state after a single
            # token. The final hidden states hold the fully processed sequence
            # for both directions: hidden[-2] is the last layer's forward
            # state, hidden[-1] its backward state.
            output = torch.cat([hidden[-2], hidden[-1]], dim=1)
        elif self.aggregation_type == 'max':
            output = output.max(dim=1)[0]
        elif self.aggregation_type == 'mean':
            output = output.mean(dim=1)
        else:
            raise ValueError('Invalid aggregation type')

        return self.classifier(output)

In [None]:
# Bidirectional GRU with max pooling, moved to `device` so the weights live
# where the collated batches are.
gru_model = SimpleGRU(vocab_size=len(vocab), aggregation_type='max').to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(gru_model.parameters())

# Halve the learning rate once the monitored accuracy (mode='max') has not
# improved for more than 2 epochs. `verbose` is not passed: the argument is
# deprecated in recent PyTorch; use scheduler.get_last_lr() to inspect the rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)

In [None]:
# Train the single-layer bidirectional GRU (max pooling) for at most 15 epochs.
history = train(gru_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15, scheduler=scheduler)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.6905437287560504, train accuracy - 0.73492, test accuracy - 0.8022000193595886


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.44630966763121155, train accuracy - 0.8372, test accuracy - 0.8199999928474426


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.36187345497620005, train accuracy - 0.86718, test accuracy - 0.8180000185966492


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.28711915917346587, train accuracy - 0.89586, test accuracy - 0.8145999908447266


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.2239747845704214, train accuracy - 0.91912, test accuracy - 0.8068000078201294


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.17201685777168796, train accuracy - 0.93842, test accuracy - 0.8068000078201294


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.13453996684330724, train accuracy - 0.95194, test accuracy - 0.8090000152587891


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.11274005509826568, train accuracy - 0.9602, test accuracy - 0.8098000288009644


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 9: train loss - 0.09804982759066817, train accuracy - 0.9645, test accuracy - 0.8133999705314636


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 10: train loss - 0.08804338037361696, train accuracy - 0.9676, test accuracy - 0.8065999746322632


  0%|          | 0/1563 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Two-layer GRU with the class default 'mean' pooling, plus a small weight
# decay in Adam. Moved to `device` so the weights live where the collated
# batches are.
updated_gru_model = SimpleGRU(vocab_size=len(vocab), gru_layers=2).to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(updated_gru_model.parameters(), lr=1e-3, weight_decay=1e-5)

# `verbose` is not passed: the argument is deprecated in recent PyTorch;
# use scheduler.get_last_lr() to inspect the rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)





In [None]:
# Train the two-layer GRU for at most 15 epochs with the plateau scheduler.
history = train(updated_gru_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15, scheduler=scheduler)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.7441219896059043, train accuracy - 0.70932, test accuracy - 0.8091999888420105


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.46270254544165373, train accuracy - 0.83372, test accuracy - 0.821399986743927


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 3: train loss - 0.3792934928015494, train accuracy - 0.86546, test accuracy - 0.8307999968528748


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 4: train loss - 0.3115564033565465, train accuracy - 0.88898, test accuracy - 0.8263999819755554


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 5: train loss - 0.2513000204877943, train accuracy - 0.90914, test accuracy - 0.8172000050544739


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 6: train loss - 0.19408273783894356, train accuracy - 0.92974, test accuracy - 0.8173999786376953


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 7: train loss - 0.15255966622105926, train accuracy - 0.94468, test accuracy - 0.8145999908447266


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 8: train loss - 0.12371476928978177, train accuracy - 0.95454, test accuracy - 0.8019999861717224


  0%|          | 0/1563 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
class GRUWithAttention(nn.Module):
    """Bidirectional GRU whose per-token states are pooled by learned attention.

    A linear layer scores every time step, the scores are soft-maxed over the
    sequence, and the weighted sum of the GRU states is fed to the classifier.
    Padded positions are scored like any other token.
    """

    def __init__(self,
                 vocab_size,
                 hidden_dim = 256,
                 dropout_rate = 0.1,
                 gru_layers = 1,
                 n_classes = 4,
                ) -> None:
        super().__init__()
        encoder_dim = hidden_dim * 2  # forward + backward states (bidirectional)
        inter_layer_dropout = 0.2 if gru_layers > 1 else 0.0

        self.embedding = nn.Sequential(
            nn.Embedding(vocab_size, hidden_dim),
            nn.LayerNorm(hidden_dim),
        )
        self.gru = nn.GRU(
            hidden_dim,
            hidden_dim,
            num_layers=gru_layers,
            batch_first=True,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )
        self.nonlinear = nn.Tanh()
        self.attn = nn.Linear(encoder_dim, 1)
        self.classifier = nn.Sequential(
            nn.Linear(encoder_dim, hidden_dim),
            nn.Dropout(dropout_rate),   # regularization
            nn.Tanh(),
            nn.Linear(hidden_dim, n_classes),   # final layer producing the logits
        )

    def forward(self, input_batch):
        """Map token ids ``[B, T]`` to class logits ``[B, n_classes]``."""
        token_states, _ = self.gru(self.embedding(input_batch))

        scores = self.attn(token_states).squeeze(-1)
        weights = torch.softmax(scores, dim=1)                               # [B, T]
        context = torch.sum(token_states * weights.unsqueeze(-1), dim=1)    # [B, 2D]

        return self.classifier(context)

In [None]:
# Two-layer GRU with attention pooling, moved to `device` so the weights live
# where the collated batches are.
gru_with_attention_model = GRUWithAttention(vocab_size=len(vocab), gru_layers=2).to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(gru_with_attention_model.parameters(), lr=1e-3, weight_decay=1e-5)

# `verbose` is not passed: the argument is deprecated in recent PyTorch;
# use scheduler.get_last_lr() to inspect the rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=2
)



In [None]:
# Train the attention model for at most 15 epochs with the plateau scheduler.
history = train(gru_with_attention_model, criterion, optimizer, train_dataloader, eval_dataloader, epochs=15, scheduler=scheduler)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1: train loss - 0.72124822989764, train accuracy - 0.72174, test accuracy - 0.8083999752998352


  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 2: train loss - 0.4538857127591653, train accuracy - 0.8376, test accuracy - 0.8185999989509583


  0%|          | 0/1563 [00:00<?, ?it/s]