# Задание 3

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [1]:
import pandas as pd
import numpy as np
import torch
import nltk
from torchtext.legacy import datasets
from torchtext.legacy.data import Field, LabelField
from torchtext.legacy.data import BucketIterator

from torchtext.vocab import Vectors, GloVe

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm

В этом задании мы будем использовать библиотеку torchtext. Она довольна проста в использовании и поможет нам сконцентрироваться на задаче, а не на написании Dataloader-а.

In [2]:
TEXT = Field(sequential=True, lower=True, include_lengths=True)  # Поле текста
LABEL = LabelField(dtype=torch.float)  # Поле метки

In [3]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Датасет на котором мы будем проводить эксперементы это комментарии к фильмам из сайта IMDB.

In [4]:
train, test = datasets.IMDB.splits(TEXT, LABEL)  # загрузим датасет
train, valid = train.split(random_state=random.seed(SEED))  # разобьем на части

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 21.8MB/s]


In [5]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test), 
    batch_size = 64,
    sort_within_batch = True,
    device = device)

## RNN

Для начала попробуем использовать рекурентные нейронные сети. На семинаре вы познакомились с GRU, вы можете также попробовать LSTM. Можно использовать для классификации как hidden_state, так и output последнего токена.

In [7]:
# class RNN(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim):
#         super().__init__()
#         self.hidden_dim = hidden_dim
#         self.w_h = nn.Parameter(torch.rand(hidden_dim, hidden_dim))
#         self.b_h = nn.Parameter(torch.rand(1, hidden_dim))
#         self.w_x = nn.Parameter(torch.rand(embedding_dim, hidden_dim))
#         self.b_x = nn.Parameter(torch.rand(1, hidden_dim))
        
#     def forward(self, x, hidden=None):
#         if hidden is None:
#             hidden = torch.zeros((x.size(0), self.hidden_dim)).to(x.device)
#         seq_length = x.size(1)
#         for idx in range(seq_length):
#             hidden = torch.tanh(
                
#                 x[:, idx] @ self.w_x + self.b_x + hidden @ self.w_h + self.b_h
            
#             )
#         return hidden


# class RNN(nn.Module):
#     def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
#         super().__init__()
        
#         self.embedding = nn.Embedding(input_dim, embedding_dim)
        
#         self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
#         self.fc = nn.Linear(hidden_dim, output_dim)
        
#     def forward(self, text):

#         #text = [sent len, batch size]
        
#         embedded = self.embedding(text)
        
#         #embedded = [sent len, batch size, emb dim]
        
#         output, hidden = self.rnn(embedded)
        
#         #output = [sent len, batch size, hid dim]
#         #hidden = [1, batch size, hid dim]
        
#         assert torch.equal(output[-1,:,:], hidden.squeeze(0))
#         return self.fc(hidden.squeeze(0))

# INPUT_DIM = len(TEXT.vocab)
# EMBEDDING_DIM = 100
# HIDDEN_DIM = 256
# OUTPUT_DIM = 1

# model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

In [8]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_dim, 
            num_layers=n_layers, 
            bidirectional=bidirectional, 
            dropout=dropout
        ) 

        linear_dim = hidden_dim * (bidirectional + 1) * 2
        self.dropout = nn.Dropout()
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2 * 2, 300),
            nn.ReLU(),
            nn.Linear(300, 300),
            nn.ReLU(),
            nn.Linear(300, output_dim)
        )
        
    def forward(self, text, text_lengths):
        #text = [sent len, batch size]
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = torch.cat((output[-2, :, :], output[-1, :, :]), dim=1)
        
        return self.fc(hidden)

Поиграйтесь с гиперпараметрами

In [9]:
vocab_size = len(TEXT.vocab)
emb_dim = 300
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3

In [10]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX
).to(device)

In [11]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

max_epochs = 10

Обучите сетку! Используйте любые вам удобные инструменты, Catalyst, PyTorch Lightning или свои велосипеды.

In [12]:
min_loss = np.inf
cur_patience = 0
patience= 5

for epoch in range(1, max_epochs + 1):
    train_accuracy = 0
    val_accuracy = 0
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        #YOUR CODE GOES HERE
        texts, length, labels = batch.text[0].to(device), batch.text[1].cpu(), batch.label.to(device)
        opt.zero_grad()
        outputs = model(texts, length).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        loss.backward()
        opt.step()
        train_loss += loss.item()
        train_accuracy += (result == labels).sum() / len(labels)

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        texts, length, labels = batch.text[0].to(device), batch.text[1].cpu(), batch.label.to(device)
        outputs = model(texts, length).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        val_accuracy += (result == labels).sum() / len(labels)
        val_loss += loss.item()

    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
    print('Epoch: {}, Training Accuracy: {}, Validation Accuracy: {}'.format(epoch, train_accuracy.item() / len(train_iter), val_accuracy.item() / len(valid_iter)))
    print('\n')

model.load_state_dict(best_model)

  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6886530973180367, Validation Loss: 0.663939253758576
Epoch: 1, Training Accuracy: 0.515396897810219, Validation Accuracy: 0.517478813559322




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.6392913256248418, Validation Loss: 0.6526117986541683
Epoch: 2, Training Accuracy: 0.5837053730540032, Validation Accuracy: 0.607565346410719




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.5877803067003724, Validation Loss: 0.5996285207190756
Epoch: 3, Training Accuracy: 0.6211792883211679, Validation Accuracy: 0.6115378040378376




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.5292442506247194, Validation Loss: 0.5921907942678969
Epoch: 4, Training Accuracy: 0.6549139649328524, Validation Accuracy: 0.6192178887836004




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.48674963043499603, Validation Loss: 0.5974481189149922
Epoch: 5, Training Accuracy: 0.6711581153591184, Validation Accuracy: 0.623190346410719




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.4565311718840886, Validation Loss: 0.6217863915835397
Epoch: 6, Training Accuracy: 0.685504105839416, Validation Accuracy: 0.6382856853937698




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.4472944191589027, Validation Loss: 0.6602842191518363
Epoch: 7, Training Accuracy: 0.695540602189781, Validation Accuracy: 0.6411988209869902




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.43955311769565203, Validation Loss: 0.6866675537016432
Epoch: 8, Training Accuracy: 0.6974713402072878, Validation Accuracy: 0.6462306006480072




  0%|          | 0/274 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

<All keys matched successfully>

Посчитайте f1-score вашего классификатора на тестовом датасете.

In [13]:
from sklearn.metrics import f1_score

In [14]:
pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
preds = []
test_true = [] 

model.eval()
with torch.no_grad():
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        texts, lengths, labels = batch.text[0].to(device), batch.text[1].cpu(), batch.label.to(device)
        outputs = model(texts, lengths).squeeze(1)
        outputs = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs >= 0.5), dtype=torch.int32)
        preds.append(result.cpu().numpy())
        test_true.append(labels.cpu().numpy())
        
preds = [c for i in preds for c in i]
test_true = [c for i in test_true for c in i]

f1_result = f1_score(np.array(test_true), np.array(preds)) 
print('F1 score for RNNModel:', round(f1_result, 2))

  0%|          | 0/391 [00:00<?, ?it/s]

F1 score for RNNModel: 0.72


## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

Для классификации текстов также часто используют сверточные нейронные сети. Идея в том, что как правило сентимент содержат словосочетания из двух-трех слов, например "очень хороший фильм" или "невероятная скука". Проходясь сверткой по этим словам мы получим какой-то большой скор и выхватим его с помощью MaxPool. Далее идет обычная полносвязная сетка. Важный момент: свертки применяются не последовательно, а параллельно. Давайте попробуем!

In [15]:
TEXT = Field(sequential=True, lower=True, batch_first=True)  # batch_first тк мы используем conv  
LABEL = LabelField(batch_first=True, dtype=torch.float)

train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(trn)
LABEL.build_vocab(trn)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [16]:
train_iter, valid_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`. Но хорошенько подумайте над shape в обоих случаях.

In [17]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv1d(in_channels=emb_dim, out_channels=out_channels, kernel_size=kernel_sizes[0], padding=1, stride=2)

        self.conv_1 = nn.Conv1d(in_channels=emb_dim, out_channels=out_channels, kernel_size=kernel_sizes[0], padding=1, stride=2)
        
        self.conv_2 = nn.Conv1d(in_channels=emb_dim, out_channels=out_channels, kernel_size=kernel_sizes[0], padding=1, stride=2)
        
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        embedded = embedded.permute(0, 2, 1)  # may be reshape here
        conved_0 = F.relu(self.conv_0(embedded))  # may be reshape here
        conved_1 = F.relu(self.conv_1(embedded))  # may be reshape here
        conved_2 = F.relu(self.conv_2(embedded))  # may be reshape here
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
            
        return self.fc(cat)

In [18]:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels=64
dropout = 0.5
dim = 300

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout).to(device)

In [19]:
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

Обучите!

In [20]:
max_epochs = 10
min_loss = np.inf
cur_patience = 0
patience= 3

for epoch in range(1, max_epochs + 1):
    train_accuracy = 0
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        #YOUR CODE GOES HERE
        texts, labels = batch.text.to(device), batch.label.to(device)
        opt.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        loss.backward()
        opt.step()
        train_loss += loss.item()
        train_accuracy += (result == labels).sum() / len(labels)

    train_loss /= len(train_iter)
    val_accuracy = 0
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        texts, labels = batch.text.to(device), batch.label.to(device)
        outputs = model(texts).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        val_accuracy += (result == labels).sum() / len(labels)
        val_loss += loss.item()

    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
    print('Epoch: {}, Training Accuracy: {}, Validation Accuracy: {}'.format(epoch, train_accuracy.item() / len(train_iter), val_accuracy.item() / len(valid_iter)))
    print('\n')

model.load_state_dict(best_model)

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.6663886308670044, Validation Loss: 0.5168046861886978
Epoch: 1, Training Accuracy: 0.6015302560625285, Validation Accuracy: 0.6967653274536133




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.5288037934007436, Validation Loss: 0.4650247812271118
Epoch: 2, Training Accuracy: 0.7200243873317747, Validation Accuracy: 0.733401870727539




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.45164665862591596, Validation Loss: 0.40963948766390484
Epoch: 3, Training Accuracy: 0.773415168706518, Validation Accuracy: 0.8061677932739257




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.39148892212088093, Validation Loss: 0.3809467673301697
Epoch: 4, Training Accuracy: 0.8144239996471544, Validation Accuracy: 0.8277960459391276




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.3359178942050377, Validation Loss: 0.36210203965504967
Epoch: 5, Training Accuracy: 0.8497674204137203, Validation Accuracy: 0.8294544855753581




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.2709610456750341, Validation Loss: 0.3542745898167292
Epoch: 6, Training Accuracy: 0.8815876758881729, Validation Accuracy: 0.8281524022420247




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.21438246682612566, Validation Loss: 0.3491907159487406
Epoch: 7, Training Accuracy: 0.9102442943266709, Validation Accuracy: 0.8374314626057943




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.1634415017209784, Validation Loss: 0.3513889729976654
Epoch: 8, Training Accuracy: 0.9364462525305086, Validation Accuracy: 0.8501713434855144




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 9, Training Loss: 0.11486119388120018, Validation Loss: 0.37203920980294547
Epoch: 9, Training Accuracy: 0.95702746314724, Validation Accuracy: 0.8484649022420248




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

<All keys matched successfully>

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [21]:
pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
preds = []
test_true = [] 

model.eval()
with torch.no_grad():
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        texts, labels = batch.text.to(device), batch.label.to(device)
        outputs = model(texts).squeeze(1)
        outputs = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs >= 0.5), dtype=torch.int32)
        preds.append(result.cpu().numpy())
        test_true.append(labels.cpu().numpy())
        
preds = [c for i in preds for c in i]
test_true = [c for i in test_true for c in i]

f1_result = f1_score(np.array(test_true), np.array(preds)) 
print('F1 score for CNN Model:', round(f1_result, 2))

  0%|          | 0/98 [00:00<?, ?it/s]

F1 score for CNN Model: 0.84


## Интерпретируемость

Посмотрим, куда смотрит наша модель. Достаточно запустить код ниже.

In [22]:
!pip install -q captum



In [23]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [24]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [25]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 1.00 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([4.8728e-07], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.01 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([7.6702e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.02 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([9.4791e-05], device='cuda:0', dtype=torch.float64)


Попробуйте добавить свои примеры!

In [26]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (1.00),pos,1.81,It was a fantastic performance ! pad
,,,,
pos,neg (0.00),pos,1.29,Best film ever pad pad pad pad
,,,,
pos,neg (0.01),pos,1.32,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.34,It was a horrible movie pad pad
,,,,
neg,neg (0.02),pos,1.74,I've never watched something as bad pad
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (1.00),pos,1.81,It was a fantastic performance ! pad
,,,,
pos,neg (0.00),pos,1.29,Best film ever pad pad pad pad
,,,,
pos,neg (0.01),pos,1.32,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.34,It was a horrible movie pad pad
,,,,
neg,neg (0.02),pos,1.74,I've never watched something as bad pad
,,,,


## Эмбеддинги слов

Вы ведь не забыли, как мы можем применить знания о word2vec и GloVe. Давайте попробуем!

In [27]:
from torchtext.vocab import Vectors, GloVe

vecs = GloVe(dim=dim)

.vector_cache/glove.840B.300d.zip: 2.18GB [06:51, 5.29MB/s]                            
100%|█████████▉| 2196016/2196017 [04:23<00:00, 8326.26it/s]


In [28]:
TEXT.build_vocab(trn, vectors=vecs)
LABEL.build_vocab(trn)

word_embeddings = TEXT.vocab.vectors

kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300

In [29]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

device = "cuda" if torch.cuda.is_available() else "cpu"
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

In [30]:
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=64,
            kernel_sizes=kernel_sizes, dropout=dropout)

word_embeddings = TEXT.vocab.vectors

prev_shape = model.embedding.weight.shape

with torch.no_grad():
    model.embedding.weight = nn.Parameter(word_embeddings)  # инициализируйте эмбэдинги

assert prev_shape == model.embedding.weight.shape
model.to(device)

opt = torch.optim.Adam(model.parameters())

Вы знаете, что делать.

In [31]:
def freeze_embeddings(model, req_grad=False):
    embeddings = model.embedding
    for c_p in embeddings.parameters():
        c_p.requires_grad = req_grad

In [32]:
max_epochs = 20
min_loss = np.inf
cur_patience = 0
patience= 5
num_iter = 0
num_freeze_iter = 1000
freeze_embeddings(model)
status = True

for epoch in range(1, max_epochs + 1):
    train_accuracy = 0
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        if num_iter > num_freeze_iter and status == True:
            print('unfreeze')
            freeze_embeddings(model, True)
            status = False
            
        texts, labels = batch.text.to(device), batch.label.to(device)
        opt.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        loss.backward()
        opt.step()
        train_loss += loss.item()
        train_accuracy += (result == labels).sum() / len(labels)
        num_iter += 1

    train_loss /= len(train_iter)
    val_accuracy = 0
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        texts, labels = batch.text.to(device), batch.label.to(device)
        outputs = model(texts).squeeze(1)
        loss = loss_func(outputs, labels)
        result = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs - 0.5) > 0, dtype=torch.int32)
        val_accuracy += (result == labels).sum() / len(labels)
        val_loss += loss.item()

    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
    print('Epoch: {}, Training Accuracy: {}, Validation Accuracy: {}'.format(epoch, train_accuracy.item() / len(train_iter), val_accuracy.item() / len(valid_iter)))
    print('\n')

model.load_state_dict(best_model)

  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.5246513936206372, Validation Loss: 0.3790793627500534
Epoch: 1, Training Accuracy: 0.6965422804338218, Validation Accuracy: 0.8046669642130534




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.38417495961171866, Validation Loss: 0.34545354346434276
Epoch: 2, Training Accuracy: 0.8248919187671077, Validation Accuracy: 0.8283306121826172




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 0.33928847389064565, Validation Loss: 0.3236917992432912
Epoch: 3, Training Accuracy: 0.8449004235928946, Validation Accuracy: 0.8532004038492839




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 4, Training Loss: 0.3036625374407664, Validation Loss: 0.3142474085092545
Epoch: 4, Training Accuracy: 0.8659626758881729, Validation Accuracy: 0.857463010152181




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 0.27677330820664875, Validation Loss: 0.3101487264037132
Epoch: 5, Training Accuracy: 0.8791678212854984, Validation Accuracy: 0.8540295918782552




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 6, Training Loss: 0.2527461830716934, Validation Loss: 0.3046643500526746
Epoch: 6, Training Accuracy: 0.893141586415089, Validation Accuracy: 0.8627535502115885




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 0.22959882271115797, Validation Loss: 0.3119339684645335
Epoch: 7, Training Accuracy: 0.9048069863423814, Validation Accuracy: 0.8507743835449219




  0%|          | 0/137 [00:00<?, ?it/s]

unfreeze


  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 8, Training Loss: 0.2090800141331053, Validation Loss: 0.2896758943796158
Epoch: 8, Training Accuracy: 0.9128004756287067, Validation Accuracy: 0.8706962585449218




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 9, Training Loss: 0.13601574872749567, Validation Loss: 0.28994656950235365
Epoch: 9, Training Accuracy: 0.9513944082886633, Validation Accuracy: 0.8712170918782552




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 10, Training Loss: 0.0720941197317447, Validation Loss: 0.30673761467138927
Epoch: 10, Training Accuracy: 0.9786748955719662, Validation Accuracy: 0.8662691752115885




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 11, Training Loss: 0.03925948371145412, Validation Loss: 0.31688542664051056
Epoch: 11, Training Accuracy: 0.990717310104927, Validation Accuracy: 0.8768160502115886




  0%|          | 0/137 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

<All keys matched successfully>

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [33]:
pbar = tqdm(enumerate(test_iter), total=len(test_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
preds = []
test_true = [] 

model.eval()
with torch.no_grad():
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        texts, labels = batch.text.to(device), batch.label.to(device)
        outputs = model(texts).squeeze(1)
        outputs = torch.sigmoid(outputs)
        result = torch.as_tensor((outputs >= 0.5), dtype=torch.int32)
        preds.append(result.cpu().numpy())
        test_true.append(labels.cpu().numpy())
        
preds = [c for i in preds for c in i]
test_true = [c for i in test_true for c in i]

f1_result = f1_score(np.array(test_true), np.array(preds)) 
print('F1 score for CNN Model with embeddings:', round(f1_result, 2))

  0%|          | 0/98 [00:00<?, ?it/s]

F1 score for CNN Model with embeddings: 0.88


Проверим насколько все хорошо!

In [34]:
PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)
vis_data_records_ig = []

interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.99 ) , delta:  tensor([0.0003], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.16 ) , delta:  tensor([1.8256e-06], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.54 ) , delta:  tensor([1.5637e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([7.0860e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.19 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([8.5678e-05], device='cuda:0', dtype=torch.float64)


In [35]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.99),pos,1.83,It was a fantastic performance ! pad
,,,,
pos,neg (0.16),pos,1.71,Best film ever pad pad pad pad
,,,,
pos,pos (0.54),pos,1.64,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.84,It was a horrible movie pad pad
,,,,
neg,neg (0.19),pos,0.67,I've never watched something as bad pad
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.99),pos,1.83,It was a fantastic performance ! pad
,,,,
pos,neg (0.16),pos,1.71,Best film ever pad pad pad pad
,,,,
pos,pos (0.54),pos,1.64,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.84,It was a horrible movie pad pad
,,,,
neg,neg (0.19),pos,0.67,I've never watched something as bad pad
,,,,
