# Sieci neuronowe i Deep Learning
# Temat 10: Rekurencyjne sieci neuronowe (wprowadzenie)

## Zadanie 10.1

Przeanalizować poniższy przykład przygotowania danych tekstowych do budowy modelu predykcyjnego oraz budowy sieci RNN do analizy sentymentu recenzji filmowych z IMDb (wielowarstwowy RNN, architektura many-to-one).

Spróbować zmodyfikować architekturę przedstawionej sieci – porównać wyniki.

In [24]:
!python --version

Python 3.11.12


In [25]:
!pip install torch==2.1.0 torchtext==0.16.0 torchdata==0.7.0 portalocker==2.2.0
!pip install numpy



In [26]:
# !pip list
import torchtext

In [27]:
import torch
import torch.nn as nn

# Przygotowanie zbioru danych z recenzjami

In [28]:
from torchtext.datasets import IMDB
from torch.utils.data.dataset import random_split

In [29]:
## Wczytanie danych i podział na train/test
train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

test_dataset = list(test_dataset)   #datapipe to list

W każdym zbiorze znajduje się po 25 000 przykładów: recenzja + etykieta (neg/pos).

In [30]:
torch.manual_seed(1)

# Wydzielamy train i valid z domyślnego traina:
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

In [31]:
## Kodujemy dane tekstowe: szukamy unikatowch słów (tokenów) -
## można wykorzystać klasę Counter z pakietu collections

import re
from collections import Counter, OrderedDict

token_counts = Counter()

# Gotowa funkcja do tokenizacji (czyści też tekst):
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)


print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [32]:
## Mapowanie unikalnych słów na integery (pakiet torchtext ma do tego klasę Vocab)
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)

vocab.insert_token("<pad>", 0)  # 0 - symbol ,,zastępczy"
vocab.insert_token("<unk>", 1)  # 1 - dla nieznanych tokenów
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]


In [33]:
if not torch.cuda.is_available():
    print("Warning: this code may be very slow on CPU")

Jeśli za wolno, to wziąć mniej danych.

In [34]:
## Definijemy funkcję do transformacji oraz funkcję do etykiet 0-1:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

from torchtext import __version__ as torchtext_version
from pkg_resources import parse_version

if parse_version(torchtext.__version__) > parse_version("0.10"):
    label_pipeline = lambda x: 1. if x == 2 else 0.         # 1 ~ negative, 2 ~ positive review
else:
    label_pipeline = lambda x: 1. if x == 'pos' else 0.


## Składamy w jedną funkcję:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text),
                                      dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(
        text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [35]:
## Test na 4 przykładach

from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
text_batch, label_batch, length_batch = next(iter(dataloader))

print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   35,  1739,     7,   449,   721,     6,   301,     4,   787,     9,
             4,    18,    44,     2,  1705,  2460,   186,    25,     7,    24,
           100,  1874,  1739,    25,     7, 34415,  3568,  1103,  7517,   787,
             5,     2,  4991, 12401,    36,     7,   148,   111,   939,     6,
         11598,     2,   172,   135,    62,    25,  3199,  1602,     3,   928,
          1500,     9,     6,  4601,     2,   155,    36,    14,   274,     4,
         42945,     9,  4991,     3,    14, 10296,    34,  3568,     8,    51,
           148,    30,     2,    58,    16,    11,  1893,   125,     6,   420,
          1214,    27, 14542,   940,    11,     7,    29,   951,    18,    17,
         15994,   459,    34,  2480, 15211,  3713,     2,   840,  3200,     9,
          3568,    13,   107,     9,   175,    94,    25,    51, 10297,  1796,
            27,   712,    16,     2,   220,    17,     4,    54,   722,   238,
           395,     2,   787,    32,    27,  5236,  

Mamy zamienione sekwencje słów na sekwencje liczb całkowitych, a etykiety na 1 lub 0.

Funckja `pad_sequence()` dopełniła przykłady zerami tak, aby wszystki przykłady w batchu miały ten sam kształt
(aby efektywnie przechowywać je w postaci tensorów).

In [36]:
## Dzielimy zbiory danych na batche o rozmiarze 32:

batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

## Osadzanie


 * `input_dim`: number of words, i.e. maximum integer index + 1.
 * `output_dim`:
 * `input_length`: the length of (padded) sequence
    * for example, `'This is an example' -> [0, 0, 0, 0, 0, 0, 3, 1, 8, 9]`   
    => input_lenght is 10



 * When calling the layer, takes integr values as input,   
 the embedding layer convert each interger into float vector of size `[output_dim]`
   * If input shape is `[BATCH_SIZE]`, output shape will be `[BATCH_SIZE, output_dim]`
   * If input shape is `[BATCH_SIZE, 10]`, output shape will be `[BATCH_SIZE, 10, output_dim]`

In [37]:
# Image(filename='figures/15_10.png', width=600)

In [38]:
embedding = nn.Embedding(num_embeddings=10,
                         embedding_dim=3,
                         padding_idx=0)

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[ 0.7039, -0.8321, -0.4651],
         [-0.3203,  2.2408,  0.5566],
         [-0.4643,  0.3046,  0.7046],
         [-0.7106, -0.2959,  0.8356]],

        [[-0.4643,  0.3046,  0.7046],
         [ 0.0946, -0.3531,  0.9124],
         [-0.3203,  2.2408,  0.5566],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


## Budowanie sieci RNN - przykład, jak to robić

* Warstwy rekurencyjne:
  * `nn.RNN(input_size, hidden_size, num_layers=1)`
  * `nn.LSTM(..)`
  * `nn.GRU(..)`
  * `nn.RNN(input_size, hidden_size, num_layers=1, bidirectional=True)`

In [39]:
# Jedna ukryta warstwa:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size,
                          hidden_size,
                          num_layers=2,
                          batch_first=True)
        # self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        return out

model = RNN(64, 32)

print(model)

model(torch.randn(5, 3, 64))

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


tensor([[ 0.3183],
        [ 0.1230],
        [ 0.1772],
        [-0.1052],
        [-0.1259]], grad_fn=<AddmmBackward0>)

## Model RNN do analizy sentymentu w recenzjach filmów z IMDb

In [40]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,
                           batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)

In [41]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [42]:
loss_fn = nn.BCELoss()  # Binary Cross-Entropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

In [43]:
torch.manual_seed(1)

!pip list | grep numpy

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

numpy                                 2.0.2
Epoch 0 accuracy: 0.5414 val_accuracy: 0.5594
Epoch 1 accuracy: 0.6250 val_accuracy: 0.6546
Epoch 2 accuracy: 0.7371 val_accuracy: 0.7662
Epoch 3 accuracy: 0.8234 val_accuracy: 0.8178
Epoch 4 accuracy: 0.8805 val_accuracy: 0.8430
Epoch 5 accuracy: 0.9115 val_accuracy: 0.8554
Epoch 6 accuracy: 0.9349 val_accuracy: 0.8610
Epoch 7 accuracy: 0.9492 val_accuracy: 0.8670
Epoch 8 accuracy: 0.9585 val_accuracy: 0.8640
Epoch 9 accuracy: 0.9656 val_accuracy: 0.8504


In [44]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8445


In [74]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        self.lstm = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)

        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, rnn_hidden_size)

        self.gru = nn.GRU(rnn_hidden_size, rnn_hidden_size, batch_first=True)

        self.fc3 = nn.Linear(rnn_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)

        out_packed = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu(), enforce_sorted=False, batch_first=True)
        lstm_out_packed, (hidden, cell) = self.lstm(out_packed)

        out = hidden[-1]

        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)

        out = out.unsqueeze(1)

        gru_out, gru_hidden = self.gru(out)

        out = gru_hidden[-1]

        out = self.fc3(out)
        out = self.sigmoid(out)
        return out


vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

In [75]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [76]:
batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,
                      shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size,
                     shuffle=False, collate_fn=collate_batch)

In [77]:
torch.manual_seed(1)

!pip list | grep numpy

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

numpy                                 2.0.2
Epoch 0 accuracy: 0.6085 val_accuracy: 0.7034
Epoch 1 accuracy: 0.7349 val_accuracy: 0.7594
Epoch 2 accuracy: 0.8038 val_accuracy: 0.7058
Epoch 3 accuracy: 0.8407 val_accuracy: 0.8122
Epoch 4 accuracy: 0.8581 val_accuracy: 0.8118
Epoch 5 accuracy: 0.8992 val_accuracy: 0.8260
Epoch 6 accuracy: 0.9157 val_accuracy: 0.8326
Epoch 7 accuracy: 0.8804 val_accuracy: 0.8328
Epoch 8 accuracy: 0.9008 val_accuracy: 0.8308
Epoch 9 accuracy: 0.8988 val_accuracy: 0.8316


In [78]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8281


In [81]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_size=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        embedded = self.embedding(text)

        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)

        packed_output, hidden = self.gru(packed)

        out = hidden[-1]

        out = self.fc(out)
        out = self.sigmoid(out)
        return out


vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

In [82]:
torch.manual_seed(1)

!pip list | grep numpy

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

numpy                                 2.0.2
Epoch 0 accuracy: 0.5971 val_accuracy: 0.6220
Epoch 1 accuracy: 0.7363 val_accuracy: 0.7658
Epoch 2 accuracy: 0.8348 val_accuracy: 0.8292
Epoch 3 accuracy: 0.8897 val_accuracy: 0.8516
Epoch 4 accuracy: 0.9177 val_accuracy: 0.8558
Epoch 5 accuracy: 0.9378 val_accuracy: 0.8538
Epoch 6 accuracy: 0.9539 val_accuracy: 0.8506
Epoch 7 accuracy: 0.9626 val_accuracy: 0.8334
Epoch 8 accuracy: 0.9635 val_accuracy: 0.8410
Epoch 9 accuracy: 0.9812 val_accuracy: 0.8428


In [83]:
acc_test, _ = evaluate(test_dl)
print(f'test_accuracy: {acc_test:.4f}')

test_accuracy: 0.8383
