In [2]:
import os
import numpy as np
import random

import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.legacy import data, datasets

In [4]:
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
!mkdir data

#Сентимент анализ

Домашка — классифицировать отзывы с IMDB на положительный / отрицательный только по тексту.

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/raw/bf8cc46e4823ebf9af721b595501ad6231c73632/assets/sentiment1.png">

Суть такая же, только нужно предобработать тексты — каждому слову сопоставить обучаемый вектор (embedding), который пойдёт дальше в RNN.

In [7]:
!pip install torchtext
!python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 12.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [8]:
TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [9]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL, root="./data")

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 10.5MB/s]


In [10]:
ls -lh data/imdb/aclImdb/

total 1.7M
-rw-r--r-- 1 7297 1000 882K Jun 11  2011 imdbEr.txt
-rw-r--r-- 1 7297 1000 827K Apr 12  2011 imdb.vocab
-rw-r--r-- 1 7297 1000 4.0K Jun 26  2011 README
drwxr-xr-x 4 7297 1000 4.0K Apr 12  2011 [0m[01;34mtest[0m/
drwxr-xr-x 5 7297 1000 4.0K Jun 26  2011 [01;34mtrain[0m/


In [11]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [12]:
print(vars(train_data.examples[0]))

{'text': ['This', 'movie', 'is', 'the', 'first', 'of', 'Miikes', 'triad', 'society', 'trilogy', ',', 'and', 'the', 'trilogy', 'kicks', 'of', 'to', 'a', 'great', 'start', '.', 'The', 'movies', 'in', 'the', 'trilogy', 'are', 'only', 'connected', 'thematically', ',', 'and', 'these', 'themes', 'are', 'actually', 'apparent', 'in', 'all', 'his', 'films', ',', 'if', 'you', 'look', 'close', 'enough', '.', 'Shinjuku', 'Triad', 'Society', 'is', 'about', 'a', 'cop', 'trying', 'to', 'prevent', 'his', 'kid', 'brother', 'from', 'getting', 'too', 'involved', 'with', 'a', 'rather', 'extreme', 'gang', 'of', 'outsiders', ',', 'struggling', 'their', 'way', 'to', 'the', 'top', 'of', 'Tokyos', 'yakuza', '.', 'The', 'kid', 'brother', 'is', 'a', 'lawyer', ',', 'and', 'the', 'triad', 'gang', 'is', 'becoming', 'increasingly', 'in', 'need', 'of', 'one', ',', 'as', 'the', 'movie', 'progresses', '.', 'The', 'movie', 'takes', 'place', 'in', 'a', 'very', 'harsh', 'environment', ',', 'and', 'is', 'therefore', 'prett

In [13]:
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [14]:
TEXT.build_vocab(train_data, max_size=25000)
LABEL.build_vocab(train_data)

In [15]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [16]:
vars(LABEL.vocab)

{'freqs': Counter({'neg': 8810, 'pos': 8690}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(None, {'neg': 0, 'pos': 1}),
 'unk_index': None,
 'vectors': None}

Почему 25002, а не 25000?
Потому что $<unk>$ и $<pad>$

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/raw/bf8cc46e4823ebf9af721b595501ad6231c73632/assets/sentiment6.png" width="160">

In [17]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 203522), (',', 193141), ('.', 166396), ('a', 109712), ('and', 109710), ('of', 100686), ('to', 93968), ('is', 76710), ('in', 61305), ('I', 54268), ('it', 53492), ('that', 49245), ('"', 44030), ("'s", 43462), ('this', 42390), ('-', 37048), ('/><br', 35651), ('was', 35208), ('as', 30388), ('with', 30134)]


In [18]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


In [19]:
print(LABEL.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [20]:
BATCH_SIZE = 64

# собираем батчи так, чтобы в каждом батче были примеры наиболее похожей длины
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    device=device)

#Модель 

<img src="https://github.com/bentrevett/pytorch-sentiment-analysis/raw/bf8cc46e4823ebf9af721b595501ad6231c73632/assets/sentiment7.png" width="450">

* В эмбеддер (emb = [torch.nn.Embedding(num_embeddings, embedding_dim)](https://pytorch.org/docs/stable/nn.html?highlight=embedding#torch.nn.Embedding)) запихиваем тензор размерностью **[sentence length, batch size]**
* Эмбеддер возвращает тензор размерностью **[sentence length, batch size, embedding dim]**
* RNN (torch.nn.RNN(embedding_dim, hidden_dim)) возвращает 2 тензора, *output* размера [sentence length, batch size, hidden dim] и *hidden* размера [1, batch size, hidden dim]

In [21]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 2**7
HIDDEN_DIM = 2**8
OUTPUT_DIM = 1
N_EPOCHS = 10

In [36]:
def accuracy(y_pred, y_true):
    return ((torch.round(torch.sigmoid(y_pred)) == y_true).float().sum() / len(y_true)).item()

In [40]:
class RNN(nn.Module):
  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()

    self.input_dim = input_dim
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim

    self.embedding = nn.Embedding(input_dim, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)
  
  def forward(self, x):
    embedded = self.embedding(x)
    output, hidden = self.rnn(embedded)
    result = self.fc(hidden.squeeze(0))

    assert torch.equal(output[-1,:,:], hidden.squeeze(0))

    return result 

In [41]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-1)
criterion = nn.BCEWithLogitsLoss().to(device)

In [42]:
history = []

for epoch in range(N_EPOCHS):
  model.train()
  history_accuracy = []

  for batch in train_iterator:
    X = batch.text
    Y = batch.label

    optimizer.zero_grad()

    predictions = model(X).squeeze(1)
    loss = criterion(predictions, Y)

    loss.backward()
    optimizer.step()

    history.append(loss.item())
    history_accuracy.append(accuracy(predictions, Y))

  print(f"Epoch = {epoch + 1}")
  print(f"Loss = {history[-1]}")
  print(f"Accuracy = {round(sum(history_accuracy) / len(train_iterator) * 100, 4)}%")

Epoch = 1
Loss = 0.6914979219436646
Accuracy = 49.4917%
Epoch = 2
Loss = 0.6919795870780945
Accuracy = 50.0081%
Epoch = 3
Loss = 0.6917856931686401
Accuracy = 49.6505%
Epoch = 4
Loss = 0.6919641494750977
Accuracy = 49.3825%
Epoch = 5
Loss = 0.6931459903717041
Accuracy = 50.1996%
Epoch = 6
Loss = 0.6923484206199646
Accuracy = 51.1617%
Epoch = 7
Loss = 0.6917925477027893
Accuracy = 50.2802%
Epoch = 8
Loss = 0.6942917108535767
Accuracy = 49.8306%
Epoch = 9
Loss = 0.6930854320526123
Accuracy = 49.9487%
Epoch = 10
Loss = 0.6934469938278198
Accuracy = 50.5116%
