<a href="https://colab.research.google.com/github/nandenjin/mlpr-class/blob/use-legacy-torchtext/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
import torchtext

In [39]:
def tokenizer(text):
    return text.split()

TEXT = torchtext.legacy.data.Field(sequential=True, batch_first=True, tokenize=tokenizer, lower=True)
LABEL = torchtext.legacy.data.Field(sequential=False)

In [40]:
train, valid, test = torchtext.legacy.datasets.SST.splits(TEXT, LABEL, root='data')

In [41]:
# To use IMDb, which is a movie review dataset with positive/negative sentiment tags, the following code is used.
# （ポジティブ/ネガティブの感情タグ付きの映画のレビューデータセットであるIMDbを用いる場合は以下のコードを用いる．）
#
# train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL, root='data')

In [42]:
# To use your own dataset, use the following code.
# This code assumes that you have train.tsv and test.tsv files in the data/my_text_dataset folder, 
# which contain tab-delimited pairs of "text" and "label" for each line.
# （自作のデータセットを用いる場合は，以下のコードを用いる．
# このコードでは，data/my_text_dataset フォルダに，タブ区切りで「テキスト」と「ラベル」を
# 1行に1組ずつ列挙した train.tsv および test.tsv のファイルがあることを想定している．）
# 
# **Note**: In the following code, the text and labels of the batch (i.e., data) are accessed as
# data.text and data.label, but when using the TabularDataset class, these should be replaced with
# data.Text and data.Label. 
# As of June 24, 2020, the variable names in torchtext seem to be inconsistent.
# If you don't rewrite it, you will get the error `'Batch' object has no attribute 'text'`.
# （**注意**：以降のコードでは，バッチ（変数名を data とする）のテキストやラベルを data.text や data.label として
# アクセスしているが，TabularDataset クラスを用いる場合には，これを data.Text や data.Label と書き換えること．
# 2020年6月24日時点で，torchtextの中で変数名が一貫していないようである．
# 書き換えないと `'Batch' object has no attribute 'text'` というエラーが出る．）
#
# train, test = torchtext.data.TabularDataset.splits(path='./data/my_text_dataset',
#                                          train='train.tsv', test='test.tsv', format='tsv',
#                                          fields=[('Text', TEXT), ('Label', LABEL)])

In [43]:
TEXT.build_vocab(train, max_size=25000)
# To use a pre-trained word embedding vector, use the following code.
# （事前学習済みの単語埋め込みベクトルを用いる場合は，以下のコードを用いる．）
# TEXT.build_vocab(train, vectors="glove.6B.100d")

LABEL.build_vocab(train)

In [44]:
print(LABEL.vocab.stoi.items())

dict_items([('<unk>', 0), ('positive', 1), ('negative', 2), ('neutral', 3)])


In [45]:
print(list(TEXT.vocab.stoi.items())[:20])
print(len(TEXT.vocab))

[('<unk>', 0), ('<pad>', 1), ('.', 2), ('the', 3), (',', 4), ('a', 5), ('and', 6), ('of', 7), ('to', 8), ('is', 9), ("'s", 10), ('it', 11), ('that', 12), ('in', 13), ('as', 14), ('but', 15), ('film', 16), ('with', 17), ('for', 18), ('this', 19)]
16581


In [46]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
trainloader, testloader = torchtext.legacy.data.BucketIterator.splits((train, test), batch_size=4, device=device)

In [47]:
dataiter = iter(trainloader)

In [48]:
data = dataiter.__next__()
x, y = data.text, data.label
for x_i in x:
    print(' '.join(TEXT.vocab.itos[w] for w in x_i))
print([LABEL.vocab.itos[yi] for yi in y])

the picture runs a mere 84 minutes , but it 's no glance . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
it 's a drawling , slobbering , lovable run-on sentence of a film , a southern gothic with the emotional arc of its raw blues soundtrack .
topics that could make a sailor blush - but lots of laughs . <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
a crude teen-oriented variation on a theme that the playwright craig lucas explored with infinitely more grace and eloquence in his prelude to a kiss . <pad>
['positive', 'neutral', 'positive', 'neutral']


In [49]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        output, (hn, cn) = self.lstm(x)
        hn = hn.squeeze(0)
        return self.fc(hn)

In [50]:
from itertools import islice
rnn = RNN(len(TEXT.vocab), 100, 30, 3)
# If you want to use a pre-trained word embedding vector, insert the following code.
# （事前学習済みの単語埋め込みベクトルを用いる場合は，以下のコードを挿入する．）
# rnn.embedding.weight.data.copy_(TEXT.vocab.vectors)
rnn.to(device)
optimizer = optim.SGD(rnn.parameters(), lr = 0.01)
for epoch in range(10):
    sumloss = 0.0
    # In an environment with sufficient computing resources, it is better to use all the data.
    # （計算資源が十分ある環境では，全てのデータを使う方が良い）
    #for data in trainloader:  （計算資源が十分ある環境では，全てのデータを使う方が良い）
    for data in islice(trainloader, 250): # Using only 250 batches
        x, y = data.text, data.label - 1
        optimizer.zero_grad()
        a = rnn(x)
        loss = F.cross_entropy(a, y)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
    print('epoch: {}, loss: {:.4f}'.format(epoch, sumloss))

epoch: 0, loss: 262.0990
epoch: 1, loss: 261.7577
epoch: 2, loss: 264.9352
epoch: 3, loss: 263.5806
epoch: 4, loss: 260.6746
epoch: 5, loss: 258.7201
epoch: 6, loss: 264.9055
epoch: 7, loss: 261.0733
epoch: 8, loss: 259.9581
epoch: 9, loss: 261.9754


In [51]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        x, y = data.text, data.label - 1
        a = rnn(x)
        pred_y = torch.argmax(a, dim=1)
        correct += (pred_y == y).sum().item()
        total += pred_y.size(0)

print(correct / total)

0.4104072398190045


In [52]:
dataiter = iter(testloader)

In [53]:
data = dataiter.__next__()
x, y = data.text, data.label
for x_i in x:
    print(' '.join(TEXT.vocab.itos[w] for w in x_i))
a = rnn(x)
pred_y = torch.argmax(a, dim=1)
print([LABEL.vocab.itos[yi + 1] for yi in pred_y])
print([LABEL.vocab.itos[yi] for yi in y])

immediately .
no. .
<unk> .
hopkins .
['positive', 'positive', 'positive', 'positive']
['neutral', 'negative', 'neutral', 'neutral']
