In [1]:
%pylab inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [None]:
!pip install torchdata

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
import torchtext
import torchtext.transforms as T

In [4]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: the sentence to tokenize.

    Returns:
        A list of the whitespace-separated pieces of ``text``; an empty or
        all-whitespace string gives an empty list.
    """
    tokens = text.split()
    return tokens

In [5]:
# SST2 is a dataset of sentences with positive/negative sentiment labels
# (0 = negative, 1 = positive).
# Each (text, label) record is mapped so that the label becomes a string;
# the labels are later counted and used to build the label vocabulary.
# The 'dev' split serves as the test set.

from torchtext.datasets import SST2
train, test = (
    SST2(split=split_name).map(lambda x: (x[0], str(x[1])))
    for split_name in ('train', 'dev')
)



In [6]:
# Run the following code to use IMDb, which is a movie review dataset with positive/negative sentiment tags.
# While SST2 yields (text, label) pairs, IMDb yields (label, text) pairs, so we apply map to swap each pair into the (text, label) order.
# See the following site to check the structure of other datasets:
# （ポジティブ/ネガティブの感情タグ付きの映画のレビューデータセットであるIMDbを用いる場合は以下のコードを用いる．
# 　SST2は(text, label)のペアのストリームですが、IMDbは(label, text)のペアなので、ストリームの型を揃えるために、mapを適用します。
# 　他のデータセットの構造を確認したい場合は、以下のサイトを参照してください。）
# https://pytorch.org/text/stable/datasets.html

# from torchtext.datasets import IMDB
# train = IMDB(split='train').map(lambda x: (x[1], x[0]))
# test = IMDB(split='test').map(lambda x: (x[1], x[0]))

In [7]:
# Run the following code to use AG News, which is a news article dataset with genre tags.
# （以下のコードを実行すると、ジャンルタグ付きのニュース記事データセットであるAG Newsが利用できます。）
# Label numbers mean: 1=“World”, 2=“Sports”, 3=“Business”, 4=“Sci/Tech”

# from torchtext.datasets import AG_NEWS
# train = AG_NEWS(split='train').map(lambda x: (x[1], str(x[0])))
# test = AG_NEWS(split='test').map(lambda x: (x[1], str(x[0])))

In [8]:
# Run the following code to use your own dataset.
# This code assumes that you have train.tsv and test.tsv files in the data/my_text_dataset folder, 
# which contain tab-delimited pairs of "text" and "label" for each line.
# （自作のデータセットを用いる場合は，以下のコードを用いる．
# 　このコードでは，data/my_text_dataset フォルダに，タブ区切りで「テキスト」と「ラベル」を
# 　1行に1組ずつ列挙した train.tsv および test.tsv のファイルがあることを想定している．）

# import pandas as pd
# train = pd.read_table('./data/my_text_dataset/train.tsv', header=None).values
# test = pd.read_table('./data/my_text_dataset/test.tsv', header=None).values

In [9]:
from collections import Counter

# Single pass over the training data, collecting
#   - token frequencies (used to build the text vocabulary), and
#   - label frequencies (used to build the label vocabulary).
token_counter, label_counter = Counter(), Counter()
for text, label in train:
    token_counter.update(tokenize(text))
    label_counter.update([label])

In [10]:
token_counter.most_common(10)

[('the', 27205),
 (',', 25980),
 ('a', 21609),
 ('and', 19920),
 ('of', 17907),
 ('.', 12687),
 ('to', 12538),
 ("'s", 8764),
 ('is', 8685),
 ('that', 7759)]

In [11]:
label_counter

Counter({'0': 29780, '1': 37569})

In [12]:
from torchtext.vocab import vocab
# Build the text vocabulary from the training-set token counts.
# Tokens that occur fewer than 2 times are excluded (min_freq=2).
# The four special tokens are added as well; the sample batch printed further
# down shows <pad>=1, <bos>=2 and <eos>=3.
text_vocab = vocab(token_counter, min_freq=2, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [13]:
# To use a pre-trained word embedding vector, run the following code.
# （事前学習済みの単語埋め込みベクトルを用いる場合は，以下のコードを用いる．）

# glove = torchtext.vocab.GloVe()
# text_vocab = vocab(glove.stoi, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [14]:
# Tokens that are not in the vocabulary are looked up as '<unk>'.
text_vocab.set_default_index(text_vocab['<unk>'])

In [15]:
# Label vocabulary built from the labels counted in the training data;
# collate_batch uses it to turn each label string into an integer class index.
label_vocab = torchtext.vocab.vocab(label_counter)

In [16]:
# Maximum length of an encoded sentence, including the <bos> and <eos> markers.
MAX_SEQ_LEN = 50

# Pipeline applied to a batch of token lists:
#   1. map tokens to vocabulary indices,
#   2. truncate so that two positions remain free for <bos>/<eos>,
#   3. prepend <bos> and append <eos>,
#   4. convert to a tensor, padding shorter sequences with <pad>.
text_transform = T.Sequential(
    T.VocabTransform(text_vocab),
    T.Truncate(MAX_SEQ_LEN - 2),
    T.AddToken(token=text_vocab['<bos>'], begin=True),
    T.AddToken(token=text_vocab['<eos>'], begin=False),
    T.ToTensor(padding_value=text_vocab['<pad>'])
)

In [17]:
def collate_batch(batch):
    """Turn a list of raw (text, label) examples into a pair of tensors.

    Args:
        batch: iterable of (text, label) pairs as produced by the dataset.

    Returns:
        (texts, labels): ``texts`` is the output of ``text_transform`` applied
        to the tokenized sentences (a padded tensor of token indices), and
        ``labels`` is a 1-D tensor of label indices taken from ``label_vocab``.
    """
    token_lists = []
    label_ids = []
    for sentence, tag in batch:
        token_lists.append(tokenize(sentence))
        label_ids.append(label_vocab[tag])
    return text_transform(token_lists), torch.tensor(label_ids)

In [18]:
# Mini-batches of 4 examples; collate_batch converts the raw (text, label) pairs
# into a padded index tensor and a label tensor.
# Only the training data is shuffled. The test data keeps its order so that
# evaluation and the example inspection below are reproducible.
trainloader = DataLoader(train, batch_size=4, shuffle=True, collate_fn=collate_batch)
testloader = DataLoader(test, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [19]:
# Fetch one training mini-batch and show it both as raw tensors and decoded
# back into tokens / label strings, as a sanity check of the input pipeline.
data = next(iter(trainloader))
x, y = data
print(x)
print(y)
for x_i, y_i in zip(x, y):
    decoded_tokens = text_vocab.lookup_tokens(list(x_i))
    decoded_label = label_vocab.lookup_token(y_i)
    print(decoded_tokens)
    print(decoded_label)

tensor([[ 2,  4,  5,  6,  7,  8,  9, 10,  3,  1,  1,  1,  1,  1],
        [ 2, 11, 12, 13, 14, 15, 16, 17,  3,  1,  1,  1,  1,  1],
        [ 2, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,  3],
        [ 2, 30, 31, 32, 33, 34,  8, 35, 36,  3,  1,  1,  1,  1]])
tensor([0, 0, 1, 0])
['<bos>', 'hide', 'new', 'secretions', 'from', 'the', 'parental', 'units', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
0
['<bos>', 'contains', 'no', 'wit', ',', 'only', 'labored', 'gags', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
0
['<bos>', 'that', 'loves', 'its', 'characters', 'and', 'communicates', 'something', 'rather', 'beautiful', 'about', 'human', 'nature', '<eos>']
1
['<bos>', 'remains', 'utterly', 'satisfied', 'to', 'remain', 'the', 'same', 'throughout', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
0


In [20]:
class RNN(nn.Module):
    """Sentence classifier: embedding -> single-layer LSTM -> linear layer.

    Args:
        input_dim: vocabulary size (number of rows of the embedding table).
        embedding_dim: dimensionality of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of classes; the model returns one logit per class.
        padding_idx: optional index of the padding token. When given, trailing
            padding is skipped, so the classifier sees the hidden state after
            the last real token of every sequence rather than the state
            reached after the padding. When ``None`` (the default) the whole
            padded sequence is fed to the LSTM.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=padding_idx)
        # If you want to use a pre-trained word embedding vector, insert the following code.
        # self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: integer tensor of token indices. With ``padding_idx`` set it
               must have shape (batch, seq_len) and padding must only appear
               at the end of each row.

        Returns:
            Tensor of logits with ``output_dim`` entries per example.
        """
        embedded = self.embedding(x)
        if self.padding_idx is None:
            _, (hn, _) = self.lstm(embedded)
        else:
            # Number of non-padding tokens per row; clamp so that a row made
            # only of padding still has a valid (length 1) sequence.
            lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, (hn, _) = self.lstm(packed)
        # hn holds one final state per layer; the LSTM has a single layer, so
        # hn[-1] is that layer's final hidden state.
        return self.fc(hn[-1])

In [21]:
from itertools import islice

# Build the model (embedding size 100, hidden size 30) on the GPU when one is
# available, and optimise it with plain SGD.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
rnn = RNN(len(text_vocab), 100, 30, len(label_vocab))
rnn.to(device)
optimizer = optim.SGD(rnn.parameters(), lr = 0.1)

for epoch in range(50):
    # Sum of the per-batch cross-entropy losses seen in this epoch.
    sumloss = 0.0
    # In an environment with sufficient computing resources, it is better to use all the data:
    #for data in trainloader:  # Using all batches
    # NOTE(review): the recorded run drives this loss towards 0 while the recorded
    # test accuracy is about 0.52, which suggests overfitting to the small subset
    # used here - confirm by training on all batches.
    for data in islice(trainloader, 250): # Using only 250 batches
        x, y = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        a = rnn(x)
        loss = F.cross_entropy(a, y)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
    print(f'epoch: {epoch}, loss: {sumloss:.4f}')



epoch: 0, loss: 174.5874
epoch: 1, loss: 173.1876
epoch: 2, loss: 172.4343
epoch: 3, loss: 170.8540
epoch: 4, loss: 167.8444
epoch: 5, loss: 163.4136
epoch: 6, loss: 157.9567
epoch: 7, loss: 152.2603
epoch: 8, loss: 147.8479
epoch: 9, loss: 140.4318
epoch: 10, loss: 132.1370
epoch: 11, loss: 120.7141
epoch: 12, loss: 112.0275
epoch: 13, loss: 99.5499
epoch: 14, loss: 88.7921
epoch: 15, loss: 82.1006
epoch: 16, loss: 73.5971
epoch: 17, loss: 64.8628
epoch: 18, loss: 58.8344
epoch: 19, loss: 42.2226
epoch: 20, loss: 33.2559
epoch: 21, loss: 47.0197
epoch: 22, loss: 32.6064
epoch: 23, loss: 32.8828
epoch: 24, loss: 43.3440
epoch: 25, loss: 27.1222
epoch: 26, loss: 17.5141
epoch: 27, loss: 10.8909
epoch: 28, loss: 7.8283
epoch: 29, loss: 5.8629
epoch: 30, loss: 5.0388
epoch: 31, loss: 4.4577
epoch: 32, loss: 3.9687
epoch: 33, loss: 3.4488
epoch: 34, loss: 2.9562
epoch: 35, loss: 2.6984
epoch: 36, loss: 2.4928
epoch: 37, loss: 2.2985
epoch: 38, loss: 2.0502
epoch: 39, loss: 1.5529
epoch: 40

In [22]:
# Measure classification accuracy over the whole test set.
# Switch to evaluation mode and disable gradient tracking for inference.
rnn.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        x = data[0].to(device)
        y = data[1].to(device)
        a = rnn(x)
        # Predicted class = index of the largest logit.
        pred_y = torch.argmax(a, dim=1)
        correct += (pred_y == y).sum().item()
        total += pred_y.size(0)

print(correct / total)

0.5240825688073395


In [23]:
# Iterator over the test mini-batches; the next cell pulls one batch from it
# each time it is run.
dataiter = iter(testloader)

In [24]:
# Take the next test mini-batch and print, for every sentence, the decoded
# tokens together with the true and the predicted label.
data = next(dataiter)
x = data[0].to(device)
y = data[1].to(device)
# Inference only: no gradients are needed.
with torch.no_grad():
    a = rnn(x)
pred_y = torch.argmax(a, dim=1)
for x_i, y_i, pred_y_i in zip(x, y, pred_y):
    print(' '.join(text_vocab.lookup_tokens(list(x_i))))
    print('true:' + label_vocab.lookup_token(y_i))
    print('pred:' + label_vocab.lookup_token(pred_y_i))

<bos> it 's a charming and often affecting journey . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
true:1
pred:1
<bos> <unk> bleak and desperate <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
true:0
pred:1
<bos> allows us to hope that nolan is <unk> to <unk> a major career as a commercial yet inventive filmmaker . <eos>
true:1
pred:1
<bos> the acting , costumes , music , cinematography and sound are all astounding given the production 's <unk> locales . <eos>
true:1
pred:1
