In [None]:
%pylab inline

In [None]:
!pip install torchdata

In [None]:
!pip install portalocker

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [None]:
import torchtext
import torchtext.transforms as T

In [None]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Returns a list of substrings; an empty or whitespace-only string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

In [None]:
# SST2 is a dataset with positive/negative sentiment labels.
# （SST2は、正負の感情ラベルを持つデータセットである。）
# Label numbers mean: 0=negative, 1=positive

from torchtext.datasets import SST2


def _sst2_to_text_label(sample):
    """Convert an SST2 ``(text, label)`` sample to ``(text, str(label))``."""
    return sample[0], str(sample[1])


# A named function is used instead of a lambda because lambdas passed to
# datapipe ``map`` cannot be pickled (torch warns about this, and it prevents
# loading with worker processes).
train = SST2(split='train').map(_sst2_to_text_label)
test = SST2(split='dev').map(_sst2_to_text_label)

In [None]:
# Run the following code to use IMDb, which is a movie review dataset with positive/negative sentiment tags.
# SST2 yields (text, label) pairs, whereas IMDb yields (label, text) pairs, so we apply map to swap the fields and keep the stream format consistent.
# See the following site to check the structure of other datasets:
# （ポジティブ/ネガティブの感情タグ付きの映画のレビューデータセットであるIMDbを用いる場合は以下のコードを用いる．
# 　SST2は(text, label)のペアのストリームですが、IMDbは(label, text)のペアなので、ストリームの型を揃えるために、mapを適用します。
# 　他のデータセットの構造を確認したい場合は、以下のサイトを参照してください。）
# https://pytorch.org/text/stable/datasets.html

# from torchtext.datasets import IMDB
# train = IMDB(split='train').map(lambda x: (x[1], str(x[0])))
# test = IMDB(split='test').map(lambda x: (x[1], str(x[0])))

In [None]:
# Run the following code to use AG News, which is a news article dataset with genre tags.
# （以下のコードを実行すると、ジャンルタグ付きのニュース記事データセットであるAG Newsが利用できます。）
# Label numbers mean: 1=“World”, 2=“Sports”, 3=“Business”, 4=“Sci/Tech”

from torchtext.datasets import AG_NEWS


def _ag_news_to_text_label(sample):
    """Convert an AG News ``(label, text)`` sample to ``(text, str(label))``."""
    return sample[1], str(sample[0])


# A named function is used instead of a lambda because lambdas passed to
# datapipe ``map`` cannot be pickled (torch warns about this, and it prevents
# loading with worker processes).
train = AG_NEWS(split='train').map(_ag_news_to_text_label)
test = AG_NEWS(split='test').map(_ag_news_to_text_label)

In [None]:
# Run the following code to use your own dataset.
# This code assumes that you have train.tsv and test.tsv files in the data/my_text_dataset folder, 
# which contain tab-delimited pairs of "text" and "label" for each line.
# （自作のデータセットを用いる場合は，以下のコードを用いる．
# 　このコードでは，data/my_text_dataset フォルダに，タブ区切りで「テキスト」と「ラベル」を
# 　1行に1組ずつ列挙した train.tsv および test.tsv のファイルがあることを想定している．）

# import pandas as pd
# train = pd.read_table('./data/my_text_dataset/train.tsv', header=None).values
# test = pd.read_table('./data/my_text_dataset/test.tsv', header=None).values

In [None]:
from collections import Counter

# Frequency tables built in one pass over the training stream:
# token_counter counts every token, label_counter counts examples per label.
token_counter, label_counter = Counter(), Counter()
for text, label in train:
    token_counter.update(tokenize(text))
    label_counter.update([label])

In [None]:
# Show the ten most frequent training tokens as a quick sanity check of the tokenization.
token_counter.most_common(10)

In [None]:
# Show how many training examples were counted for each label.
label_counter

In [None]:
from torchtext.vocab import vocab
# Build the text vocabulary from the token counts, passing min_freq=2 and
# registering four special tokens: unknown, padding, begin- and end-of-sequence.
text_vocab = vocab(token_counter, min_freq=2, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [None]:
# To use a pre-trained word embedding vector, run the following code.
# （事前学習済みの単語埋め込みベクトルを用いる場合は，以下のコードを用いる．）

# glove = torchtext.vocab.GloVe()
# text_vocab = vocab(glove.stoi, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [None]:
# Register the index of '<unk>' as the vocabulary's default index, so that
# lookups of tokens missing from the vocabulary fall back to it.
text_vocab.set_default_index(text_vocab['<unk>'])

In [None]:
# Build the label vocabulary from the label counts. No special tokens are
# passed here and no default index is set for it in this notebook.
label_vocab = vocab(label_counter)

In [None]:
# Inspect the first 20 entries of the text vocabulary's string-to-index mapping,
# followed by the complete string-to-index mapping of the label vocabulary.
print(list(text_vocab.get_stoi().items())[:20])
print(label_vocab.get_stoi().items())

In [None]:
# Pipeline that turns a batch of token lists into one padded index tensor.
# The stages run in the order listed.
text_transform = T.Sequential(
    # Replace each token with its index in text_vocab.
    T.VocabTransform(text_vocab),
    # Keep at most 48 tokens so that, after adding <bos> and <eos> below,
    # a sequence is at most 50 items long.
    T.Truncate(50-2),
    # Prepend the <bos> index and append the <eos> index.
    T.AddToken(token=text_vocab['<bos>'], begin=True),
    T.AddToken(token=text_vocab['<eos>'], begin=False),
    # Convert to a tensor, using the <pad> index as the padding value.
    T.ToTensor(padding_value=text_vocab['<pad>'])
)

In [None]:
def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into a ``(texts, labels)`` pair.

    Each text is tokenized and the token lists are passed through
    ``text_transform``; each label is mapped to its index in ``label_vocab``
    and the indices are collected into a tensor.
    """
    token_lists = []
    label_ids = []
    for text, label in batch:
        token_lists.append(tokenize(text))
        label_ids.append(label_vocab[label])
    return text_transform(token_lists), torch.tensor(label_ids)

In [None]:
# Training batches are shuffled so the model sees the examples in a varying order.
trainloader = DataLoader(train, batch_size=4, shuffle=True, collate_fn=collate_batch)
# Evaluation gains nothing from shuffling; a fixed order keeps the evaluation
# pass and the per-batch inspection of test examples reproducible.
testloader = DataLoader(test, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [None]:
# Fetch one training batch and show it both as index tensors and as
# decoded tokens / label strings.
data = next(iter(trainloader))
x, y = data
print(x)
print(y)
# Convert the tensors to plain Python ints before the vocabulary lookups,
# which take integer indices.
for x_i, y_i in zip(x.tolist(), y.tolist()):
    print(text_vocab.lookup_tokens(x_i))
    print(label_vocab.lookup_token(y_i))

In [None]:
class RNN(nn.Module):
    """Text classifier: embedding -> single-layer LSTM -> linear layer.

    Args:
        input_dim: number of rows of the embedding table (vocabulary size).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output scores produced by the final linear layer.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # If you want to use a pre-trained word embedding vector, insert the following code.
        # self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)
        # The LSTM input size is read back from the embedding layer so it stays
        # correct if the embedding is swapped for a pre-trained one.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear layer's scores for the LSTM's final hidden state.

        Assumes ``x`` holds token indices shaped (batch, seq_len), matching
        ``batch_first=True`` - confirm against the caller.
        """
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        # The LSTM has one layer and one direction, so the leading dimension of
        # the final hidden state has size 1; taking its last entry drops it.
        return self.fc(final_hidden[-1])

In [None]:
from itertools import islice

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Embedding size 100, LSTM hidden size 30; one output score per label.
rnn = RNN(len(text_vocab), 100, 30, len(label_vocab)).to(device)
optimizer = optim.SGD(rnn.parameters(), lr=0.1)

for epoch in range(50):
    # Running total of the per-batch losses seen in this epoch.
    sumloss = 0.0
    # In an environment with sufficient computing resources, it is better to use all the data.
    # for inputs, targets in trainloader:  # Using all batches
    for inputs, targets in islice(trainloader, 250):  # Using only 250 batches
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = F.cross_entropy(rnn(inputs), targets)
        loss.backward()
        optimizer.step()
        sumloss += loss.item()
    print('epoch: {}, loss: {:.4f}'.format(epoch, sumloss))

In [None]:
# Switch the model to evaluation mode before inference. The layers used by
# this model behave the same in both modes, but this keeps the cell correct
# if dropout or normalization layers are added later.
rnn.eval()
ys = []
pred_ys = []
# Gradients are not needed for evaluation.
with torch.no_grad():
    for data in testloader:
        x = data[0].to(device)
        y = data[1].to(device)
        a = rnn(x)
        # The predicted class is the index of the highest score in each row.
        pred_y = torch.argmax(a, dim=1)
        ys += y.tolist()
        pred_ys += pred_y.tolist()

# Accuracy over the WHOLE test set, computed from the accumulated labels and
# predictions (not just the tensors left over from the last batch).
num_correct = 0
for y_i, pred_y_i in zip(ys, pred_ys):
    if y_i == pred_y_i:
        num_correct += 1
# Guard against an empty test set to avoid a ZeroDivisionError.
print(num_correct / len(ys) if ys else float('nan'))

In [None]:
from sklearn.metrics import classification_report
# Report per-class metrics over the collected test labels and predictions.
# Both lists hold label_vocab indices, so the classes in the report are those
# indices rather than the original label strings.
print(classification_report(ys, pred_ys))

In [None]:
# Create an iterator over the test loader so that batches can be pulled from it one at a time.
dataiter = iter(testloader)

In [None]:
# Pull the next test batch and print each example's tokens together with its
# true and predicted label. Raises StopIteration once the iterator is exhausted.
data = next(dataiter)
x = data[0].to(device)
y = data[1].to(device)
# Inference only: skip gradient tracking.
with torch.no_grad():
    a = rnn(x)
pred_y = torch.argmax(a, dim=1)
# Convert the tensors to plain Python ints before the vocabulary lookups,
# which take integer indices.
for x_i, y_i, pred_y_i in zip(x.tolist(), y.tolist(), pred_y.tolist()):
    print(' '.join(text_vocab.lookup_tokens(x_i)))
    print('true:' + label_vocab.lookup_token(y_i))
    print('pred:' + label_vocab.lookup_token(pred_y_i))