In [3]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset,
                             DataLoader,
                             TensorDataset)
import tqdm

In [4]:
import glob
import pathlib
import re

remove_marks_regex = re.compile("[,\.\(\)\[\]\*:;]|<.*?>")
shift_marks_regex = re.compile("([?!])")

def text2ids(text, vocab_dict):
    # !?以外の記号の削除
    text = remove_marks_regex.sub("", text)
    # !?と単語の間にスペースを挿入
    text = shift_marks_regex.sub(r"\1", text)
    tokens = text.split()
    return [vocab_dict.get(token, 0) for token in tokens]

def list2tensor(token_idxes, max_len=100, padding=True):
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes + [0] * (max_len - len(token_idxes))
    return torch.tensor(token_idxes, dtype=torch.int64), n_tokens

In [5]:
class IMDBDataset(Dataset):
    def __init__(self, dir_path, train=True,
                 max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding
        
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath("imdb.vocab")
        
        # ボキャブラリファイルを読み込み、行ごとに分割
        self.vocab_array = vocab_path.open() \
                            .read().strip().splitlines()
        # 単語をキーとし、値がIDのdictを作る
        self.vocab_dict = dict((w, i+1) \
            for (i, w) in enumerate(self.vocab_array))
    
        if train:
            target_path = path.joinpath("train")
        else:
            target_path = path.joinpath("test")
        pos_files = sorted(glob.glob(
            str(target_path.joinpath("pos/*.txt"))))
        neg_files = sorted(glob.glob(
            str(target_path.joinpath("neg/*.txt"))))
        # posは1, negは0のlabelを付けて
        # (file_path, label)のtupleのリストを作成
        self.labeled_files = \
            list(zip([0]*len(neg_files), neg_files )) + \
            list(zip([1]*len(pos_files), pos_files))
  
    @property
    def vocab_size(self):
        return len(self.vocab_array)

    def __len__(self):
        return len(self.labeled_files)

    def __getitem__(self, idx):
        label, f = self.labeled_files[idx]
        # ファイルのテキストデータを読み取って小文字に変換
        data = open(f).read().lower()
        # テキストデータをIDのリストに変換
        data = text2ids(data, self.vocab_dict)
        # IDのリストをTensorに変換
        data, n_tokens = list2tensor(data, self.max_len, self.padding)
        return data, label, n_tokens

In [6]:
"""
訓練用とテスト用のDataLoaderの作成
"""
train_data = IMDBDataset("ch5data/aclImdb/")
test_data = IMDBDataset("ch5data/aclImdb/", train=False)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4)

In [7]:
"""
ネットワークの定義
"""
class SequenceTaggingNet(nn.Module):
    def __init__(self, num_embeddings,
                 embedding_dim=50, 
                 hidden_size=50,
                 num_layers=1,
                 dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim,
                                padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_size, num_layers,
                            batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x, h0=None, l=None):
        # IDをEmbeddingで多次元のベクトルに変換する
        # xは(batch_size, step_size) 
        # -> (batch_size, step_size, embedding_dim)
        x = self.emb(x)
        # 初期状態h0と共にRNNにxを渡す
        # xは(batch_size, step_size, embedding_dim)
        # -> (batch_size, step_size, hidden_dim)
        x, h = self.lstm(x, h0)
        # 最後のステップのみ取り出す
        # xは(batch_size, step_size, hidden_dim)
        # -> (batch_size, 1)
        if l is not None:
            # 入力のもともとの長さがある場合はそれを使用する
            x = x[list(range(len(x))), l-1, :]
        else:
            # なければ単純に最後を使用する
            x = x[:, -1, :]
        # 取り出した最後のステップを線形層に入れる
        x = self.linear(x)
        # 余分な次元を削除する
        # (batch_size, 1) -> (batch_size, )
        x = x.squeeze()
        return x

In [8]:
"""
訓練の作成
"""
def eval_net(net, data_loader, device="cpu"):
    net.eval()
    ys = []
    ypreds = []
    for x, y, l in data_loader:
        x = x.to(device)
        y = y.to(device)
        l = l.to(device)
        with torch.no_grad():
            y_pred = net(x, l=l)
            y_pred = (y_pred > 0).long()
            ys.append(y)
            ypreds.append(y_pred)
        ys = torch.cat(ys)
        ypreds = torch.cat(ypreds)
        acc = (ys == ypreds).float().sum() / len(ys)
        return acc.item()

In [10]:
"""
評価の作成
"""
from statistics import mean

# num_embeddingsには0を含めてtrain_data.vocab_size+1を入れる
net = SequenceTaggingNet(train_data.vocab_size+1, num_layers=2)
net.to("cuda:0")
opt = optim.Adam(net.parameters())
loss_f = nn.BCEWithLogitsLoss()

for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        l = l.to("cuda:0")
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y.float())
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.item())
    train_acc = eval_net(net, train_loader, "cuda:0")
    val_acc = eval_net(net, test_loader, "cuda:0")
    print(epoch, mean(losses), train_acc, val_acc)

100%|██████████| 782/782 [00:31<00:00, 24.68it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

0 0.6750903708093307 0.5 0.9375


100%|██████████| 782/782 [00:29<00:00, 26.31it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

1 0.673973117033234 0.59375 0.53125


100%|██████████| 782/782 [00:29<00:00, 26.53it/s]
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f7824ed6be0>>
Traceback (most recent call last):
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connect

2 0.609384436703399 0.75 0.65625


100%|██████████| 782/782 [00:28<00:00, 27.58it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

3 0.45854991643934906 0.8125 0.8125


100%|██████████| 782/782 [00:27<00:00, 28.49it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

4 0.3570050961430878 0.90625 0.71875


100%|██████████| 782/782 [00:27<00:00, 28.81it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

5 0.2880880700524353 0.90625 0.625


100%|██████████| 782/782 [00:27<00:00, 28.19it/s]
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f77d7476dd8>>
Traceback (most recent call last):
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connect

6 0.23003383134217822 1.0 0.8125


100%|██████████| 782/782 [00:27<00:00, 28.17it/s]
  0%|          | 0/782 [00:00<?, ?it/s]

7 0.1802888982869742 0.96875 0.75


100%|██████████| 782/782 [00:27<00:00, 28.35it/s]
Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f7824ed6be0>>
Traceback (most recent call last):
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 349, in __del__
    self._shutdown_workers()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 328, in _shutdown_workers
    self.worker_result_queue.get()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/queues.py", line 337, in get
    return _ForkingPickler.loads(res)
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 70, in rebuild_storage_fd
    fd = df.detach()
  File "/home/melty0404/.conda/envs/pytorch/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connect

8 0.13917251949048484 1.0 0.8125


100%|██████████| 782/782 [00:27<00:00, 28.45it/s]


9 0.10593734282399993 0.96875 0.78125


In [13]:
"""
RNNを使用しないモデルの作成
"""
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression

train_X, train_y = load_svmlight_file("ch5data/aclImdb/train/labeledBow.feat")
test_X, test_y = load_svmlight_file("ch5data/aclImdb/test/labeledBow.feat", n_features=train_X.shape[1])

model = LogisticRegression(C=0.1, max_iter=1000)
model.fit(train_X, train_y)
model.score(train_X, train_y), model.score(test_X, test_y)



(0.89888, 0.39608)