In [19]:
import torch
import torch.nn as nn
import gensim
import pandas as pd

# Load the pre-trained Word2Vec vectors with gensim. The file is read in the
# text (non-binary) word2vec format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/word2vec_ja/jawiki.word_vectors.100d.txt",
    binary=False,
)

# Copy the embedding matrix into a float32 tensor; it is later handed to the
# model's embedding layer as its initial weights.
weights = torch.tensor(word2vec_model.vectors, dtype=torch.float32)


class RNNModel(nn.Module):
    """Single-layer RNN classifier on top of a frozen, pre-trained embedding table.

    Args:
        vocab_size: Number of rows expected in ``weights``.
        embedding_dim: Number of columns expected in ``weights``; also the RNN
            input size.
        hidden_dim: Hidden size of the RNN.
        output_dim: Size of the final linear projection.
        weights: 2-D float tensor of shape ``(vocab_size, embedding_dim)`` used
            as the (non-trainable) embedding matrix.

    Raises:
        ValueError: If ``weights`` does not have shape
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, weights):
        super().__init__()

        expected_shape = (vocab_size, embedding_dim)
        if tuple(weights.shape) != expected_shape:
            raise ValueError(
                f"weights has shape {tuple(weights.shape)}, expected {expected_shape}"
            )

        # Public API for loading pre-trained vectors; freeze=True keeps the
        # embedding weights out of gradient updates (requires_grad=False).
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        # Recurrent layer; inputs/outputs are (batch, seq, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)

        # Projection from the selected RNN state to the output.
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths=None):
        """Compute the model output for a batch of token-ID sequences.

        Args:
            x: Integer tensor of token IDs, shape ``(batch, seq_len)``.
            lengths: Optional per-sequence count of real (non-padding) tokens,
                shape ``(batch,)``. When given, the RNN output at position
                ``lengths - 1`` is used for each sequence, so trailing padding
                does not influence the result. When omitted, the output at the
                final position is used (the original behaviour).

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(x)
        rnn_output, _ = self.rnn(embedded)

        if lengths is None:
            last_state = rnn_output[:, -1, :]
        else:
            # Index of the last real token per row; clamp so that a length of 0
            # falls back to the first position instead of wrapping to -1.
            last_index = torch.as_tensor(
                lengths, dtype=torch.long, device=rnn_output.device
            )
            last_index = (last_index - 1).clamp(min=0)
            batch_index = torch.arange(rnn_output.size(0), device=rnn_output.device)
            last_state = rnn_output[batch_index, last_index]

        return self.fc(last_state)


# Hyperparameters
vocab_size = len(word2vec_model.index_to_key)  # size of the Word2Vec vocabulary
# Take the embedding width from the loaded weight matrix instead of hard-coding
# it, so it cannot drift from the vectors file that was actually loaded.
embedding_dim = weights.shape[1]
hidden_dim = 32

RNNModel(
  (embedding): Embedding(751361, 100)
  (rnn): RNN(100, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)


In [12]:
import math
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import MeCab

# Module-level tokenizer used by text_to_sequence(). The option string is
# passed to MeCab verbatim; presumably "-O wakati" selects space-separated
# token output (the result is .split() by the caller) and "-d" selects the
# dictionary directory — confirm against the MeCab documentation.
# NOTE(review): the dictionary path is absolute and machine-specific.
mecab = MeCab.Tagger(
    "-O wakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
)


# Convert a text into a sequence of vocabulary IDs.
def text_to_sequence(text, word2vec_model):
    """Tokenize ``text`` with the module-level ``mecab`` tagger and look up IDs.

    The parsed output is split on whitespace and every token is looked up in
    ``word2vec_model.key_to_index``; tokens that are not present map to 0.

    Returns:
        A list with one looked-up index per token, in token order.
    """
    ids = []
    for token in mecab.parse(text).split():
        ids.append(word2vec_model.key_to_index.get(token, 0))
    return ids


# Dataset class
class TextDataset(Dataset):
    """Dataset of fixed-length token-ID tensors paired with float labels.

    Args:
        texts: Iterable of raw strings; each is converted once, up front, with
            ``text_to_sequence``.
        labels: Iterable of labels, consumed positionally in the same order as
            ``texts``.
        word2vec_model: Model forwarded to ``text_to_sequence`` for ID lookup.
        max_length: Length every returned ID sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, word2vec_model, max_length):
        self.texts = [text_to_sequence(text, word2vec_model) for text in texts]
        # Materialise labels positionally. Indexing a pandas Series with [] is
        # label-based, so a Series whose index is not 0..n-1 (e.g. after
        # filtering or shuffling) would raise or return the wrong row.
        self.labels = list(labels)
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` for sample ``idx``.

        ``ids`` is a ``torch.long`` tensor of exactly ``max_length`` entries:
        longer sequences are truncated, shorter ones are right-padded with 0.
        ``label`` is a ``torch.float`` tensor.
        """
        # Slicing copies, so the cached sequence in self.texts is never
        # modified by padding (the previous in-place += permanently extended it).
        ids = self.texts[idx][: self.max_length]
        ids = ids + [0] * (self.max_length - len(ids))
        return torch.tensor(ids, dtype=torch.long), torch.tensor(
            self.labels[idx], dtype=torch.float
        )

In [22]:
# Read the train / validation / test splits (tab-separated files).
train_df = pd.read_csv("../data/train.tsv", sep="\t")
valid_df = pd.read_csv("../data/valid.tsv", sep="\t")
test_df = pd.read_csv("../data/test.tsv", sep="\t")

# Longest tokenized poem across all three splits; used as the common
# sequence length for every dataset below.
max_length = max(
    map(
        lambda poem: len(text_to_sequence(poem, word2vec_model)),
        pd.concat([train_df["poem"], valid_df["poem"], test_df["poem"]]),
    )
)

# Wrap each split in a TextDataset.
train_dataset = TextDataset(
    texts=train_df["poem"],
    labels=train_df["label"],
    word2vec_model=word2vec_model,
    max_length=max_length,
)
valid_dataset = TextDataset(
    texts=valid_df["poem"],
    labels=valid_df["label"],
    word2vec_model=word2vec_model,
    max_length=max_length,
)
test_dataset = TextDataset(
    texts=test_df["poem"],
    labels=test_df["label"],
    word2vec_model=word2vec_model,
    max_length=max_length,
)

# Only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=2)
test_loader = DataLoader(dataset=test_dataset, batch_size=2)

# Model, loss function and optimizer.
model = RNNModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=1,  # a single output value per sample
    weights=weights,
)
loss_function = nn.BCEWithLogitsLoss()  # loss for binary classification
# Hand the optimizer only the parameters that still require gradients.
optimizer = optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=0.001,
)


# Training function
def train(model, iterator, optimizer, loss_function):
    """Run one optimisation pass over ``iterator`` and return the mean loss.

    Args:
        model: Module called as ``model(text)``; its output is squeezed on
            dim 1 before being passed to the loss.
        iterator: Any iterable of ``(text, label)`` tensor batches (it does not
            need to support ``len()``).
        optimizer: Optimizer stepped once per batch.
        loss_function: Callable ``loss_function(predictions, label)`` returning
            a scalar tensor; assumed to be the mean over the batch (the default
            reduction) so that weighting by batch size yields a per-sample mean.

    Returns:
        The loss averaged over all samples seen. Each batch loss is weighted by
        its batch size, so a smaller final batch is not over-weighted.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0

    for text, label in iterator:
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = loss_function(predictions, label)
        loss.backward()
        optimizer.step()

        batch_size = label.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

    return loss_sum / n_samples


# Evaluation function
def evaluate(model, iterator, loss_function):
    """Compute the mean loss of ``model`` over ``iterator`` without gradients.

    Args:
        model: Module called as ``model(text)``; its output is squeezed on
            dim 1 before being passed to the loss. It is switched to eval mode.
        iterator: Any iterable of ``(text, label)`` tensor batches (it does not
            need to support ``len()``).
        loss_function: Callable ``loss_function(predictions, label)`` returning
            a scalar tensor; assumed to be the mean over the batch (the default
            reduction) so that weighting by batch size yields a per-sample mean.

    Returns:
        The loss averaged over all samples seen. Each batch loss is weighted by
        its batch size, so a smaller final batch is not over-weighted.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for text, label in iterator:
            predictions = model(text).squeeze(1)
            loss = loss_function(predictions, label)

            batch_size = label.size(0)
            loss_sum += loss.item() * batch_size
            n_samples += batch_size

    return loss_sum / n_samples


# Training loop
EPOCHS = 100

for epoch in range(EPOCHS):
    # One optimisation pass over the training data, then a gradient-free pass
    # over the validation data.
    train_loss = train(model, train_loader, optimizer, loss_function)
    valid_loss = evaluate(model, valid_loader, loss_function)
    # NOTE(review): the "PPL" figures are simply exp(loss). With the binary
    # cross-entropy loss configured for this model, perplexity may not be a
    # meaningful metric — confirm this is intended.
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.3f}")

Epoch: 01
	Train Loss: 0.598 | Train PPL: 1.818
	 Val. Loss: 0.608 |  Val. PPL: 1.837
Epoch: 02
	Train Loss: 0.578 | Train PPL: 1.783
	 Val. Loss: 0.608 |  Val. PPL: 1.837
Epoch: 03
	Train Loss: 0.575 | Train PPL: 1.778
	 Val. Loss: 0.621 |  Val. PPL: 1.861
Epoch: 04
	Train Loss: 0.577 | Train PPL: 1.780
	 Val. Loss: 0.610 |  Val. PPL: 1.840
Epoch: 05
	Train Loss: 0.573 | Train PPL: 1.773
	 Val. Loss: 0.608 |  Val. PPL: 1.837
Epoch: 06
	Train Loss: 0.573 | Train PPL: 1.774
	 Val. Loss: 0.614 |  Val. PPL: 1.849
Epoch: 07
	Train Loss: 0.573 | Train PPL: 1.774
	 Val. Loss: 0.624 |  Val. PPL: 1.867
Epoch: 08
	Train Loss: 0.570 | Train PPL: 1.767
	 Val. Loss: 0.615 |  Val. PPL: 1.850
Epoch: 09
	Train Loss: 0.572 | Train PPL: 1.772
	 Val. Loss: 0.608 |  Val. PPL: 1.838
Epoch: 10
	Train Loss: 0.571 | Train PPL: 1.770
	 Val. Loss: 0.611 |  Val. PPL: 1.842
Epoch: 11
	Train Loss: 0.569 | Train PPL: 1.766
	 Val. Loss: 0.609 |  Val. PPL: 1.839
Epoch: 12
	Train Loss: 0.571 | Train PPL: 1.769
	 Val.

In [15]:
all_predictions = []

# Switch to inference mode explicitly instead of relying on whatever mode the
# model was left in by the last cell that happened to run.
model.eval()

with torch.no_grad():
    for inputs, _ in test_loader:  # "_" holds the labels, which are not needed here
        outputs = model(inputs)

        # Append one array per sample. These are the raw model outputs (logits,
        # given that training uses BCEWithLogitsLoss), not probabilities.
        all_predictions.extend(outputs.cpu().numpy())

In [18]:
# Display the collected test-set outputs (a list with one 1-element array per sample).
all_predictions

[array([1.3887568], dtype=float32),
 array([1.4108727], dtype=float32),
 array([1.3927056], dtype=float32),
 array([1.8779539], dtype=float32),
 array([1.7121079], dtype=float32),
 array([1.3825288], dtype=float32),
 array([1.3263783], dtype=float32),
 array([1.4227681], dtype=float32),
 array([1.3441978], dtype=float32),
 array([2.0596204], dtype=float32),
 array([1.8088042], dtype=float32),
 array([1.4110069], dtype=float32),
 array([1.2077798], dtype=float32),
 array([1.4284291], dtype=float32),
 array([1.5877194], dtype=float32),
 array([1.3353667], dtype=float32),
 array([1.0406523], dtype=float32),
 array([1.4238864], dtype=float32),
 array([2.2286155], dtype=float32),
 array([1.5552645], dtype=float32),
 array([0.8097642], dtype=float32),
 array([0.97168744], dtype=float32),
 array([0.57724035], dtype=float32),
 array([1.3819319], dtype=float32),
 array([1.4538323], dtype=float32),
 array([1.7465686], dtype=float32),
 array([1.3662088], dtype=float32),
 array([2.2249575], dtype=