In [29]:
import random
import numpy as np
import torch


def torch_fix_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), then asks PyTorch/cuDNN to use
    deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.

    Note:
        With deterministic algorithms enabled, PyTorch raises a
        ``RuntimeError`` when an operation has no deterministic
        implementation. Some CUDA operations additionally require the
        ``CUBLAS_WORKSPACE_CONFIG`` environment variable to be set (see the
        PyTorch reproducibility notes).
    """
    # Python random
    random.seed(seed)
    # NumPy
    np.random.seed(seed)
    # PyTorch (CPU generator and every CUDA device; a no-op without CUDA)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
    # This is a function and must be *called*; assigning ``True`` to it would
    # only overwrite the attribute and leave deterministic mode disabled.
    torch.use_deterministic_algorithms(True)


# Fix all RNG seeds once, before any model or data loader is created.
SEED = 42
torch_fix_seed(SEED)

In [None]:
import gensim
import pandas as pd

# Load the pretrained Word2Vec vectors (word2vec text format) with gensim.
WORD2VEC_PATH = "../data/word2vec_ja/jawiki.word_vectors.100d.txt"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH,
    binary=False,
)

# Embedding matrix of the loaded model as a float tensor.
weights = torch.FloatTensor(word2vec_model.vectors)

In [30]:
import torch.nn as nn


class SelfAttentionModel(nn.Module):
    """Sequence classifier built on frozen pretrained embeddings.

    Pipeline: embedding lookup -> single-head self-attention -> residual
    connection -> mean over the sequence axis -> linear layer.

    Args:
        vocab_size: Number of rows expected in the embedding matrix.
        embedding_dim: Width of each embedding vector; also the attention
            embedding size and the input size of the linear layer.
        hidden_dim: Accepted for interface compatibility; not used by any
            layer of this model.
        output_dim: Number of output units of the final linear layer.
        weights: Float tensor of shape ``(vocab_size, embedding_dim)`` holding
            the pretrained embeddings, or ``None`` to use a randomly
            initialised (and still frozen) embedding table.

    Raises:
        ValueError: If ``weights`` is given and its shape is not
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, weights):
        super().__init__()

        # Embedding layer with frozen weights.
        if weights is None:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.embedding.weight.requires_grad = False
        else:
            if tuple(weights.shape) != (vocab_size, embedding_dim):
                raise ValueError(
                    "weights must have shape (vocab_size, embedding_dim) = "
                    f"({vocab_size}, {embedding_dim}), got {tuple(weights.shape)}"
                )
            # Public API for loading pretrained vectors; freeze=True sets
            # requires_grad=False on the embedding weight.
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        # Single-head self-attention over (batch, seq, feature) inputs.
        self.self_attention = nn.MultiheadAttention(
            embed_dim=embedding_dim, num_heads=1, batch_first=True
        )

        # Linear output layer.
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        """Return one row of ``output_dim`` values per input sequence.

        Args:
            x: Integer tensor of token IDs, indexed as (batch, seq).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.embedding(x)

        # batch_first=True, so query/key/value are (batch, seq, embedding_dim).
        # No key_padding_mask is passed: every position, including padding,
        # takes part in the attention and in the mean below.
        attention_output, _ = self.self_attention(embedded, embedded, embedded)

        # Residual connection, then average over the sequence axis (dim=1)
        # to obtain a single vector per sequence.
        attention_output_mean = (attention_output + embedded).mean(dim=1)

        # Project the pooled vector to the output dimension.
        output = self.fc(attention_output_mean)
        return output


# Hyper-parameters
vocab_size = len(word2vec_model.index_to_key)  # vocabulary size of the Word2Vec model
# Take the embedding width from the loaded vectors instead of hard-coding it,
# so it always matches the shape of the pretrained weight matrix.
embedding_dim = word2vec_model.vector_size
hidden_dim = 32

In [31]:
import math
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import MeCab

# MeCab tagger options: "wakati" output format and the NEologd dictionary.
MECAB_OPTIONS = "-O wakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger(MECAB_OPTIONS)


# Convert a text into a sequence of vocabulary IDs
def text_to_sequence(text, word2vec_model):
    """Tokenise ``text`` with MeCab and map each token to its vocabulary index.

    Args:
        text: Raw input string.
        word2vec_model: Object exposing a ``key_to_index`` mapping from token
            to integer index.

    Returns:
        List of integer IDs, one per whitespace-separated token of the MeCab
        output. Tokens missing from ``key_to_index`` are mapped to 0.
    """
    token_ids = []
    for token in mecab.parse(text).split():
        token_ids.append(word2vec_model.key_to_index.get(token, 0))
    return token_ids


# Dataset class
class TextDataset(Dataset):
    """Map-style dataset yielding fixed-length token-ID tensors and float labels.

    Args:
        texts: Iterable of raw strings; each is tokenised once in ``__init__``
            via ``text_to_sequence``.
        labels: Iterable of numeric labels, aligned with ``texts`` by position.
        word2vec_model: Vocabulary object forwarded to ``text_to_sequence``.
        max_length: Length every returned sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, word2vec_model, max_length):
        self.texts = [text_to_sequence(text, word2vec_model) for text in texts]
        # Store labels as a plain list so that __getitem__ indexes them by
        # position. Indexing a pandas Series with an int is label-based and
        # fails when the Series does not carry a default 0..n-1 index.
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} != {len(self.labels)}"
            )
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Truncate first (slicing returns a new list), then right-pad with 0.
        # Building a fresh list keeps the stored token list untouched instead
        # of extending it in place on first access.
        tokens = self.texts[idx][: self.max_length]
        padded = tokens + [0] * (self.max_length - len(tokens))
        return torch.tensor(padded, dtype=torch.long), torch.tensor(
            self.labels[idx], dtype=torch.float
        )

In [32]:
# Read the train / validation / test splits (tab-separated files).
train_df = pd.read_csv("../data/train.tsv", sep="\t")
valid_df = pd.read_csv("../data/valid.tsv", sep="\t")
test_df = pd.read_csv("../data/test.tsv", sep="\t")

# Longest tokenised poem across all three splits; used as the common
# padding / truncation length.
all_poems = pd.concat([train_df["poem"], valid_df["poem"], test_df["poem"]])
max_length = max(len(text_to_sequence(poem, word2vec_model)) for poem in all_poems)

# Build one dataset and one loader per split.
BATCH_SIZE = 2


def _make_dataset(frame):
    """Wrap the "poem" / "label" columns of ``frame`` in a TextDataset."""
    return TextDataset(frame["poem"], frame["label"], word2vec_model, max_length)


train_dataset = _make_dataset(train_df)
valid_dataset = _make_dataset(valid_df)
test_dataset = _make_dataset(test_df)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model, loss function and optimizer.
model = SelfAttentionModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=1,  # a single output value per sample
    weights=weights,
)
loss_function = nn.BCEWithLogitsLoss()  # loss for binary classification
# Hand only the trainable parameters to the optimizer (the embedding is frozen).
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)


# Training function
def train(model, iterator, optimizer, loss_function):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called on each batch of inputs.
        iterator: Sized iterable yielding ``(text, label)`` batches.
        optimizer: Optimizer stepped once per batch.
        loss_function: Callable taking ``(predictions, label)``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []

    for batch_text, batch_label in iterator:
        optimizer.zero_grad()
        # Drop the trailing size-1 axis so predictions line up with the labels.
        logits = model(batch_text).squeeze(1)
        batch_loss = loss_function(logits, batch_label)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(iterator)


# Evaluation function
def evaluate(model, iterator, loss_function):
    """Compute the mean per-batch loss over ``iterator`` without gradients.

    Args:
        model: Module called on each batch of inputs; switched to eval mode.
        iterator: Sized iterable yielding ``(text, label)`` batches.
        loss_function: Callable taking ``(predictions, label)``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    with torch.no_grad():
        running_loss = sum(
            loss_function(model(batch_text).squeeze(1), batch_label).item()
            for batch_text, batch_label in iterator
        )

    return running_loss / len(iterator)


# Training loop
EPOCHS = 100


def _report_epoch(epoch, train_loss, valid_loss):
    """Print the 1-based epoch number, both losses and exp(loss) for each."""
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):.3f}")


for epoch in range(EPOCHS):
    # One optimisation pass over the training set, then a validation pass.
    train_loss = train(model, train_loader, optimizer, loss_function)
    valid_loss = evaluate(model, valid_loader, loss_function)
    _report_epoch(epoch, train_loss, valid_loss)

Epoch: 01
	Train Loss: 0.563 | Train PPL: 1.756
	 Val. Loss: 0.515 |  Val. PPL: 1.673
Epoch: 02
	Train Loss: 0.321 | Train PPL: 1.378
	 Val. Loss: 0.599 |  Val. PPL: 1.821
Epoch: 03
	Train Loss: 0.292 | Train PPL: 1.339
	 Val. Loss: 0.350 |  Val. PPL: 1.419
Epoch: 04
	Train Loss: 0.189 | Train PPL: 1.208
	 Val. Loss: 0.213 |  Val. PPL: 1.237
Epoch: 05
	Train Loss: 0.200 | Train PPL: 1.222
	 Val. Loss: 0.241 |  Val. PPL: 1.272
Epoch: 06
	Train Loss: 0.188 | Train PPL: 1.207
	 Val. Loss: 0.200 |  Val. PPL: 1.222
Epoch: 07
	Train Loss: 0.137 | Train PPL: 1.146
	 Val. Loss: 0.232 |  Val. PPL: 1.261
Epoch: 08
	Train Loss: 0.167 | Train PPL: 1.182
	 Val. Loss: 0.296 |  Val. PPL: 1.345
Epoch: 09
	Train Loss: 0.149 | Train PPL: 1.161
	 Val. Loss: 0.220 |  Val. PPL: 1.247
Epoch: 10
	Train Loss: 0.147 | Train PPL: 1.158
	 Val. Loss: 0.220 |  Val. PPL: 1.246
Epoch: 11
	Train Loss: 0.114 | Train PPL: 1.121
	 Val. Loss: 0.221 |  Val. PPL: 1.248
Epoch: 12
	Train Loss: 0.112 | Train PPL: 1.119
	 Val.

In [33]:
all_predictions = []

# Put the model in evaluation mode explicitly, so this cell does not depend on
# whichever of train() / evaluate() happened to run last.
model.eval()

with torch.no_grad():
    for inputs, _ in test_loader:  # labels are not needed for prediction
        outputs = model(inputs)

        # Collect the per-sample outputs of this batch as NumPy arrays.
        all_predictions.extend(outputs.cpu().numpy())

In [34]:
import numpy as np
from sklearn.metrics import accuracy_score

# Flatten the collected model outputs into a 1-D array and threshold at 0:
# the model is trained with BCEWithLogitsLoss, so an output above 0 is a
# sigmoid probability above 0.5.
logits = np.stack(all_predictions).flatten()
pred = (logits > 0).astype(float)
accuracy_score(test_df["label"], pred)

0.8333333333333334