In [3]:
import pandas as pd
import numpy as np
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
import MeCab

# MeCab tagger options: "-O wakati" selects the wakati output format and "-d" points
# at the mecab-ipadic-neologd dictionary directory.
mecab_options = "-O wakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
mecab = MeCab.Tagger(mecab_options)

In [5]:
# Load the train / validation / test splits (tab-separated) and display their row counts.
split_paths = ("../data/train.tsv", "../data/valid.tsv", "../data/test.tsv")
train_df, valid_df, test_df = [pd.read_csv(path, sep="\t") for path in split_paths]

tuple(len(frame) for frame in (train_df, valid_df, test_df))

(162, 54, 54)

In [6]:
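# Vectorize the `poem` column of train_df with a pretrained word2vec model loaded through gensim.
# Pretrained jawiki vectors: https://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
# The release actually used here: https://github.com/singletongue/WikiEntVec/releases
# gensim word2vec documentation: https://radimrehurek.com/gensim/models/word2vec.html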

In [7]:
# Load the pretrained embeddings with gensim (text, i.e. non-binary, word2vec format).
word2vec_path = "../data/word2vec_ja/jawiki.word_vectors.100d.txt"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path, binary=False
)

In [8]:
import numpy as np


def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything ``np.dot`` accepts; intended for 1-D input).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2))``. When either vector
        has zero norm the similarity is undefined; ``0.0`` is returned instead
        of dividing by zero (which would give ``nan`` and a RuntimeWarning).
    """
    norm_product = np.sqrt(np.dot(v1, v1) * np.dot(v2, v2))
    if norm_product == 0:
        # Reached for all-zero inputs, e.g. the fallback vector get_vector()
        # returns for a text without any in-vocabulary word.
        return 0.0
    return np.dot(v1, v2) / norm_product


# Sanity-check the embeddings: similarity of 東京 to two other place names and to みかん.
similarity_checks = (
    ("`東京`と`大阪`の類似度", "大阪"),
    ("`東京`と`静岡`の類似度", "静岡"),
    ("`東京`と`みかん`の類似度", "みかん"),
)
for caption, other_word in similarity_checks:
    print(caption, cos_sim(word2vec_model["東京"], word2vec_model[other_word]))

`東京`と`大阪`の類似度 0.849611
`東京`と`静岡`の類似度 0.7409164
`東京`と`みかん`の類似度 0.2301355


In [9]:
# Tokenize a poem with MeCab and embed it as the mean of its pretrained word vectors.
def get_vector(text):
    """Embed ``text`` as the average of its tokens' word2vec vectors.

    Args:
        text: String to embed. It is tokenized with the module-level ``mecab``
            tagger and the tagger output is split on whitespace.

    Returns:
        A 1-D array of length ``word2vec_model.vector_size``: the element-wise
        mean of the vectors of all tokens found in ``word2vec_model``, or an
        all-zero vector when no token is in the vocabulary.
    """
    words = mecab.parse(text).split()
    # Skip out-of-vocabulary tokens with an explicit membership test rather
    # than a bare ``except``, which would also hide unrelated errors.
    vectors = [word2vec_model[word] for word in words if word in word2vec_model]
    if not vectors:
        # Size the fallback from the model instead of hard-coding 100.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)


# Attach the sentence-embedding column to both the training and the test frame.
for split_df in (train_df, test_df):
    split_df["vector"] = split_df["poem"].map(get_vector)

In [10]:
# Fit a logistic regression on the averaged word vectors and score it on the test split.
train_features = np.stack(train_df["vector"].values)
test_features = np.stack(test_df["vector"].values)

model = LogisticRegression(random_state=1).fit(train_features, train_df["label"])
pred = model.predict(test_features)
accuracy_score(test_df["label"], pred)

0.8333333333333334

In [11]:
# Constant-prediction baseline: accuracy obtained by always predicting label 1.
always_one = [1 for _ in range(len(test_df))]
accuracy_score(test_df["label"], always_one)

0.7037037037037037

In [21]:
# Check whether this model gets right the examples that the word-based logistic regression got wrong.

test_features = np.stack(test_df["vector"].values)
test_df["pred"] = model.predict(test_features)

# Show only the test poems that contain the character 眼.
has_eye = test_df["poem"].apply(lambda poem: "眼" in poem)
test_df.loc[has_eye]

Unnamed: 0,poem,label,vector,pred
3,時は常に背後から迫り唸りを上げて眼前に流れ去る踏み止まれ時がお前を美しい世界へ押し流そうとど...,0,"[0.10835943, 0.0106099155, -0.12551703, -0.333...",0
15,ああおれたちは皆眼をあけたまま空を飛ぶ夢を見てるんだ,0,"[0.13821492, -0.0131085515, -0.31000143, -0.30...",1


In [18]:
# Per-word predictions of the regression model for the poem at index 15.
# The model does judge 眼 as BLEACH-like, but many of the other words lean toward LO.

# Tokenize once and keep only in-vocabulary tokens (get_vector also skips unknown
# tokens), so the vector lookup cannot raise KeyError and both columns stay aligned.
poem_words = [
    word
    for word in mecab.parse(test_df["poem"][15]).split()
    if word in word2vec_model
]

pd.DataFrame(
    {
        "word": poem_words,
        # Column 1 of predict_proba: probability of the model's second class.
        "pred": model.predict_proba(
            np.stack([word2vec_model[word] for word in poem_words])
        )[:, 1],
    }
)

Unnamed: 0,word,pred
0,ああ,0.787445
1,おれ,0.809368
2,たち,0.480314
3,は,0.805338
4,皆,0.088855
5,眼,0.043415
6,を,0.47753
7,あけ,0.627053
8,た,0.745257
9,まま,0.147882
