In [1]:
import pandas as pd
import numpy as np

# scikit-learnでロジスティック回帰を行う
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# pandasの表示列数を増やす
pd.set_option("display.max_columns", 100)

In [3]:
train_df = pd.read_csv("../data/train.tsv", sep="\t")
valid_df = pd.read_csv("../data/valid.tsv", sep="\t")
test_df = pd.read_csv("../data/test.tsv", sep="\t")

len(train_df), len(valid_df), len(test_df)

(162, 54, 54)

In [4]:
train_df.head()

Unnamed: 0,poem,label
0,誇りを一つ捨てるたび我等は獣に一歩近付く心を一つ殺すたび我等は獣から一歩遠退く,0
1,初恋は、歳上でした。,1
2,一緒に数えてくれるかい君についた僕の歯型を,0
3,まっ白いお米は、どろんこからできます。,1
4,"てごわい敵とバトルだ！""マスク少女萌え""とか言ってる場合じゃない!?",1


In [67]:
# train_dfのpoem_morphsをすべて使ってCountVectorizerを作成する
vectorizer = CountVectorizer(analyzer="char")
vectorizer.fit(train_df["poem"])
print(len(vectorizer.vocabulary_))
vectorizer.vocabulary_

556


{'誇': 485,
 'り': 103,
 'を': 108,
 '一': 148,
 'つ': 69,
 '捨': 318,
 'て': 70,
 'る': 104,
 'た': 65,
 'び': 83,
 '我': 308,
 '等': 433,
 'は': 79,
 '獣': 401,
 'に': 75,
 '歩': 354,
 '近': 501,
 '付': 160,
 'く': 51,
 '心': 289,
 '殺': 358,
 'す': 59,
 'か': 47,
 'ら': 102,
 '遠': 515,
 '退': 504,
 '初': 189,
 '恋': 294,
 '、': 36,
 '歳': 356,
 '上': 150,
 'で': 71,
 'し': 57,
 '。': 37,
 '緒': 444,
 '数': 325,
 'え': 45,
 'れ': 105,
 'い': 43,
 '君': 210,
 '僕': 177,
 'の': 78,
 '歯': 355,
 '型': 226,
 'ま': 91,
 'っ': 68,
 '白': 414,
 'お': 46,
 '米': 436,
 'ど': 73,
 'ろ': 106,
 'ん': 109,
 'こ': 54,
 'き': 49,
 'ご': 55,
 'わ': 107,
 '敵': 326,
 'と': 72,
 'バ': 131,
 'ト': 127,
 'ル': 141,
 'だ': 66,
 '！': 549,
 '"': 2,
 'マ': 134,
 'ス': 123,
 'ク': 116,
 '少': 269,
 '女': 242,
 '萌': 470,
 '言': 482,
 '場': 229,
 '合': 205,
 'じ': 58,
 'ゃ': 96,
 'な': 74,
 '!': 1,
 '?': 9,
 'カ': 114,
 'ワ': 144,
 'イ': 111,
 '地': 225,
 '獄': 400,
 'へ': 87,
 'よ': 101,
 'う': 44,
 'そ': 63,
 '♥': 35,
 '々': 38,
 'が': 48,
 '岩': 272,
 '壁': 231,
 '花': 467,
 '美': 451,
 '思': 

In [68]:
X_train = vectorizer.transform(train_df["poem"])
X_valid = vectorizer.transform(valid_df["poem"])
X_test = vectorizer.transform(test_df["poem"])

In [69]:
X_train = vectorizer.transform(train_df["poem"])
X_train.toarray()[0, :]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [70]:
# ロジスティック回帰を行う
model = LogisticRegression(random_state=1)
model.fit(X_train, train_df["label"])
pred = model.predict(X_test)
accuracy_score(test_df["label"], pred)

0.8518518518518519

In [71]:
accuracy_score(test_df["label"], [1] * len(test_df))

0.7037037037037037

In [80]:
print("`子`の回帰係数", model.coef_[0][vectorizer.vocabulary_["子"]])
print("`ロ(カタカナ)`の回帰係数", model.coef_[0][vectorizer.vocabulary_["ロ"]])
print("`死`の回帰係数", model.coef_[0][vectorizer.vocabulary_["死"]])
print("`血`の回帰係数", model.coef_[0][vectorizer.vocabulary_["血"]])

`子`の回帰係数 0.37441579768804206
`ロ(カタカナ)`の回帰係数 0.24967837793221417
`死`の回帰係数 -0.44132817477868375
`血`の回帰係数 -0.19845319026575342


In [78]:
# vectorizer.vocabulary_の語彙とモデルの係数を対応させたlistを作る
vocab_coef = []
for word, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x: x[1]):
    coef = model.coef_[0][idx]
    vocab_coef.append((word, f"{coef:.4f}"))

vocab_coef

[(' ', '0.1109'),
 ('!', '0.0892'),
 ('"', '0.1063'),
 ('-', '0.1424'),
 ('.', '0.0041'),
 ('0', '0.0233'),
 ('3', '0.0233'),
 ('7', '0.0418'),
 ('9', '0.0193'),
 ('?', '0.0531'),
 ('a', '0.1756'),
 ('b', '0.0331'),
 ('c', '0.0309'),
 ('e', '0.1116'),
 ('f', '0.1043'),
 ('g', '0.1080'),
 ('h', '0.1676'),
 ('i', '0.1952'),
 ('j', '0.0011'),
 ('k', '0.0442'),
 ('l', '0.1042'),
 ('m', '0.0309'),
 ('n', '0.0722'),
 ('o', '0.1380'),
 ('p', '0.0248'),
 ('r', '0.0997'),
 ('s', '0.0495'),
 ('t', '0.1281'),
 ('u', '0.0258'),
 ('v', '0.0237'),
 ('w', '0.0372'),
 ('y', '0.0360'),
 ('…', '0.0008'),
 ('○', '0.1014'),
 ('☆', '0.0733'),
 ('♥', '0.0786'),
 ('、', '-0.2658'),
 ('。', '1.9766'),
 ('々', '-0.3897'),
 ('「', '0.3157'),
 ('」', '0.3157'),
 ('ぁ', '0.0118'),
 ('あ', '0.0984'),
 ('い', '0.1741'),
 ('う', '-0.1024'),
 ('え', '-0.1481'),
 ('お', '0.0316'),
 ('か', '0.1523'),
 ('が', '0.3098'),
 ('き', '-0.0045'),
 ('ぎ', '0.1355'),
 ('く', '-0.1377'),
 ('け', '0.4201'),
 ('げ', '0.0191'),
 ('こ', '-0.0373'),
 ('

In [10]:
# X_trainを、語彙をカラム名とするデータフレームに変換する
X_train_df = pd.DataFrame(X_train.toarray(), columns=vocab_coef)
# X_train_df["poem"] = train_df["poem"].tolist()

X_valid_df = pd.DataFrame(X_valid.toarray(), columns=vocab_coef)
# X_valid_df["poem"] = valid_df["poem"].tolist()


X_test_df = pd.DataFrame(X_test.toarray(), columns=vocab_coef)
# X_test_df["poem"] = test_df["poem"].tolist()

X_train_df.head()

Unnamed: 0,"( , 0.1109)","(!, 0.0892)","("", 0.1063)","(-, 0.1424)","(., 0.0041)","(0, 0.0233)","(3, 0.0233)","(7, 0.0418)","(9, 0.0193)","(?, 0.0531)","(a, 0.1756)","(b, 0.0331)","(c, 0.0309)","(e, 0.1116)","(f, 0.1043)","(g, 0.1080)","(h, 0.1676)","(i, 0.1952)","(j, 0.0011)","(k, 0.0442)","(l, 0.1042)","(m, 0.0309)","(n, 0.0722)","(o, 0.1380)","(p, 0.0248)","(r, 0.0997)","(s, 0.0495)","(t, 0.1281)","(u, 0.0258)","(v, 0.0237)","(w, 0.0372)","(y, 0.0360)","(…, 0.0008)","(○, 0.1014)","(☆, 0.0733)","(♥, 0.0786)","(、, -0.2658)","(。, 1.9766)","(々, -0.3897)","(「, 0.3157)","(」, 0.3157)","(ぁ, 0.0118)","(あ, 0.0984)","(い, 0.1741)","(う, -0.1024)","(え, -0.1481)","(お, 0.0316)","(か, 0.1523)","(が, 0.3098)","(き, -0.0045)",...,"(速, 0.0913)","(連, 0.0306)","(逸, -0.1561)","(遅, 0.1586)","(運, -0.0041)","(過, 0.0787)","(達, 0.0866)","(違, -0.0114)","(遙, -0.0117)","(遠, -0.0054)","(那, 0.0017)","(醜, 0.0508)","(重, 0.0377)","(針, -0.0094)","(鎧, -0.0117)","(開, 0.0454)","(間, 0.0166)","(陽, -0.1605)","(雀, -0.0013)","(離, -0.0167)","(雨, 0.0340)","(雪, 0.0574)","(雷, -0.0593)","(青, 0.0364)","(静, 0.0118)","(非, 0.0454)","(靴, -0.0499)","(音, 0.0913)","(飛, -0.0000)","(食, -0.0001)","(飾, 0.0000)","(館, -0.1115)","(駆, -0.0117)","(験, 0.0363)","(驕, -0.0165)","(骨, -0.0838)","(高, 0.0460)","(髪, 0.0306)","(魔, -0.2066)","(鳴, -0.1115)","(鴉, -0.1115)","(黒, -0.3259)","(黙, 0.0091)","(！, 0.2710)","(（, 0.3142)","(）, 0.2410)","(？, -0.0515)","(ｨ, 0.0118)","(ﾘ, 0.0118)","(ﾜ, 0.0118)"
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [24]:
# X_valid_dfのpred==0の行を抽出し、いずれかの行で非ゼロの値が含まれるカラムを抽出し、そのカラムにしぼってX_valid_dfを表示する
X_valid_df[pred == 0].loc[:, (X_valid_df[pred == 0] != 0).any(axis=0)]

Unnamed: 0,"(!, 0.0908)","(-, 0.1432)","(., 0.0055)","(a, 0.1807)","(b, 0.0332)","(c, 0.0353)","(e, 0.1132)","(f, 0.1123)","(h, 0.1686)","(i, 0.2026)","(k, 0.0434)","(l, 0.1120)","(o, 0.1439)","(s, 0.0490)","(t, 0.1310)","(u, 0.0263)","(、, -0.2666)","(。, 1.9769)","(々, -0.3897)","(あ, 0.0992)","(い, 0.1736)","(う, -0.1031)","(か, 0.1522)","(が, 0.3100)","(き, -0.0053)","(ぎ, 0.1355)","(く, -0.1381)","(け, 0.4191)","(こ, -0.0378)","(さ, -0.0241)","(し, -0.0520)","(じ, -0.1267)","(す, -0.6759)","(ず, -0.2023)","(せ, -0.0862)","(そ, -0.3929)","(た, -0.0136)","(だ, -0.1405)","(ち, -0.1300)","(っ, 0.3790)","(つ, -0.1646)","(て, -0.4256)","(で, 0.8774)","(と, -0.3890)","(な, 0.1747)","(に, 0.0060)","(ぬ, -0.3468)","(の, -0.4020)","(は, -0.5254)","(ば, -0.3595)",...,"(わ, 0.1178)","(を, -0.7018)","(ん, 0.5522)","(ー, 0.0122)","(上, -0.0002)","(世, 0.0354)","(何, 0.1836)","(供, 0.2222)","(全, 0.1598)","(出, 0.0134)","(前, 0.0860)","(君, -0.2404)","(在, -0.0180)","(地, 0.1948)","(夏, 0.2909)","(夢, 0.0275)","(大, 0.0886)","(太, -0.3028)","(好, 0.0591)","(始, 0.0121)","(子, 0.3740)","(届, 0.1372)","(後, 0.0165)","(心, -0.0872)","(思, 0.1008)","(恋, 0.0989)","(恐, 0.1518)","(我, -0.4983)","(星, -0.0499)","(最, 0.0889)","(死, -0.4412)","(海, -0.1227)","(生, 0.1874)","(界, 0.1050)","(皆, -0.1734)","(知, -0.0616)","(私, 0.2660)","(空, 0.0595)","(終, 0.0187)","(続, 0.2253)","(者, -0.0329)","(裂, -0.0188)","(見, 0.4424)","(足, -0.0028)","(輝, -0.0001)","(達, 0.0865)","(遠, -0.0054)","(陽, -0.1609)","(雨, 0.0340)","(！, 0.2708)"
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,1,2,0,0,2,0,2,0,0,0,0,1,3,2,0,1,0,0,1,3,2,2,0,2,0,0,...,2,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0
12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,1,0,0,2,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,4,2,2,0,0,...,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,3,0,2,1,0,...,0,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [84]:
# 語彙的なカバレッジ
test_df["poem"].map(
    lambda x: np.mean([(t in vectorizer.vocabulary_.keys()) for t in x])
).mean()

0.8625163131059695

In [97]:
test_df["poem"][20]

'魂燃え立つ天の降るとも'

In [94]:
# '魂燃え立つ天の降るとも'のうちtrainに含まれる文字だけを抽出

s = "".join([t for t in test_df["poem"][20] if t in vectorizer.vocabulary_.keys()])
s

'え立つ天のるとも'

In [96]:
for t in s:
    print(t, model.coef_[0][vectorizer.vocabulary_[t]])

え -0.14814421217860155
立 -0.18468311395408235
つ -0.16432817566780536
天 -0.10888103546038971
の -0.4025723292603906
る -0.7633578793703615
と -0.38874295896748473
も -0.41794943611905827


In [85]:
test_df["pred"] = model.predict(X_test)
test_df

Unnamed: 0,poem,label,pred
0,船と君と僕の、静かな生活。,1,1
1,ぼくたちはひかれあう水滴のように惑星のようにぼくたちは反発しあう磁石のように肌の色のように,0,0
2,今度の終末はどこへ行く？,1,1
3,時は常に背後から迫り唸りを上げて眼前に流れ去る踏み止まれ時がお前を美しい世界へ押し流そうとど...,0,0
4,「言いたいこと、全部忘れた。」,1,1
5,らしくないね。きれいな私。,1,1
6,NO MORE UNHAPPY TV-正月番組なんていらない-,1,1
7,LOっぽい？,1,1
8,世界一嫌いだと言ってくれ,0,1
9,この秋ふたりが友達だったことを【保存】しました,1,1
