In [1]:
# ライブラリとloggingの設定
import MeCab
import argparse
import codecs
import evaluate
import glove
import logging
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hyperparameters for the GloVe run, exposed as argparse options.
# Every option is an integer, so they are declared as (flag, default, help)
# rows and registered in a single loop.
parser = argparse.ArgumentParser(
    description="Build a GloVe vector-space model from the provided corpus."
)
for option_flag, option_default, option_help in (
    (
        '--window_size',
        10,
        "Number of context words to track to left and right of each word.",
    ),
    (
        '--min_count',
        10,
        "Discard cooccurrence pairs where at "
        "least one of the words occurs fewer "
        "than this many times in the training corpus",
    ),
    (
        '--embedding_size',
        100,
        "Dimensionality of output word vectors",
    ),
    (
        '--iterations',
        25,
        "Number of training iterations.",
    ),
):
    parser.add_argument(
        option_flag, default=option_default, type=int, help=option_help
    )

# An explicit empty argument list is parsed so that `args` carries the
# defaults above instead of whatever happens to be in sys.argv.
args = parser.parse_args(args=[])


# Log everything from DEBUG upwards as "<timestamp>\t<message>".
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s\t%(message)s")
logger = logging.getLogger("glove")

In [2]:
# Load the pre-tokenized corpus: one sentence per line, words separated by
# single spaces.
with open('suihanki_wakati.txt', mode='r', encoding='utf-8') as f:
    raw_text = f.read()

# Build a 2-D list where corpus[i][j] is the j-th word of the i-th sentence.
#
# Differences from a naive character-by-character scan:
#   * the final line is kept even when the file does not end with '\n';
#   * empty tokens (produced by a trailing space before the newline or by
#     consecutive spaces) are discarded so that '' never enters the vocabulary;
#   * lines that contain no tokens at all are skipped.
# Splitting is done on ' ' and '\n' only, so other whitespace characters
# (e.g. a full-width space) are preserved inside tokens as before.
corpus = [
    [word for word in line.split(' ') if word]
    for line in raw_text.split('\n')
]
corpus = [sentence for sentence in corpus if sentence]

In [3]:
# Build the word dictionary from the tokenized corpus.
vocab = glove.build_vocab(corpus)
logger.info("Vocab has %i elements.\n", len(vocab))

# Collect co-occurrence pairs using the window size and minimum count
# configured in `args`.
logger.info("Fetching cooccurrence list..")
cooccur = glove.build_cooccur(
    vocab, corpus, window_size=args.window_size, min_count=args.min_count
)
logger.info("Cooccurrence list fetch complete (%i pairs).\n", len(cooccur))

# Lookup table derived from the vocabulary; presumably maps word ids back to
# words, judging by the helper's name -- confirm against evaluate.make_id2word.
id2word = evaluate.make_id2word(vocab)

2021-01-26 22:56:43,886	Building vocab from corpus
2021-01-26 22:56:43,927	Done building vocab from corpus.
2021-01-26 22:56:43,934	Vocab has 1918 elements.

2021-01-26 22:56:43,940	Fetching cooccurrence list..
2021-01-26 22:56:43,960	Building cooccurrence matrix: on line 0
2021-01-26 22:56:46,588	Building cooccurrence matrix: on line 1000
2021-01-26 22:56:50,150	Building cooccurrence matrix: on line 2000
2021-01-26 22:56:51,985	Cooccurrence list fetch complete (22905 pairs).



In [4]:
# Training: fit GloVe on the co-occurrence data, then pass the trained
# parameters through evaluate.merge_main_context to obtain the final matrix W.
# NOTE(review): no random seed is set anywhere in this notebook; if
# glove.train_glove initializes or shuffles randomly, results will differ
# between runs -- confirm and seed if reproducibility matters.
logger.info("Beginning GloVe training..")
W = evaluate.merge_main_context(
    glove.train_glove(
        vocab,
        cooccur,
        vector_size=args.embedding_size,
        iterations=args.iterations,
    )
)

2021-01-26 22:56:52,010	Beginning GloVe training..
2021-01-26 22:56:52,199		Beginning iteration 0..
2021-01-26 22:56:54,327			Done (cost 2110.563916)
2021-01-26 22:56:54,330		Beginning iteration 1..
2021-01-26 22:56:58,054			Done (cost 1613.682383)
2021-01-26 22:56:58,056		Beginning iteration 2..
2021-01-26 22:57:03,969			Done (cost 1409.071523)
2021-01-26 22:57:03,972		Beginning iteration 3..
2021-01-26 22:57:10,909			Done (cost 1279.724101)
2021-01-26 22:57:10,913		Beginning iteration 4..
2021-01-26 22:57:15,930			Done (cost 1178.128112)
2021-01-26 22:57:15,933		Beginning iteration 5..
2021-01-26 22:57:20,176			Done (cost 1075.820170)
2021-01-26 22:57:20,178		Beginning iteration 6..
2021-01-26 22:57:24,258			Done (cost 971.787264)
2021-01-26 22:57:24,262		Beginning iteration 7..
2021-01-26 22:57:28,651			Done (cost 893.479254)
2021-01-26 22:57:28,653		Beginning iteration 8..
2021-01-26 22:57:32,730			Done (cost 842.412218)
2021-01-26 22:57:32,732		Beginning iteration 9..
2021-01-26 2

In [5]:
# Save the model: write the merged embedding matrix W to "suihanki.pkl"
# (path is relative to the notebook's working directory).
glove.save_model(W, "suihanki.pkl")

2021-01-26 22:58:49,058	Saved vectors to suihanki.pkl


In [6]:
# Find the words most similar to '炊飯' and print them with a 1-based rank,
# in the order returned by evaluate.most_similar.
sample = '炊飯'
tokens = evaluate.most_similar(W, vocab, id2word, sample)
for rank, token in enumerate(tokens, start=1):
    print("Similar word to the {}: No.{} {}".format(sample, rank, token))

Similar word to the 炊飯: No.1 スチーム
Similar word to the 炊飯: No.2 器
Similar word to the 炊飯: No.3 コース
Similar word to the 炊飯: No.4 予約
Similar word to the 炊飯: No.5 」
Similar word to the 炊飯: No.6 「
Similar word to the 炊飯: No.7 ボタン
Similar word to the 炊飯: No.8 保温
Similar word to the 炊飯: No.9 ｣
Similar word to the 炊飯: No.10 本体
Similar word to the 炊飯: No.11 時
Similar word to the 炊飯: No.12 中
Similar word to the 炊飯: No.13 ●
Similar word to the 炊飯: No.14 ごはん
Similar word to the 炊飯: No.15 洗い


In [8]:
# Show the learned embedding matrix; as the cell's last expression, its repr
# is rendered as the cell output.
W

array([[-0.12503674,  0.05938223, -0.10863096, ...,  0.04887136,
         0.12237998,  0.17955263],
       [-0.10567155, -0.00523037, -0.03543894, ...,  0.05692163,
        -0.01669103,  0.03056564],
       [-0.07422362,  0.07169773,  0.22255532, ...,  0.11500449,
        -0.08311284,  0.05730083],
       ...,
       [ 0.18337403, -0.07107848,  0.07728845, ...,  0.03935498,
        -0.04053991, -0.04611406],
       [ 0.01999824,  0.07661286, -0.11690537, ..., -0.04865835,
         0.03669333, -0.09124776],
       [ 0.18946176,  0.18527365, -0.06452272, ...,  0.20115283,
        -0.00851138,  0.07824027]])