# word2vecでモデル構築

独自のデータでモデルを構築する

手順

- 単語をリストにする
- 学習する
- 似た単語を見つける
- 計算してみる

In [1]:
import urllib.request
import base64

out = []
text = ""

url = "https://hit-u-data-text-processing.herokuapp.com/data/20210502-news-text.txt"
auth_str = base64.b64encode(b"reader:hit-u")
req = urllib.request.Request(url,
                            headers={"Authorization": "Basic " + auth_str.decode("utf-8")})
with urllib.request.urlopen(req) as req:
    text = req.read().decode("utf-8")
    for line in text.split("\n"):
        if line.strip() == "===":
            if text:
                out.append(text)
            text = ""
        else:
            text += line
url = "https://hit-u-data-text-processing.herokuapp.com/data/20210612-news-text.txt"
auth_str = base64.b64encode(b"reader:hit-u")
req = urllib.request.Request(url,
                            headers={"Authorization": "Basic " + auth_str.decode("utf-8")})
with urllib.request.urlopen(req) as req:
    text = req.read().decode("utf-8")
    for line in text.split("\n"):
        if line.strip() == "===":
            if text:
                out.append(text)
            text = ""
        else:
            text += line

In [2]:
len(out)

34

In [None]:
# Colabで実行している場合
# pip install janome

In [3]:
from janome.tokenizer import Tokenizer
from janome.tokenfilter import TokenFilter
from janome.tokenfilter import CompoundNounFilter
from janome.tokenfilter import POSKeepFilter
from janome.tokenfilter import LowerCaseFilter
from janome.charfilter import UnicodeNormalizeCharFilter
from janome.analyzer import Analyzer

In [4]:
class StopWordFilter(TokenFilter):
    def __init__(self, words):
        self.stop_words = words
    
    def apply(self, tokens):
        for token in tokens:
            if token.surface not in self.stop_words:
                yield token

In [5]:
import urllib.request
import base64
url = "https://hit-u-data-text-processing.herokuapp.com/data/stopwords.txt"
auth_str = base64.b64encode(b"reader:hit-u")
req = urllib.request.Request(url,
                            headers={"Authorization": "Basic " + auth_str.decode("utf-8")})

stop_words = []
with urllib.request.urlopen(req) as req:
    lines = req.read().decode("utf-8")
    for line in lines.split("\n"):
        if line.strip():
            stop_words.append(line.strip())
stop_words

['大学', '一橋大学', '===', 'こと', 'the', 'ため', 'よう', 'of', '(', ')', '様']

In [6]:
stop_word_filter = StopWordFilter(stop_words)

In [7]:
token_filters = [CompoundNounFilter(),
                POSKeepFilter(["名詞", "動詞", "形容詞"]),
                LowerCaseFilter(),
                stop_word_filter]

In [8]:
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
analyzer = Analyzer(char_filters=char_filters, 
                    tokenizer=tokenizer, 
                    token_filters=token_filters)

In [9]:
words = [[] for _ in range(len(out))]
for i, text in enumerate(out):
    words[i] = []
    for token in analyzer.analyze(text):
        words[i].append(token.base_form)

In [10]:
len(words)

34

In [11]:
type(words[0])

list

In [12]:
words[0][:10]

['ソニー',
 'パナソニック',
 '富士通',
 '資生堂',
 '共同',
 'デザイン組織',
 '共通評価指標',
 '検討',
 '作成',
 '2021年4月30日']

In [13]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [15]:
# Jupyter labでPython 3.9 gensim 4.2.0 を使っている場合
model = Word2Vec(sentences=words, sg=1, min_count=1, vector_size=20)

In [None]:
# Colabでgensim3.6.0を使っている場合
model = Word2Vec(sentences=words, sg=1, min_count=1, size=20)

In [16]:
model.wv.most_similar("富士通")

[('デザイン組織', 0.9902030229568481),
 ('活動', 0.9899303317070007),
 ('ソニー', 0.98918217420578),
 ('デザイン', 0.9888842105865479),
 ('いる', 0.9888550639152527),
 ('研究面', 0.9888089299201965),
 ('広い', 0.988551914691925),
 ('られる', 0.9885348677635193),
 ('パナソニック', 0.9883995056152344),
 ('言える', 0.9882439374923706)]

In [17]:
model.wv.most_similar(positive=["ソニー"], negative=["学長"])

[('ディスカッションパート1)', 0.1494382619857788),
 ('一橋大学博士(法学)。著書:『核', 0.13839954137802124),
 ('講演者プロフィール', 0.1341756284236908),
 ('公共政策大学院教授開会挨拶', 0.10643982887268066),
 ('bayesian', 0.10532860457897186),
 ('2021年2月27日(土)', 0.10432818531990051),
 ('国立市', 0.09989476948976517),
 ('一橋大学博士(法学)。', 0.0985269546508789),
 ('講演1', 0.09746456146240234),
 ('2020年12月9日', 0.09218787401914597)]

In [18]:
model.wv.most_similar(positive=["富士通", "ソニー", "資生堂"], negative=["研究"])

[('シカゴ', 0.9885695576667786),
 ('デザイン組織', 0.9882736206054688),
 ('おる', 0.9873428344726562),
 ('違う', 0.9865394234657288),
 ('大きい', 0.9863448143005371),
 ('いる', 0.9861999154090881),
 ('パナソニック', 0.9859302639961243),
 ('制度(インハウスデザイナー制度)', 0.9853355884552002),
 ('世界', 0.985306441783905),
 ('持つ', 0.9852427840232849)]

In [19]:
model.wv.similarity("富士通", "ソニー")

0.9891821

In [20]:
model.wv.similarity("富士通", "学長")

0.97469413

In [21]:
model.wv.similarity("国立大学法人", "学長")

0.9262252

In [22]:
model.wv.similarity("国立大学法人", "慶應義塾大学")

0.92455274