In [3]:
import MeCab
import os
import pandas as pd

# Directory that holds the MeCab dictionary: "C:\Program Files\MeCab\dic\ipadic"
path_dic = "C:\\Program Files\\MeCab\\dic\\ipadic"

# Column layout of the corpus table: one row per parsed file.
columns = ["topic", "date", "words"]

# Sub-directories of "text/" to read; each name is also stored as the row's topic.
folders = [
  "dokujo-tsushin",
  "it-life-hack",
  "kaden-channel",
  "livedoor-homme",
  "movie-enter",
  "peachy",
  "smax",
  "sports-watch",
  "topic-news"
]

# Create the MeCab tagger instance.
# NOTE(review): MeCab.Tagger takes a command-line style option string, and a
# dictionary directory is normally selected with "-d <dir>". Passed bare like
# this, the path may not be interpreted as the dictionary location, in which
# case the default dictionary is used -- confirm which dictionary is loaded.
tagger = MeCab.Tagger(path_dic)

# Morphological analysis
## Read every file under "text/<folder>" and tokenise its body.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating inside the loop would re-copy the whole frame per file.
records = []
for folder in folders:
  folder_path = os.path.join("text", folder)
  # Sorted so the row order does not depend on the OS directory listing order.
  # NOTE(review): every directory entry is opened, so any non-article file in
  # a folder (e.g. a license or readme) also becomes a row -- confirm intended.
  for file in sorted(os.listdir(folder_path)):
    with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
      lines = f.readlines()

    # The second line is stored as the date; everything from the third line
    # onwards is the text handed to the tagger.
    date = lines[1].replace("\n", "")
    text = "".join(lines[2:])

    # Run the morphological analysis with parse().
    result = tagger.parse(text)

    # Keep only the surface form (the part before the first tab) of each
    # output row, stopping at the "EOS" terminator row.
    surfaces = []
    for row in result.split("\n"):
      if row == "EOS":
        break
      surfaces.append(row.split("\t")[0])

    # Store the tokens as a single string joined with ' '.
    records.append([folder, date, " ".join(surfaces)])

# Build the DataFrame in one step; rows get a unique 0..n-1 index.
df = pd.DataFrame(records, columns=columns)

In [9]:
# Train Word2Vec
from gensim.models import Word2Vec

# Each entry of df["words"] is a string of tokens joined with ' ';
# split it back into a list of tokens, one list per document.
sentences = [text.split(" ") for text in df["words"]]

# Fit the model on the tokenised documents.
model = Word2Vec(
  sentences,
  sg=1,
  vector_size=100,
  window=5,
  min_count=1,
)

In [15]:
# Show the words most similar to the query word, one (word, score) tuple per line.
print(*model.wv.most_similar("講義"), sep="\n")

('アドバイザー', 0.8483713865280151)
('可否', 0.8460174202919006)
('入部', 0.8424232602119446)
('キュレーター', 0.8387464880943298)
('ラサール', 0.8377898931503296)
('啓発', 0.8316361904144287)
('許可なく', 0.8306527733802795)
('立案', 0.8303689956665039)
('通達', 0.8287867903709412)
('解答', 0.8286617398262024)


In [14]:
# Arithmetic on word vectors
# Example: "日本" + "パリ" - "東京" = ?
added_words = ["日本", "パリ"]
subtracted_words = ["東京"]
analogy_candidates = model.wv.most_similar(positive=added_words, negative=subtracted_words)
for candidate in analogy_candidates:
    print(candidate)

('巨匠', 0.6439781785011292)
('スウェーデン', 0.629744827747345)
('死闘', 0.6239414215087891)
('フランス', 0.6202606558799744)
('裏切り', 0.6161167621612549)
('ボクシング', 0.6129253506660461)
('イギリス', 0.6093736886978149)
('凱旋', 0.6053586602210999)
('純真', 0.605124294757843)
('終焉', 0.6035944223403931)
