In [1]:
from tokenize_aozora import Task,DownloadAozora,TokenCounter,SortByFreq,noun_filter,DataSet,Process
from janome.tokenizer import Tokenizer
from gensim.models import word2vec
import re

class RemoveMetaData(Process):
    """Strip Aozora Bunko markup from text lines.

    Removes the ruby start marker, ruby readings written as 《…》 and
    editorial input notes written as ［＃…］, leaving only the body text.
    """

    def apply(self, lines):
        """Return a new list with the markup removed from every line.

        Args:
            lines: iterable of strings.

        Returns:
            list of str, one cleaned string per input line, in input order.
        """

        def remove(line):
            s = line
            # Drop the marker that flags where ruby base text starts. Aozora
            # Bunko writes it as the full-width bar "｜" (U+FF5C); the ASCII
            # "|" is still removed as well, as before.
            s = re.sub(r'[|｜]', '', s)
            s = re.sub(r'《.+?》', '', s)  # remove ruby readings
            s = re.sub(r'［＃.+?］', '', s)  # remove input notes
            return s

        return [remove(line) for line in lines]
        
class Tokenize(Process):
    """Turn each line into a space-separated string of selected tokens."""

    def __init__(self, tokenizer):
        """Keep the tokenizer whose ``tokenize(line)`` yields the tokens."""
        self.tokenizer = tokenizer

    def apply(self, lines):
        """Tokenize every line and keep only tokens of the wanted word classes.

        A token is kept when the first comma-separated field of its
        ``part_of_speech`` is one of noun / adjective / verb / symbol.
        Kept tokens are emitted as their ``base_form``, or as their
        ``surface`` when the base form is the placeholder ``'*'``.

        Returns:
            list of str, one space-joined token string per input line.
        """
        wanted_classes = ['名詞', '形容詞', '動詞', '記号']

        def words_of(text):
            # Yield the output form of every token that passes the filter.
            for tok in self.tokenizer.tokenize(text):
                word_class = tok.part_of_speech.split(',')[0]
                if word_class not in wanted_classes:
                    continue
                if tok.base_form == '*':
                    yield tok.surface
                else:
                    yield tok.base_form

        result = []
        for text in lines:
            result.append(' '.join(words_of(text)))
        return result
        
class GenshiModel(Process):
    """Train a gensim Word2Vec model on tokenized lines and save it to disk.

    Two files are written, both derived from ``filename``:
    ``<filename>.tokenized`` (the training corpus) and
    ``<filename>.model`` (the saved model).
    """

    def __init__(self, filename):
        """Remember the base path used for the corpus and model files."""
        self.filename = filename

    def apply(self, lines):
        """Write ``lines`` to disk, train Word2Vec on them and return the model.

        Args:
            lines: iterable of strings; they are joined with newlines and
                read back through ``word2vec.LineSentence``.

        Returns:
            The trained ``word2vec.Word2Vec`` model (also saved to disk).
        """
        import inspect

        # Save the word-segmented text so LineSentence can read it back.
        tokenized_file = self.filename + '.tokenized'
        with open(tokenized_file, 'w', encoding='utf-8') as fp:
            fp.write("\n".join(lines))

        # gensim 4 renamed the dimensionality keyword from ``size`` to
        # ``vector_size``; pass whichever one the installed version accepts
        # so the same code runs on gensim 3.x and 4.x.
        ctor_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
        dim_keyword = 'vector_size' if 'vector_size' in ctor_params else 'size'

        # Build the model.
        model_file = self.filename + '.model'
        data = word2vec.LineSentence(tokenized_file)
        model = word2vec.Word2Vec(
            data, window=10, hs=1, min_count=2, sg=1, **{dim_keyword: 200}
        )
        model.save(model_file)
        return model


In [2]:
# Look up the dataset entry for the work to process.
text = DataSet.get('At the Mountains of Madness')

# Register the processing stages on a Task, in this order.
# NOTE(review): presumably the stages run in registration order, each
# receiving the previous stage's output — confirm against tokenize_aozora.Task.
t = Task()
# Stage built from the dataset entry (Aozora download, judging by the name).
t.process(DownloadAozora(text))
# Strip ruby and input-note markup from the lines.
t.process(RemoveMetaData())
# Tokenize with janome and keep the selected parts of speech.
t.process(Tokenize(Tokenizer()))
# Write the tokenized corpus, then train and save the Word2Vec model.
t.process(GenshiModel(text.filename()))
# Later cells use `model` as the trained Word2Vec model, so run() is expected
# to return the last stage's result — TODO confirm.
model = t.run(None)

In [27]:
# Query through the word vectors in `model.wv`: calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['古', '邪悪'])

[('アナロジー', 0.9499032497406006),
 ('神', 0.9487597346305847),
 ('禁断', 0.9486730694770813),
 ('永遠', 0.9450716972351074),
 ('闇', 0.9423696398735046),
 ('適う', 0.9399266839027405),
 ('逃げる', 0.9393948316574097),
 ('道理', 0.9382576942443848),
 ('叫び', 0.9366800785064697),
 ('追い立てる', 0.9357254505157471)]

In [31]:
# Query through the word vectors in `model.wv`: calling most_similar on the
# model object itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['死'], negative=['温度'])

[('恐怖', 0.33436575531959534),
 ('世界', 0.3195968568325043),
 ('くれる', 0.3060491383075714),
 ('最後', 0.2898232340812683),
 ('前', 0.2875790297985077),
 ('自分', 0.26906710863113403),
 ('音', 0.2648683488368988),
 ('外部', 0.25517863035202026),
 ('自身', 0.2540249228477478),
 ('しまう', 0.2498125433921814)]