In [1]:
with open('suihanki_wakati.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [2]:
# 共起行列(と単語のリスト)を返す
# w_size: ウィンドウ幅
def cooccurrence(text, w_size):
    import numpy as np

    # i文目のj語目が格納された二次元配列sentence(i, j)
    sentence = text.split('\n')
    for i in range(len(sentence)):
        sentence[i] = sentence[i].split()

    # 登場した単語のリスト
    words = list(set(text.split()))

    # words数サイズのゼロ行列を作る
    matrix = np.zeros((len(words), len(words)))

    # number番目の単語がw
    for number, w in enumerate(words):
        for i in range(len(sentence)):
            for j in range(len(sentence[i])):
                # wとi文目j単語目とが合致したら
                if w == sentence[i][j]:
                    # 行列の、窓幅分前の単語のところを+1する
                    for d in range(1, w_size+1):
                        if j-d >= 0:
                            matrix[number][words.index(sentence[i][j-d])] += 1
                    # 行列の、窓幅分後の単語のところを+1する
                    for d in range(1, w_size+1):
                        if j+d < len(sentence[i]):
                            matrix[number][words.index(sentence[i][j+d])] += 1

    return words, matrix

In [3]:
# ベクトルの0でない成分を返す
# wは単語ベクトル
def F(w):
    a = []
    for n, i in enumerate(w):
        if i != 0:
            a.append(n)
    return set(a)

In [4]:
def WeedsP(w1, w2):
    m = 0
    for i in list(F(w1)):
        m += w1[i]
    
    if m == 0:
        return 0

    c = 0
    for i in list(F(w1)&F(w2)):
        c += w1[i]
    return c / m

In [5]:
def weeds_list(text, w_size=1):
    # words: textに登場する単語のリスト
    # matrix: 共起行列
    words, matrix = cooccurrence(text, w_size=1)
    
    # words[i]が下位・words[j]が上位、と仮定してweedsを計算する
    for i in range(len(words)):
        for j in range(len(words)):
            if i != j:
                w = WeedsP(matrix[i], matrix[j])
                if w >= 0.5:
                    with open('weeds.txt', mode='a', encoding='utf-8') as f:
                        f.write(', '.join([words[i], words[j], str(w)]))
                        f.write('\n')

共起行列作る関数を直してもう少し早くする

In [1]:
def c_matrix(text, w_size, w_list):
    import numpy as np
    #import pickle
    from szk_lib import genkei

    # i文目のj語目が格納された二次元配列sentence(i, j)
    sentence = text.split('\n')
    for i in range(len(sentence)):
        sentence[i] = sentence[i].split()

    # 登場した単語(原形)のリスト
    #with open(w_list, mode='rb') as f:
    #    words = pickle.load(f)
    words = w_list

    # words数サイズのゼロ行列を作る
    matrix = np.zeros((len(words), len(words)))

    for i in range(len(sentence)):
        for j in range(len(sentence[i])):
            number = words.index(genkei(sentence[i][j]))
            # 行列の、窓幅分前の単語のところを+1する
            for d in range(1, w_size+1):
                if j-d >= 0:
                    matrix[number][words.index(genkei(sentence[i][j-d]))] += 1
                if j+d < len(sentence[i]):
                    matrix[number][words.index(genkei(sentence[i][j+d]))] += 1

    #with open('c_matrix', mode='wb') as f:
    #    pickle.dump(matrix , f)
    return matrix

In [4]:
text = 'トンネルを抜けるとそこは雪国だった\n恥の多い生涯を送ってきました'
import MeCab
import szk_lib
tagger = MeCab.Tagger("-Owakati")
tagger.parse('')
text = tagger.parse(text)
words = list(set(text.split()))
for i in range(len(words)):
    words[i] = szk_lib.genkei(words[i])
words = list(set(words))
print(text)
print(words)
print(c_matrix(text, 1, words))

トンネル を 抜ける と そこ は 雪国 だっ た 恥 の 多い 生涯 を 送っ て き まし た 

['トンネル', 'は', 'そこ', '生涯', 'き', '多い', '恥', '抜ける', '雪国', 'だ', 'ます', '送る', 'て', 'と', 'を', 'の', 'た']
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 

In [32]:
# 原形入手・改善版
def genkei(word):
    import MeCab
    tagger = MeCab.Tagger("-Ochasen")
    tagger.parse('')
    node = tagger.parseToNode(word)

    while node:
        word = node.surface
        wclass = node.feature.split(',')
        # wclass:品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音

        if wclass[0] != u'BOS/EOS':
            if wclass[6] == None:
                return word
            else:
                return wclass[6]
        node = node.next

In [42]:
# 品詞入手・改善版
def hinshi(word):
    import MeCab
    tagger = MeCab.Tagger("-Ochasen")
    tagger.parse('')
    node = tagger.parseToNode(genkei(word))

    while node:
        word = node.surface
        wclass = node.feature.split(',')
        # wclass:品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用形,活用型,原形,読み,発音

        if wclass[0] != u'BOS/EOS':
            return wclass[0]
        node = node.next

In [43]:
print(genkei('帰り'))
print(hinshi('帰り'))

帰る
動詞
