In [2]:
import re
import Levenshtein
import numpy as np
import MeCab
from operator import methodcaller
import text_similarity as sim

In [3]:
def similarity(x, y, formula='jaccard'):
    """Dispatch to one of the similarity functions of ``text_similarity``.

    The function ``<formula>_similarity`` is looked up on the module
    imported as ``sim`` and called with ``x`` and ``y``.

    Args:
        x: first operand, passed through unchanged.
        y: second operand, passed through unchanged.
        formula: name prefix of the function to use; this notebook uses
            'jaccard', 'simpson' and 'levenshtein'.

    Returns:
        Whatever the selected ``sim.<formula>_similarity(x, y)`` returns.

    Raises:
        AttributeError: if ``sim`` has no ``<formula>_similarity`` attribute.
    """
    target_name = f'{formula}_similarity'
    scorer = getattr(sim, target_name)
    return scorer(x, y)

In [11]:
# Directory holding the documents to compare, relative to the notebook.
dataset = 'datasets/ishida'
# Documents are named d<number>.txt; only these five are compared.
files = [f'd{number}.txt' for number in (1, 20, 40, 60, 80)]
# Number of documents; the pairwise loops below iterate over range(n).
n = len(files)

In [12]:
def create_mecab(parts_of_speech=('名詞',), stop_words=()):
    """Build a MeCab-based tokenizer that keeps only selected parts of speech.

    Args:
        parts_of_speech: iterable of part-of-speech labels (the first
            comma-separated field of a MeCab node's feature string) to keep.
            Defaults to nouns only.
        stop_words: iterable of words to drop from the result. For verbs the
            comparison is made against the base form, not the surface.

    Returns:
        A function ``_mecab(text) -> list[str]`` returning the kept words in
        document order. Verbs are replaced by their base form when one is
        available; every other word is returned as its surface form.
    """
    # Snapshot the filters as sets: O(1) membership tests, and the tokenizer
    # is unaffected if the caller later mutates the lists it passed in.
    wanted_pos = frozenset(parts_of_speech)
    ignored_words = frozenset(stop_words)

    # Build the tagger once per tokenizer rather than once per call.
    # NOTE(review): the tagger is now shared by every call of the returned
    # function; create one tokenizer per thread if this is ever used
    # concurrently.
    tagger = MeCab.Tagger()

    # Position of the base (dictionary) form inside the feature string.
    # Assumes an IPADIC-style feature layout; confirm for other dictionaries.
    base_form_index = 6

    def _mecab(text):
        """Tokenize ``text`` and return the filtered list of words."""
        rs = []
        node = tagger.parseToNode(text)
        while node:
            # Split the feature string once per node.
            features = node.feature.split(",")
            hinshi = features[0]
            word = node.surface

            if hinshi == "動詞":
                # Normalize verbs to their base form. Fall back to the
                # surface when the field is missing or is the "*" placeholder,
                # instead of raising IndexError or emitting "*".
                if len(features) > base_form_index:
                    base_form = features[base_form_index]
                    if base_form and base_form != "*":
                        word = base_form

            if hinshi in wanted_pos and word not in ignored_words:
                rs.append(word)

            node = node.next
        return rs

    return _mecab

In [13]:
# Sanity check of the tokenizer on two short sentences.
# `mecab` keeps nouns, verbs and adjectives and is reused by later cells.
mecab = create_mecab(parts_of_speech=['名詞', '動詞', '形容詞'])

a = "今日はいい天気ですね。"
b = '今日勉強します。'
list_a = mecab(a)
list_b = mecab(b)

# One token list per line.
print(list_a, list_b, sep='\n')

# Last expression of the cell: displayed as the cell's output.
sim.levenshtein_similarity(list_a, list_b)

['今日', 'いい', '天気']
['今日', '勉強', 'する']


0.33333333333333337

In [18]:
# Compare every pair of documents with every similarity formula, then print
# the pairs per formula ordered from highest to lowest score.
methods = ['jaccard', 'simpson', 'levenshtein']
# Only referenced by a disabled experiment that joined tokens into a single
# string before scoring; kept so that any other cell using the name still runs.
inmethods = ['levenshtein']

# Read and tokenize each document exactly once. The token lists depend on
# neither the formula nor the pairing, so redoing this inside the loops below
# only repeated identical work.
doc_tokens = []
for k in range(n):
    with open(f'{dataset}/{files[k]}', mode="r", encoding="utf-8") as f:
        doc_tokens.append(mecab(f.read()))

# rs holds one (formula, i, j, score) tuple per formula and unordered pair
# i < j, where i and j index into `files`.
rs = []
for m in methods:
    for i in range(n):
        for j in range(i + 1, n):
            score = similarity(doc_tokens[i], doc_tokens[j], formula=m)
            rs.append((m, i, j, score))

for m in methods:
    print(m)
    # Drop the formula name, leaving (i, j, score), and sort by score.
    ranked = sorted(
        (row[1:] for row in rs if row[0] == m),
        key=lambda entry: entry[2],
        reverse=True,
    )
    for s in ranked:
        print(s)

jaccard
(2, 3, 0.7155322862129145)
(0, 4, 0.7150170648464164)
(1, 2, 0.6978798586572438)
(3, 4, 0.6888888888888889)
(0, 3, 0.5007363770250368)
(1, 3, 0.4961832061068702)
(2, 4, 0.47938931297709925)
(0, 2, 0.34048257372654156)
(1, 4, 0.31607629427792916)
(0, 1, 0.2143727161997564)
simpson
(0, 4, 0.8603696098562629)
(2, 3, 0.8506224066390041)
(3, 4, 0.8275154004106776)
(1, 2, 0.824634655532359)
(0, 3, 0.6786427145708582)
(1, 3, 0.6784968684759917)
(2, 4, 0.6514522821576764)
(0, 2, 0.5269709543568465)
(1, 4, 0.48434237995824636)
(0, 1, 0.3674321503131524)
levenshtein
(1, 2, 0.7780979827089337)
(3, 4, 0.7774674115456238)
(2, 3, 0.7192488262910799)
(1, 3, 0.5089201877934273)
(2, 4, 0.5009310986964619)
(1, 4, 0.2914338919925512)
(0, 2, 0.050091074681238634)
(0, 1, 0.03551912568306015)
(0, 3, 0.030965391621129323)
(0, 4, 0.024590163934426257)
