In [1]:
import text_transformer as tf

# split transformer

In [2]:
# Two sample English sentences; only `a` is tokenized in this cell.
a = "a cat is acrossing the road"
b = "but the cat is running into the forest"
# No parser is passed, so word_seq uses its default behaviour
# (the displayed result is the sentence split on spaces).
tf.word_seq(a)

['a', 'cat', 'is', 'acrossing', 'the', 'road']

# janome transformer

In [3]:
# Sample Japanese sentence. Roughly: "English is easy to split because words are
# separated by spaces, but Japanese is hard."
text = '英語は単語ごとにスペースで区切られているので分割するのが簡単だが、日本語は難しい。'
# Part-of-speech tags handed to the parser: noun, verb, adjective.
pos = ['名詞','動詞','形容詞']
# Tokens handed to the parser as stop words.
stop = ['の','いる','する','ごと','こと','[',']']

# No `worker` is given, so create_parser uses its default backend
# (presumably Janome, going by the variable name — TODO confirm in text_transformer).
janome = tf.create_parser(parts_of_speech=pos,stop_words=stop)
tf.word_seq(text, parser = janome)

['英語', '単語', 'スペース', '区切る', 'れる', '分割', '簡単', '日本語', '難しい']

# mecab transformer

In [4]:
# Same part-of-speech and stop-word settings as the previous cell, but with
# worker='mecab'. NOTE: `mecab` is a notebook-level name that copy_degree reads,
# so this cell must run before copy_degree is called.
mecab = tf.create_parser(worker='mecab', parts_of_speech=pos,stop_words=stop)
tf.word_seq(text, parser = mecab)

['英語', '単語', 'スペース', '区切る', 'れる', '分割', '簡単', '日本語', '難しい']

# Compute similarity scores 

In [5]:
import text_similarity as sim
from score import score 
from operator import methodcaller

"""
map   : apply `func` to every member of a list
reduce: aggregate all members of a list into a single value by repeatedly applying `func`
filter: keep only the members of a list for which `func` evaluates to True

"""

def copy_degree(doc1, doc2, method='jaccard', sent_cutoff=0.10, para_cutoff=0.10, parser=None):
    """Score how strongly two documents overlap, sentence by sentence.

    Each document is split into paragraphs (``tf.para_seq``), each paragraph
    into sentences (``tf.sent_seq``) and each sentence into a word list
    (``tf.word_seq``).  Every sentence of every paragraph of ``doc1`` is then
    compared with every sentence of every paragraph of ``doc2``.  The
    sentence-level results of one paragraph pair are aggregated with
    ``score``, and the paragraph-level results are aggregated with ``score``
    once more.

    Parameters
    ----------
    doc1, doc2 : str
        Raw document texts.
    method : str
        Name prefix of the similarity function: ``sim.<method>_similarity``
        is called with the two word lists of a sentence pair.
    sent_cutoff : float
        Forwarded as ``cutoff`` to ``score`` for the sentence-level data.
    para_cutoff : float
        Forwarded as ``cutoff`` to ``score`` for the paragraph-level data.
    parser : optional
        Parser handed to ``tf.word_seq``.  When ``None`` (the default) the
        notebook-level ``mecab`` parser is used, as before, so it must have
        been created by an earlier cell.

    Returns
    -------
    Whatever ``score`` returns for the list of
    ``(paragraph_index_1, paragraph_index_2, paragraph_score)`` tuples.

    Raises
    ------
    AttributeError
        If ``text_similarity`` has no ``<method>_similarity`` attribute.
    """
    # Resolve the similarity function once instead of once per sentence pair;
    # an unknown method now fails before any tokenization work is done.
    similarity = getattr(sim, f'{method}_similarity')

    def _tokenize(text):
        """Return ``text`` as a list of paragraphs, each a list of word lists."""
        # Only fall back to the global parser when the caller supplied none.
        active_parser = mecab if parser is None else parser
        return [
            [tf.word_seq(sentence, parser=active_parser)
             for sentence in tf.sent_seq(paragraph)]
            for paragraph in tf.para_seq(text)
        ]

    paragraphs1 = _tokenize(doc1)
    paragraphs2 = _tokenize(doc2)

    data_para = []
    for i, p1 in enumerate(paragraphs1):
        for j, p2 in enumerate(paragraphs2):
            # One (sentence index in p1, sentence index in p2, similarity)
            # tuple per sentence pair of this paragraph pair.
            data_sent = [
                (ii, jj, similarity(s1, s2))
                for ii, s1 in enumerate(p1)
                for jj, s2 in enumerate(p2)
            ]
            data_para.append((i, j, score(data_sent, cutoff=sent_cutoff)))

    return score(data_para, cutoff=para_cutoff)


# Compare two documents of the sample dataset with the default (jaccard) method.
file1, file2 = 'datasets/ishida/d20.txt', 'datasets/ishida/d40.txt'

# Load both documents as UTF-8 text.
with open(file1, mode="r", encoding="utf-8") as f1, \
        open(file2, mode="r", encoding="utf-8") as f2:
    text1, text2 = f1.read(), f2.read()

# Last expression of the cell: the resulting copy degree is displayed.
copy_degree(text1, text2, sent_cutoff=0.1)

0.36363636363636365

In [6]:
# Print the copy degree of every unordered pair of documents, once per
# similarity method.
dataset = 'datasets/ishida'
files=['d1.txt', 'd20.txt', 'd40.txt', 'd60.txt', 'd80.txt']
methods = ['jaccard','simpson','levenshtein']

# Read each document exactly once. Previously every file was re-opened for
# every pair and every method, and the loop overwrote the `file1`/`text1`
# variables of the previous cell.
texts = {}
for name in files:
    with open(f'{dataset}/{name}', mode="r", encoding="utf-8") as f:
        texts[name] = f.read()

n = len(files)
for m in methods:
    print(m.upper())
    # i < j enumerates each unordered pair once.
    for i in range(n-1):
        for j in range(i+1,n):
            degree = copy_degree(texts[files[i]], texts[files[j]],
                                 method=m, sent_cutoff=0.1, para_cutoff=0.08)
            print(f'{degree:.3f} ({files[i]} & {files[j]})')

JACCARD
0.029 (d1.txt & d20.txt)
0.091 (d1.txt & d40.txt)
0.151 (d1.txt & d60.txt)
0.227 (d1.txt & d80.txt)
0.413 (d20.txt & d40.txt)
0.193 (d20.txt & d60.txt)
0.096 (d20.txt & d80.txt)
0.212 (d40.txt & d60.txt)
0.154 (d40.txt & d80.txt)
0.194 (d60.txt & d80.txt)
SIMPSON
0.512 (d1.txt & d20.txt)
0.490 (d1.txt & d40.txt)
0.511 (d1.txt & d60.txt)
0.556 (d1.txt & d80.txt)
0.706 (d20.txt & d40.txt)
0.619 (d20.txt & d60.txt)
0.576 (d20.txt & d80.txt)
0.587 (d40.txt & d60.txt)
0.556 (d40.txt & d80.txt)
0.569 (d60.txt & d80.txt)
LEVENSHTEIN
0.037 (d1.txt & d20.txt)
0.080 (d1.txt & d40.txt)
0.134 (d1.txt & d60.txt)
0.210 (d1.txt & d80.txt)
0.238 (d20.txt & d40.txt)
0.136 (d20.txt & d60.txt)
0.091 (d20.txt & d80.txt)
0.168 (d40.txt & d60.txt)
0.137 (d40.txt & d80.txt)
0.174 (d60.txt & d80.txt)
