In [1]:
from text_vectorian import SpBertVectorian

# Paths to the pretrained Japanese BERT assets, relative to this notebook.
# Judging by the file names these are a SentencePiece tokenizer model and a
# TensorFlow checkpoint prefix -- TODO confirm against the bert-japanese repo.
tokenizer_filename = '../bert-japanese/model/wiki-ja.model'
vectorizer_filename = '../bert-japanese/model/model.ckpt-1400000'

# Build the BERT-based vectorizer; later cells read it as a notebook global.
bert_vectorian = SpBertVectorian(
    tokenizer_filename=tokenizer_filename,
    vectorizer_filename=vectorizer_filename
)

Using TensorFlow backend.


In [2]:
from text_vectorian import SentencePieceVectorian

# Word2vec baseline vectorizer, constructed with the library defaults (no
# paths are passed; the cell output lists the model files that were used).
# Later cells read it as a notebook global.
word2vec_vectorian = SentencePieceVectorian()

/tmp/.keras/.models/wikija-sentencepiece_300.model
/tmp/.keras/.models/wikija-sentencepieced_word2vec_300.model
/tmp/.keras/.models/wikija-sentencepieced_word2vec_300.model.wv.vectors.npy
/tmp/.keras/.models/wikija-sentencepieced_word2vec_300.model.trainables.syn1neg.npy


In [3]:
import numpy as np
import pandas as pd

def cossim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``v1`` and ``v2`` divided by the product
    of their Euclidean norms. No special handling is done for zero-length
    vectors; the division is performed as-is.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    inner_product = np.dot(v1, v2)
    return inner_product / norm_product

def get_vectors(vectorian, text):
    """Vectorize and tokenize ``text`` with the given vectorizer.

    Args:
        vectorian: Object exposing ``fit(text)`` (whose result has a
            ``vectors`` attribute) and a ``_tokenizer`` with ``_tokenize``.
        text: The text to process.

    Returns:
        A ``(tokens, vectors)`` tuple.
    """
    # Vectorize first, then tokenize, so the call order on the vectorizer
    # stays fit -> _tokenize.
    fitted = vectorian.fit(text)
    embedded = fitted.vectors
    pieces = vectorian._tokenizer._tokenize(text)

    return pieces, embedded

def get_df(original_text, my_text):
    """Build a per-token comparison table for two texts.

    Both texts are vectorized with the notebook-global ``bert_vectorian`` and
    ``word2vec_vectorian``. For each token position the cosine difference
    (``1 - cosine similarity``) between the two texts is computed for each
    vectorizer.

    Positions are compared by index, so texts that tokenize to different
    lengths are compared over their common prefix only. A text that yields
    no tokens produces an empty table.

    Args:
        original_text: Reference text.
        my_text: Text to compare against the reference.

    Returns:
        A ``pandas.DataFrame`` indexed by token position (``'index'``) with
        columns ``'original token'``, ``'my token'`` (both taken from the BERT
        tokenizer output), ``'cos diff(bert)'`` and ``'cos diff(word2vec)'``.
        Differences are limited to the range [0.01, 1] and values at the
        0.01 floor are reported as 0, so tiny differences show up as exactly 0.
    """
    original_bert = get_vectors(bert_vectorian, original_text)
    original_word2vec = get_vectors(word2vec_vectorian, original_text)
    my_bert = get_vectors(bert_vectorian, my_text)
    my_word2vec = get_vectors(word2vec_vectorian, my_text)

    # Only positions present in every token list and every vector list can be
    # compared; stopping at the shortest avoids an IndexError when the two
    # texts tokenize to different lengths.
    n_rows = min(
        len(sequence)
        for tokens_and_vectors in (original_bert, original_word2vec, my_bert, my_word2vec)
        for sequence in tokens_and_vectors
    )

    data = []
    for i in range(n_rows):
        bert_sim = cossim(original_bert[1][i], my_bert[1][i])
        word2vec_sim = cossim(original_word2vec[1][i], my_word2vec[1][i])
        data.append((i, original_bert[0][i], my_bert[0][i], bert_sim, word2vec_sim))

    # Build the frame once, after all rows are collected, so it also exists
    # (empty) when there are no rows.
    df = pd.DataFrame(
        data,
        columns=['index', 'original token', 'my token', 'cos diff(bert)', 'cos diff(word2vec)'],
    ).set_index('index')

    for column in ('cos diff(bert)', 'cos diff(word2vec)'):
        # astype(float) keeps the arithmetic numeric even for an empty frame,
        # whose columns would otherwise be object-typed.
        difference = 1 - df[column].astype(float)
        # Clip to [0.01, 1], then map the 0.01 floor to 0 to hide float noise.
        df[column] = difference.clip(0.01, 1).replace(0.01, 0)

    return df

In [4]:
# Reference sentence ("It is very hot indoors today.").
original_text = '今日は室内が大変暑いです。'
# Control: an exact copy of the reference sentence.
same_text = '今日は室内が大変暑いです。'
# Variant sentence ("The room is very cold today.").
my_text = '今日は部屋がとても寒いです。'

same_df = get_df(original_text, same_text)
diff_df = get_df(original_text, my_text)

# A single display call renders both styled tables, control first.
display(same_df.style.bar(), diff_df.style.bar())

Unnamed: 0_level_0,original token,my token,cos diff(bert),cos diff(word2vec)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,▁,▁,0,0
1,今日,今日,0,0
2,は,は,0,0
3,室内,室内,0,0
4,が,が,0,0
5,大変,大変,0,0
6,暑,暑,0,0
7,い,い,0,0
8,です,です,0,0


Unnamed: 0_level_0,original token,my token,cos diff(bert),cos diff(word2vec)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,▁,▁,0.0653358,0.0
1,今日,今日,0.0119917,0.0
2,は,は,0.0401339,0.0
3,室内,部屋,0.0302926,0.610473
4,が,が,0.159882,0.0
5,大変,とても,0.0588447,0.289395
6,暑,寒,0.378446,0.215572
7,い,い,0.194352,0.0
8,です,です,0.0573767,0.0
