# Textual similarity

In [1]:
from gensim.models import Word2Vec

model_path = "/home/datasets/datasets1/word2vec-embeddings/GoogleNews-vectors-negative300.bin.gz"
model_word2vec = Word2Vec.load_word2vec_format(model_path, binary=True)

Using gpu device 0: Graphics Device (CNMeM is disabled, CuDNN 3007)


In [68]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from scipy.stats import pearsonr

In [10]:
import glob
datasets_file = sorted(glob.glob("dataset_texsim/*.input.*"))
gold_standard_file = sorted(glob.glob("dataset_texsim/*.gs.*"))

In [75]:
def read_dataset(filename_input, gs_file):
    with open(filename_input, "r") as f:
        input_pair = [x.split("\t") for x in f.read().splitlines()]
    with open(gs_file, "r") as f:
        input_gs = [float(x) for x in f.read().splitlines()]
    return input_pair, input_gs

In [63]:
def text_sim_monge(sen1, sen2, p, lex_sim_fun):
    #tokenizing here
    tokenizer = RegexpTokenizer(r'\w+')
    words_sen1 = tokenizer.tokenize(sen1)
    words_sen2 = tokenizer.tokenize(sen2)
    word_sim = []
    for word1 in words_sen1:
        word_sim.append([lex_sim_fun(model_word2vec, word1, word2) for word2 in words_sen2])
    word_sim = np.array(word_sim)
    return np.sum([np.power(np.max(x), p) for x in word_sim])/len(words_sen1)

In [101]:
def lex_sim_w2v(model, word1, word2):
    #model has to be gensim's model
    if (word1 in model_word2vec.vocab) and (word2 in model_word2vec.vocab):
        return model_word2vec.similarity(word1, word2)
    else:
        return 0.

In [76]:
pairs, gs = read_dataset(datasets_file[1], gold_standard_file[1])

In [57]:
word_sim = []
for word1 in tokenizer.tokenize(pairs[1][0]):
    word_sim.append([lex_sim_w2v(model_word2vec, word1, word2) for word2 in tokenizer.tokenize(pairs[1][1])])
word_sim = np.array(word_sim)

In [58]:
word_sim

array([[ 0.68358763,  0.04238769,  0.07125915,  0.19398442,  0.0467174 ,
         0.09638663],
       [ 0.06270984,  0.32290686,  1.        ,  0.04031163,  0.0612379 ,
         0.0902412 ],
       [ 0.2028677 ,  0.01897395,  0.04031163,  1.        ,  0.09532947,
         0.05088437],
       [ 0.02006767,  0.04898579,  0.0612379 ,  0.09532947,  1.        ,
         0.13625418],
       [-0.0399349 ,  0.10082886,  0.0902412 ,  0.05088437,  0.13625418,
         1.        ]])

In [61]:
np.max(word_sim, axis=1)

array([ 0.68358763,  1.        ,  1.        ,  1.        ,  1.        ])

In [95]:
for dataset, gs_file in zip(datasets_file, gold_standard_file):
    pairs, gs = read_dataset(dataset, gs_file)
    predictions = [text_sim_monge(sen1, sen2, 1, lex_sim_w2v) for sen1, sen2 in pairs]
    pearson = pearsonr(gs, predictions)[0]
    print(dataset, len(pairs), pearson)

('dataset_texsim/2012.input.MSRpar.txt', 750, 0.43385402711875631)
('dataset_texsim/2012.input.MSRvid.txt', 750, 0.42962853892838276)
('dataset_texsim/2012.input.OnWN.txt', 750, 0.52360912174631069)
('dataset_texsim/2012.input.SMTeuroparl.txt', 459, 0.38817345996370839)
('dataset_texsim/2012.input.SMTnews.txt', 399, 0.45451554573392566)
('dataset_texsim/2013.input.FNWN.txt', 189, 0.34788248671992644)
('dataset_texsim/2013.input.OnWN.txt', 561, 0.47859521829926593)
('dataset_texsim/2013.input.SMT.txt', 750, nan)
('dataset_texsim/2013.input.headlines.txt', 750, 0.58882688783521342)
('dataset_texsim/2014.input.OnWN.txt', 750, 0.5769935586146)
('dataset_texsim/2014.input.deft-forum.txt', 450, 0.3530931258566975)
('dataset_texsim/2014.input.deft-news.txt', 300, 0.50566661482529607)
('dataset_texsim/2014.input.headlines.txt', 750, 0.57220288208569359)
('dataset_texsim/2014.input.images.txt', 750, 0.54444668618352487)
('dataset_texsim/2014.input.tweet-news.txt', 750, 0.68361677643476682)
('da

In [96]:
for dataset, gs_file in zip(datasets_file, gold_standard_file):
    pairs, gs = read_dataset(dataset, gs_file)
    predictions = [text_sim_monge(sen1, sen2, 2, lex_sim_w2v) for sen1, sen2 in pairs]
    pearson = pearsonr(gs, predictions)[0]
    print(dataset, len(pairs), pearson)

('dataset_texsim/2012.input.MSRpar.txt', 750, 0.43882435929567665)
('dataset_texsim/2012.input.MSRvid.txt', 750, 0.410555272976737)
('dataset_texsim/2012.input.OnWN.txt', 750, 0.5489341680335138)
('dataset_texsim/2012.input.SMTeuroparl.txt', 459, 0.42874884686381221)
('dataset_texsim/2012.input.SMTnews.txt', 399, 0.44691726842112972)
('dataset_texsim/2013.input.FNWN.txt', 189, 0.32872815806656769)
('dataset_texsim/2013.input.OnWN.txt', 561, 0.47554178204642367)
('dataset_texsim/2013.input.SMT.txt', 750, nan)
('dataset_texsim/2013.input.headlines.txt', 750, 0.58759594159991657)
('dataset_texsim/2014.input.OnWN.txt', 750, 0.57854622239888143)
('dataset_texsim/2014.input.deft-forum.txt', 450, 0.38361484062488593)
('dataset_texsim/2014.input.deft-news.txt', 300, 0.52682536680896697)
('dataset_texsim/2014.input.headlines.txt', 750, 0.56352898025159037)
('dataset_texsim/2014.input.images.txt', 750, 0.56453560076683906)
('dataset_texsim/2014.input.tweet-news.txt', 750, 0.69101209430436594)
('

In [97]:
pairs, gs = read_dataset(datasets_file[7], gold_standard_file[7])

In [100]:
print(datasets_file[7])

dataset_texsim/2013.input.SMT.txt


In [102]:
predictions = [text_sim_monge(sen1, sen2, 1, lex_sim_w2v) for sen1, sen2 in pairs]

In [99]:
predictions

[0.58943237054898345,
 0.58011762621832597,
 0.59917285183169122,
 0.58721392656339944,
 0.52732752380663561,
 0.79622809019442664,
 0.7487018586240628,
 0.72811581018397908,
 0.55452763110386549,
 0.57320228291263375,
 0.77757053531232667,
 0.52260266686092194,
 0.37993436622026172,
 0.36370340390152806,
 0.5320864138882847,
 0.60373193877978237,
 0.52761665306261762,
 0.62529684007609454,
 0.68641834583428063,
 0.48735298963869611,
 0.94901290276161354,
 0.43840244454876193,
 0.55306041376156223,
 0.62999585904785282,
 0.98522329158102473,
 0.57164706933350051,
 0.56165512725574041,
 0.48099564533249423,
 0.52334451174703056,
 0.60145744238910825,
 0.44893259707233779,
 0.57508329575033823,
 0.53006768799715709,
 0.59285935723509497,
 0.72367731552933867,
 0.49102386692168759,
 0.89285119716825001,
 0.76860661474341274,
 0.59643969045425616,
 0.29906058352935333,
 0.34081627981141283,
 0.52676575753491006,
 0.5389769277459816,
 0.63132544280446734,
 0.47046017899923931,
 0.4905272735

In [103]:
pearsonr(gs, predictions)

(nan, 1.0)

In [106]:
np.isnan(predictions)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [109]:
for i in range(len(predictions)):
    if np.isnan(predictions[i]):
        print(i)

419


In [110]:
pairs[419]

['. ', '. ']

In [111]:
gs[419]

5.0