In [1]:
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import nltk
import numpy as np
from sklearn.decomposition import TruncatedSVD
import tensorflow as tf
import tensorflow_hub as hub
import torch
from models import BLSTMEncoder

  from ._conv import register_converters as _register_converters


# Word2Vec - Baseline

In [2]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('packages/word2vec/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Loading Glove
model = gensim.models.KeyedVectors.load_word2vec_format('packages/glove/glove.840B.300d.txt')

In [4]:
STOP = set(nltk.corpus.stopwords.words("english"))

class Sentence:
    
    def __init__(self, sentence):
        self.raw = sentence
        normalized_sentence = sentence.replace("‘", "'").replace("’", "'")
        self.tokens = [t.lower() for t in nltk.word_tokenize(normalized_sentence)]
        self.tokens_without_stop = [t for t in self.tokens if t not in STOP]

In [5]:
import csv

PATH_TO_FREQUENCIES_FILE = "packages/frequencies.tsv"
PATH_TO_DOC_FREQUENCIES_FILE = "packages/doc_frequencies.tsv"

def read_tsv(f):
    frequencies = {}
    with open(f) as tsv:
        tsv_reader = csv.reader(tsv, delimiter="\t")
        for row in tsv_reader: 
            frequencies[row[0]] = int(row[1])
        
    return frequencies
        
frequencies = read_tsv(PATH_TO_FREQUENCIES_FILE)
doc_frequencies = read_tsv(PATH_TO_DOC_FREQUENCIES_FILE)
doc_frequencies["NUM_DOCS"] = 1288431

In [6]:
def run_avg_benchmark(sentences1, sentences2, model=None, use_stoplist=False, doc_freqs=None): 

    if doc_freqs is not None:
        N = doc_freqs["NUM_DOCS"]
    
    sims = []
    for (sent1, sent2) in zip(sentences1, sentences2):
    
        tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
        tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens

        tokens1 = [token for token in tokens1 if token in model]
        tokens2 = [token for token in tokens2 if token in model]
        
        if len(tokens1) == 0 or len(tokens2) == 0:
            sims.append(0)
            continue
        
        tokfreqs1 = Counter(tokens1)
        tokfreqs2 = Counter(tokens2)
        
        weights1 = [tokfreqs1[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs1] if doc_freqs else None
        weights2 = [tokfreqs2[token] * math.log(N/(doc_freqs.get(token, 0)+1)) 
                    for token in tokfreqs2] if doc_freqs else None
                
        embedding1 = np.average([model[token] for token in tokfreqs1], axis=0, weights=weights1).reshape(1, -1)
        embedding2 = np.average([model[token] for token in tokfreqs2], axis=0, weights=weights2).reshape(1, -1)

        sim = cosine_similarity(embedding1, embedding2)[0][0]
        sims.append(sim)

    return sims

In [7]:
sentence1 = [""]

In [8]:
sentence2 = [""]

In [9]:
#sentence1 = [str(sent.encode('utf8')) for sent in sentence1]
#sentence2 = [str(sent.encode('utf8')) for sent in sentence2]

In [10]:
sent1 = [Sentence(s) for s in sentence1]
sent2 = [Sentence(s) for s in sentence2]

In [11]:
run_avg_benchmark(sent1,sent2,model,False,doc_frequencies)

[0.9606351431025658,
 0.9444375114173821,
 0.8595259437957676,
 0.8522309476752151,
 0.589845285130551,
 0.9030021298570485,
 0.7715324444714292,
 0.7918179021785309,
 0.7713043761163555,
 0.483166788785355,
 0.4794985889836423,
 0.16011943896731723,
 0.8523399716377958,
 0.8469120881365616,
 0.5263579924165382,
 0.8418263004708006,
 0.7995137820521838,
 0.7919734155375269,
 0.2644288998649582]

# Word2Vec - Smooth Inverse Frequency

In [12]:
def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_
    XX = X - X.dot(pc.transpose()) * pc
    return XX


def run_sif_benchmark(sentences1, sentences2, model, freqs={}, use_stoplist=False, a=0.001): 
    total_freq = sum(freqs.values())
    
    embeddings = []
    
    # SIF requires us to first collect all sentence embeddings and then perform 
    # common component analysis.
    for (sent1, sent2) in zip(sentences1, sentences2): 
        
        tokens1 = sent1.tokens_without_stop if use_stoplist else sent1.tokens
        tokens2 = sent2.tokens_without_stop if use_stoplist else sent2.tokens
        
        tokens1 = [token for token in tokens1 if token in model]
        tokens2 = [token for token in tokens2 if token in model]
        
        weights1 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens1]
        weights2 = [a/(a+freqs.get(token,0)/total_freq) for token in tokens2]
        
        embedding1 = np.average([model[token] for token in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([model[token] for token in tokens2], axis=0, weights=weights2)
        
        embeddings.append(embedding1)
        embeddings.append(embedding2)
        
    embeddings = remove_first_principal_component(np.array(embeddings))
    sims = [cosine_similarity(embeddings[idx*2].reshape(1, -1), 
                              embeddings[idx*2+1].reshape(1, -1))[0][0] 
            for idx in range(int(len(embeddings)/2))]

    return sims

In [13]:
run_sif_benchmark(sent1,sent2,model,doc_frequencies,False,0.001)

[0.9251584579692678,
 0.9146054914274682,
 0.6367094461293079,
 0.6392371656639659,
 0.022145205178109983,
 0.8088041624517824,
 0.5774890236283197,
 0.1920933201008883,
 0.3974954813805184,
 0.3968340615284116,
 0.27284369853604884,
 -0.11301698134231103,
 0.6849786778983309,
 0.6096601629859936,
 0.22084516669357337,
 0.7123838488959815,
 0.5017663014115277,
 0.6210849004338944,
 0.1911594244651889]

# Google sentence Encoder

In [12]:
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

INFO:tensorflow:Using C:\Users\rshars\AppData\Local\Temp\tfhub_modules to cache modules.


## Method1

In [13]:
# list1 will have the list of documents
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(embed(sentence1))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [14]:
# list2 will have the list of documents
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings1 = session.run(embed(sentence2))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [15]:
sims = []
for i in range(0,len(message_embeddings)):
    sims.append(cosine_similarity(message_embeddings[i].reshape(1, -1),message_embeddings1[i].reshape(1, -1))[0][0])

In [16]:
sims

[0.9472048,
 0.9085176,
 0.77995145,
 0.7688799,
 0.4001631,
 0.91555846,
 0.8617412,
 0.54845995,
 0.8196933,
 0.7820327,
 0.73287976,
 0.5415521,
 0.72583914,
 0.8032801,
 0.71069163,
 0.77885765,
 0.78309333,
 0.71182215,
 0.45662346]

## Method2

In [17]:
def run_gse_benchmark(sentences1, sentences2):
    sts_input1 = tf.placeholder(tf.string, shape=(None))
    sts_input2 = tf.placeholder(tf.string, shape=(None))

    sts_encode1 = tf.nn.l2_normalize(embed(sts_input1))
    sts_encode2 = tf.nn.l2_normalize(embed(sts_input2))
        
    sim_scores = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
    
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
      
        [gse_sims] = session.run(
            [sim_scores],
            feed_dict={
                sts_input1: [sent1.raw for sent1 in sentences1],
                sts_input2: [sent2.raw for sent2 in sentences2]
            })
    return gse_sims

In [None]:
run_gse_benchmark(sent1,sent2)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
