In [26]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import nltk

In [27]:
# File names of the two embedding text files; both are read with
# load_embeddings() further down in this notebook.
PARAGRAM_PHRASE_FNAME = "paragram-phrase-XXL.txt"
PARAGRAM_SL999_FNAME = "paragram_300_sl999.txt"

In [28]:
def load_embeddings(fname):
    """Load word embeddings from a space-separated text file.

    Every non-blank line is expected to look like ``token v1 v2 ... vn``:
    the first field is the token, the remaining fields are the vector
    components.

    Parameters
    ----------
    fname : str
        Path of the embeddings file.

    Returns
    -------
    dict
        Maps each token to a 1-D numpy float array built from the remaining
        fields of its line. If a token occurs on several lines, the last
        occurrence wins.

    Raises
    ------
    IOError
        If the file cannot be opened (``OSError`` on Python 3).
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}

    # `with` closes the handle even when a malformed line raises below.
    with open(fname) as embed_file:
        # Iterating the file object streams it line by line, so the whole
        # file is never held in memory as a list of lines.
        for line in embed_file:
            # Drop the line terminator and any trailing blanks so that a
            # stray space before the newline is not parsed as a component.
            stripped = line.rstrip()
            if not stripped:
                # Blank line: nothing to store.
                continue

            # Keep the single-space separator of the file format.
            fields = stripped.split(" ")
            embeddings[fields[0]] = np.array([float(v) for v in fields[1:]])

    return embeddings

In [29]:
# Read both embedding tables from disk, reporting progress after each one.
phrase_embed = load_embeddings(PARAGRAM_PHRASE_FNAME)
print("Phrase embeddings loaded.")

sl999_embed = load_embeddings(PARAGRAM_SL999_FNAME)
print("SL999 embeddings loaded.")

# Ordered list of lookup tables handed to generate_sentence_embedding():
# the phrase table comes first, the SL999 table second.
embedding_set = [phrase_embed, sl999_embed]

Phrase embeddings loaded.
SL999 embeddings loaded.


In [30]:
# key for unknown tokens (in Paragram Phrase XXL)
UNKNOWN_KEY = 'UUUNKKK'
# Vector stored under that key in the phrase table; generate_sentence_embedding()
# uses it for tokens that are found in none of the embedding tables.
UNKNOWN_EMBED = phrase_embed[UNKNOWN_KEY]

In [31]:
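# Idea (currently disabled): exclude the most common English words from
# contributing to the embedding. The list below is commented out and is not
# used by generate_sentence_embedding.
#embed_exclude = ["the", "be", "to", "of", "and", "a", "in"]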

def generate_sentence_embedding(snt, embeddings):
    """Embed a sentence as the mean of its token vectors.

    The sentence is lower-cased and tokenized with ``nltk.word_tokenize``.
    Each token is looked up in the tables of ``embeddings`` in order and the
    first table containing it supplies the vector; a token found in no table
    contributes the module-level ``UNKNOWN_EMBED`` vector instead.

    Parameters
    ----------
    snt : str or unicode
        The sentence. Byte strings are decoded as UTF-8 after lower-casing;
        text that is already decoded is used as is.
    embeddings : sequence of dict
        Lookup tables mapping a token to its numpy vector, in priority order.

    Returns
    -------
    numpy.ndarray
        The sum of the token vectors divided by the number of tokens. For a
        sentence that yields no tokens, a zero vector shaped like
        ``UNKNOWN_EMBED`` is returned (the previous code failed with a
        TypeError in that case).
    """
    p_snt = snt.lower()
    # Only byte strings need decoding. Calling .decode() on text that is
    # already decoded triggers an implicit ASCII encode on Python 2 and does
    # not exist on Python 3.
    if isinstance(p_snt, bytes):
        p_snt = p_snt.decode('utf8')
    tokens = nltk.word_tokenize(p_snt)

    if not tokens:
        # Nothing to average; avoid dividing an undefined sum by zero.
        return np.zeros_like(UNKNOWN_EMBED)

    embed = None
    for tk in tokens:
        # Default to the unknown-token vector, then take the first table hit.
        w_embed = UNKNOWN_EMBED
        for table in embeddings:
            if tk in table:
                w_embed = table[tk]
                break

        if embed is None:
            # Copy once so the in-place additions below never modify a
            # vector stored in the lookup tables.
            embed = np.copy(w_embed)
        else:
            embed += w_embed

    embed /= float(len(tokens))

    return embed

In [32]:
# Sanity check: embed one sample sentence and display the resulting vector.
generate_sentence_embedding("I'm a big banana bread shaped baby.", embedding_set)

array([ -1.38630444e-02,   2.89955111e-02,   8.39412667e-02,
        -1.68017000e-02,   2.83757778e-03,   3.17517000e-02,
        -1.54995212e-01,  -4.64025556e-02,   8.02574222e-02,
         1.11696044e-01,  -2.41144444e-01,   5.39162333e-02,
         1.06982956e-01,  -4.84446556e-03,   9.25152333e-02,
         6.52265811e-02,   1.32761589e-02,   2.15576690e-01,
         5.37402222e-03,   1.41420133e-01,   2.03408333e-01,
        -1.36282856e-01,  -1.25107511e-01,   1.66213556e-02,
         3.34568667e-02,  -2.29350333e-01,  -4.89215333e-02,
         1.78415400e-01,  -5.84089556e-02,  -1.18479511e-01,
        -1.05766189e-01,   2.06150267e-01,   1.18677194e-01,
        -5.15157111e-02,   2.46176356e-01,   2.00301567e-01,
         5.37523000e-02,  -3.87808000e-02,  -5.94594333e-02,
         1.05790311e-01,   1.12133600e-01,   4.43498889e-03,
        -1.53323689e-01,  -1.82321989e-01,   1.23379122e-01,
         6.94702222e-02,  -5.44971333e-02,   4.37944556e-02,
        -3.00498744e-02,

In [33]:
# Data sets to evaluate; each name is appended to the two prefixes below to
# form the gold-score file and the sentence-pair file.
train_data = ['MSRpar.txt', 'MSRvid.txt', 'SMTeuroparl.txt']

score_prefix = "train/STS2012-en-train/STS.gs."
input_prefix = "train/STS2012-en-train/STS.input."

cnt = 0  # not used in this cell; kept in case another cell reads it

# Per data set: list of ((sentence1, sentence2), cosine similarity, gold score).
calculated_scores = {}

for tx in train_data:
    # Read both files completely and close them right away, so no handle is
    # left open while the embeddings are computed or if a later step raises.
    with open(score_prefix + tx) as scr_file:
        scr_content = scr_file.readlines()
    with open(input_prefix + tx) as inp_file:
        inp_content = inp_file.readlines()

    # Every score line needs a sentence-pair line; fail early and clearly
    # instead of part-way through the loop.
    if len(inp_content) < len(scr_content):
        raise ValueError("%s: %d score lines but only %d input lines"
                         % (tx, len(scr_content), len(inp_content)))

    sentences = []
    scores = []
    for scr_line, inp_line in zip(scr_content, inp_content):
        # An input line holds the two sentences separated by a tab.
        snts = [x.replace("\n", "") for x in inp_line.split("\t")]
        sentences.append((snts[0], snts[1]))
        scores.append(float(scr_line))
    scores = np.array(scores)

    calculated_similarity = []
    for (s1, s2) in sentences:
        embed_s1 = generate_sentence_embedding(s1, embedding_set).reshape(1, -1)
        embed_s2 = generate_sentence_embedding(s2, embedding_set).reshape(1, -1)

        # cosine_similarity returns a 1x1 matrix for two single-row inputs;
        # keep only the scalar so the result is a flat vector.
        calculated_similarity.append(cosine_similarity(embed_s1, embed_s2)[0, 0])
    calculated_similarity = np.array(calculated_similarity)

    # Pearson correlation between gold scores and cosine similarities.
    corr, _ = pearsonr(scores, calculated_similarity)
    print("%s %s" % (tx, corr))

    store = list(zip(sentences, calculated_similarity, scores))
    calculated_scores[tx] = store

MSRpar.txt 0.488736937213
MSRvid.txt 0.783698707731
SMTeuroparl.txt 0.658256509218




In [34]:
def compute_scores(inp_fname, out_fname):
    """Score every sentence pair of a file and save the similarities.

    Each line of the input file is split on tabs; the first two fields are
    embedded with ``generate_sentence_embedding`` (using the module-level
    ``embedding_set``) and compared with cosine similarity.

    Parameters
    ----------
    inp_fname : str
        Path of the tab-separated input file, one sentence pair per line.
    out_fname : str
        Path of the file the similarities are written to with
        ``np.savetxt``, in input order.

    Raises
    ------
    IndexError
        If a line has fewer than two tab-separated fields.
    """
    # `with` closes the handle even if reading fails.
    with open(inp_fname) as inp_file:
        inp_content = inp_file.readlines()

    calculated_similarity = []
    for line in inp_content:
        snts = line.split("\t")

        embed_s1 = generate_sentence_embedding(snts[0], embedding_set).reshape(1, -1)
        embed_s2 = generate_sentence_embedding(snts[1], embedding_set).reshape(1, -1)

        # cosine_similarity returns a 1x1 matrix for two single-row inputs;
        # keep only the scalar so the collected result is already flat.
        calculated_similarity.append(cosine_similarity(embed_s1, embed_s2)[0, 0])

    np.savetxt(out_fname, np.array(calculated_similarity))

In [35]:
# (input file, prediction file) pairs, processed in the listed order.
prediction_jobs = [
    ("test/STS2016.input.answer-answer.txt", "answer-answer.predictions"),
    ("test/STS2016.input.headlines.txt", "headlines.predictions"),
    ("test/STS2016.input.plagiarism.txt", "plagiarism.predictions"),
    ("test/STS2016.input.postediting.txt", "postediting.predictions"),
    ("test/STS2016.input.question-question.txt", "question-question.predictions"),
]

for job_input, job_output in prediction_jobs:
    compute_scores(job_input, job_output)

In [36]:
import math
idx = 0

# List the MSRpar pairs whose rescaled similarity (calc * 5.0) is not more
# than 0.1 away from the stored score, printing both values on each scale.
sc = calculated_scores['MSRpar.txt']
for i, (snts, calc, actual) in enumerate(sc):
    if not (abs(actual - (calc * 5.0)) > 0.1):
        print(i)
        print(snts)
        print("%s %s" % (actual / 5.0, actual))
        print("%s %s" % (calc, calc * 5.0))
        print("")

1
('Micron has declared its first quarterly profit for three years.', "Micron's numbers also marked the first quarterly profit in three years for the DRAM manufacturer.")
0.75 3.75
0.76231891576 3.8115945788

7
('Chavez said investigators feel confident they\'ve got "at least one of the fires resolved in that regard."', 'Albuquerque Mayor Martin Chavez said investigators felt confident that with the arrests they had "at least one of the fires resolved."')
0.76 3.8
0.778624907268 3.89312453634

36
('Waksal has pleaded guilty to securities fraud and is to be sentenced next week.', 'Waksal pleaded guilty to insider trading charges last year, and he is scheduled to be sentenced June 10.')
0.7 3.5
0.71121279907 3.55606399535

39
('Boeing said the final agreement is expected to be signed during the next few weeks.', 'The Korean Air deal is expected to be finalized "in the next several weeks," Boeing spokesman Bob Saling said.')
0.6 3.0
0.600725147704 3.00362573852

43
('The weakness exists i