In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import nltk

In [2]:
# File names of the two embedding text files parsed by load_embeddings().
# They are relative paths, so they resolve against the notebook's working directory.
PARAGRAM_PHRASE_FNAME = "paragram-phrase-XXL.txt"
PARAGRAM_SL999_FNAME = "paragram_300_sl999.txt"

In [3]:
def load_embeddings(fname):
    """Parse a space-separated embedding text file into a dict.

    Each non-blank line is expected to look like ``<word> <v1> <v2> ... <vn>``.

    Parameters
    ----------
    fname : str
        Path of the embedding file to read.

    Returns
    -------
    dict
        Maps each word (the first field of a line) to a 1-D float
        ``numpy`` array built from the remaining fields. If a word occurs
        more than once, the last occurrence wins.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}

    # `with` guarantees the handle is closed even if parsing raises part-way.
    with open(fname) as fh:
        # Iterating the file object streams it line by line, so the whole
        # file is never held in memory at once.
        for line in fh:
            spl = line.rstrip("\r\n").split(" ")

            word = spl[0]
            # Drop empty fields produced by trailing or doubled spaces; they
            # would otherwise make float() fail.
            values = [v for v in spl[1:] if v]
            if not values:
                # Blank line or a word without a vector: nothing usable.
                continue

            embeddings[word] = np.array([float(v) for v in values])

    return embeddings

In [4]:
# Parse both embedding files up front and report progress after each one.
phrase_embed = load_embeddings(PARAGRAM_PHRASE_FNAME)
print("Phrase embeddings loaded.")
sl999_embed = load_embeddings(PARAGRAM_SL999_FNAME)
print("SL999 embeddings loaded.")
# Order matters: generate_sentence_embedding() takes a token's vector from
# the first table in this list that contains it.
embedding_set = [phrase_embed, sl999_embed]

Phrase embeddings loaded.
SL999 embeddings loaded.


In [11]:
# key for unknown tokens (in Paragram Phrase XXL)
UNKNOWN_KEY = 'UUUNKKK'
# Fallback vector that generate_sentence_embedding() uses for any token found
# in none of the embedding tables. This lookup raises KeyError if the phrase
# embeddings do not contain UNKNOWN_KEY.
UNKNOWN_EMBED = phrase_embed[UNKNOWN_KEY]

In [12]:
# NOTE: excluding the most common English words from the embedding is
# currently disabled -- the list below is commented out and is not used.
#embed_exclude = ["the", "be", "to", "of", "and", "a", "in"]

def generate_sentence_embedding(snt, embeddings):
    """Embed a sentence as the mean of its token vectors.

    The sentence is lower-cased and split with ``nltk.wordpunct_tokenize``.
    Each token is looked up in the tables of ``embeddings`` in order; the
    first table containing the token supplies its vector. Tokens found in
    no table fall back to ``UNKNOWN_EMBED``.

    Parameters
    ----------
    snt : str
        The sentence to embed.
    embeddings : sequence of dict
        Word -> vector tables, highest priority first.

    Returns
    -------
    numpy.ndarray
        The element-wise average of the token vectors. If the sentence
        yields no tokens (empty or whitespace-only input), a zero vector
        with the shape of ``UNKNOWN_EMBED`` is returned instead of failing.
    """
    tokens = nltk.wordpunct_tokenize(snt.lower())

    if not tokens:
        # Nothing to average; previously this path crashed on `None /= 0.0`.
        return np.zeros_like(UNKNOWN_EMBED)

    total = None
    for tk in tokens:
        vec = UNKNOWN_EMBED
        for table in embeddings:
            if tk in table:
                vec = table[tk]
                break

        if total is None:
            # Copy once so the in-place accumulation below never mutates a
            # vector stored in the embedding tables (or UNKNOWN_EMBED).
            total = np.copy(vec)
        else:
            total += vec

    total /= float(len(tokens))

    return total

In [13]:
# Quick manual check: embed one sentence using both embedding tables.
generate_sentence_embedding("I'm a big banana bread shaped baby.", embedding_set)

array([ 0.02145891,  0.01980786,  0.01566592,  0.03454341,  0.05284471,
       -0.03635827, -0.12828479, -0.01929458,  0.04937593,  0.14469414,
       -0.19015587,  0.05935121,  0.13937566, -0.02097366,  0.0767659 ,
        0.09187943,  0.02417306,  0.27704952, -0.07365263,  0.17273723,
        0.1721963 , -0.06514027, -0.1115499 ,  0.00271728,  0.04272548,
       -0.2770561 , -0.09489096,  0.25701236,  0.03370794, -0.1114329 ,
       -0.02391095,  0.19763558,  0.07582888, -0.095224  ,  0.17163322,
        0.1001437 ,  0.05222531, -0.02610413, -0.10390669,  0.05251578,
        0.16708791, -0.02831472, -0.20408081, -0.16155014,  0.09230184,
        0.0071894 , -0.07491042,  0.05629261, -0.01113949, -0.06069164,
       -0.08809049,  0.0565194 , -0.05463463,  0.15845133,  0.22879134,
       -0.03235061, -0.09324567,  0.1945562 ,  0.05712636, -0.11348776,
        0.02825785,  0.19870934,  0.16976406,  0.0916146 , -0.07538186,
       -0.20267189,  0.2206958 , -0.18111157, -0.20988867,  0.06

In [14]:
def compute_scores(inp_fname, out_fname):
    """Score every sentence pair in a file and save the similarities.

    Each line of the input is split on tab characters; the first two fields
    are taken as the sentence pair (any further fields are ignored). Both
    sentences are embedded with ``generate_sentence_embedding`` using the
    module-level ``embedding_set``, and their cosine similarity is recorded.

    Parameters
    ----------
    inp_fname : str
        Path of the tab-separated input file.
    out_fname : str
        Path the similarities are written to with ``np.savetxt``, in input
        order. The values are raw cosine similarities (not rescaled).

    Raises
    ------
    IndexError
        If a line contains no tab, i.e. has fewer than two fields.
    """
    # Read everything first; `with` closes the handle even if reading fails.
    with open(inp_fname) as inp_file:
        inp_content = inp_file.readlines()

    calculated_similarity = []
    for line in inp_content:
        snts = line.split("\t")

        # cosine_similarity expects 2-D inputs, hence the (1, n) reshape.
        embed_s1 = generate_sentence_embedding(snts[0], embedding_set).reshape(1, -1)
        embed_s2 = generate_sentence_embedding(snts[1], embedding_set).reshape(1, -1)

        # The result is a 1x1 matrix; keep just the scalar.
        calculated_similarity.append(cosine_similarity(embed_s1, embed_s2)[0, 0])

    np.savetxt(out_fname, np.array(calculated_similarity))

In [15]:
# (input file, prediction file) for each STS 2016 test category, processed in order.
sts2016_jobs = [
    ("test/STS2016.input.answer-answer.txt", "answer-answer.predictions"),
    ("test/STS2016.input.headlines.txt", "headlines.predictions"),
    ("test/STS2016.input.plagiarism.txt", "plagiarism.predictions"),
    ("test/STS2016.input.postediting.txt", "postediting.predictions"),
    ("test/STS2016.input.question-question.txt", "question-question.predictions"),
]

for sts_input, sts_output in sts2016_jobs:
    compute_scores(sts_input, sts_output)

In [84]:
# Evaluate the averaged-embedding similarity on the STS 2012 training files:
# for each file, compare the predicted cosine similarity with the gold score
# and report their Pearson correlation.
train_data = ['MSRpar.txt', 'MSRvid.txt', 'SMTeuroparl.txt']

score_prefix = "train/STS2012-en-train/STS.gs."
input_prefix = "train/STS2012-en-train/STS.input."

cnt = 0  # not used anywhere in the visible cells

# file name -> list of ((sentence1, sentence2), predicted cosine, gold score)
calculated_scores = {}

for tx in train_data:
    # `with` closes each file even if a later step in this iteration raises.
    with open(score_prefix + tx) as scr_file:
        scr_content = scr_file.readlines()
    with open(input_prefix + tx) as inp_file:
        inp_content = inp_file.readlines()

    sentences = []
    scores = []

    # Line i of the gold-score file belongs to line i of the input file.
    for i, gold in enumerate(scr_content):
        snts = [x.replace("\n", "") for x in inp_content[i].split("\t")]
        sentences.append((snts[0], snts[1]))
        scores.append(float(gold))

    scores = np.array(scores)

    calculated_similarity = []
    for (s1, s2) in sentences:
        # cosine_similarity expects 2-D inputs, hence the (1, n) reshape.
        embed_s1 = generate_sentence_embedding(s1, embedding_set).reshape(1, -1)
        embed_s2 = generate_sentence_embedding(s2, embedding_set).reshape(1, -1)

        calculated_similarity.append(cosine_similarity(embed_s1, embed_s2))

    # Each entry is a 1x1 matrix; flatten to match the gold-score vector.
    calculated_similarity = np.array(calculated_similarity).reshape(scores.shape)

    # Only the correlation coefficient is needed; indexing avoids assigning
    # to `_`, which IPython uses for the last cell output.
    corr = pearsonr(scores, calculated_similarity)[0]
    print("%s %s" % (tx, corr))

    store = list(zip(sentences, calculated_similarity, scores))
    calculated_scores[tx] = store

MSRpar.txt 0.439731780795
MSRvid.txt 0.783034963789
SMTeuroparl.txt 0.652135418538


In [None]:
import math
idx = 0

# List the MSRpar pairs whose prediction, rescaled by 5.0, is within 0.1 of
# the gold score. For each one print: its index, the sentence pair, the gold
# score (divided by 5.0 and raw), and the prediction (raw and times 5.0).
sc = calculated_scores['MSRpar.txt']
for i, (snts, calc, actual) in enumerate(sc):
    # Written as a negated `>` so that a NaN difference is still printed.
    if not (abs(actual - (calc * 5.0)) > 0.1):
        print(i)
        print(snts)
        print("%s %s" % (actual / 5.0, actual))
        print("%s %s" % (calc, calc * 5.0))
        print("")