In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr

In [2]:
# load paragram phrase embeddings
embeddings = {}

cnt = 0
with open("paragram-phrase-XXL.txt", "r") as ins:
    for line in ins:
        spl = line.split(" ")
        
        word = spl[0]
        embed = np.array([float(i) for i in spl[1:]])
        embeddings[word] = embed
        
print len(embeddings)

57403


In [3]:
train_data = ['MSRpar.txt', 'MSRvid.txt', 'SMTeuroparl.txt']

score_prefix = "train/STS2012-en-train/STS.gs."
input_prefix = "train/STS2012-en-train/STS.input."

In [4]:
# exclude the top 10 most common English words from contributing
# to the embedding
embed_exclude = ["the", "be", "to", "of", "and", "a", "in"]

# key for unknown tokens
UNKNOWN_KEY = 'UUUNKKK'

def generate_sentence_embedding(snt):
    # replace period, apostrophe, comma tokens
    p_snt = snt.lower()
    p_snt = p_snt.replace(".", " . ")
    p_snt = p_snt.replace(",", "")
    p_snt = p_snt.replace("'", " '")
    
    tokens = p_snt.split(" ")
    
    embed = None
    
    for tk in tokens:
        if (tk in embed_exclude): continue
            
        if (tk in embeddings):
            w_embed = np.copy(embeddings[tk])
        else:
            w_embed = np.copy(embeddings[UNKNOWN_KEY])
        
        if (embed is None):
            embed = np.copy(w_embed)
        else:
            embed += w_embed
    
    embed /= float(len(tokens))
    
    return embed

In [5]:
generate_sentence_embedding("I'm a banana bread.")

array([ 0.08272017, -0.0240963 ,  0.19763781,  0.0127988 , -0.10337241,
        0.0186414 , -0.17947724,  0.00541829,  0.05503789, -0.14970237,
       -0.19154424,  0.08200344,  0.13841786, -0.07519051,  0.08786204,
        0.12373757, -0.02274408,  0.00984109, -0.04086139,  0.1190146 ,
        0.32392957,  0.13972657, -0.03135561, -0.05066251,  0.15584202,
       -0.09838443, -0.05120711,  0.10024286, -0.0698986 , -0.22579137,
        0.01611736,  0.01273414,  0.04818078,  0.06993207,  0.0625086 ,
        0.22322516,  0.04402224, -0.04637233, -0.02190523,  0.26198261,
        0.11384891,  0.00663643, -0.09369043, -0.25205019,  0.10511761,
        0.21353314,  0.01177551,  0.11308824, -0.06628112,  0.00731351,
        0.01387944,  0.01134849,  0.17159253,  0.1230085 ,  0.20220729,
       -0.09297496, -0.04773596,  0.20767896,  0.01554781, -0.13748374,
        0.08533771,  0.08392154,  0.28348239,  0.12206729,  0.01077601,
       -0.23239174,  0.11436821, -0.04515459, -0.16799414,  0.12

In [7]:
cnt = 0

for tx in train_data:
    scr_file = open(score_prefix + tx)
    inp_file = open(input_prefix + tx)
    
    scr_content = scr_file.readlines()
    inp_content = inp_file.readlines()
    
    sentences = []
    scores = []
    
    for i in xrange(0, len(scr_content)):
        snts = inp_content[i].split("\t")
        sentences.append((snts[0], snts[1]))
        
        scores.append(float(scr_content[i]))
    
    scores = np.array(scores)
    
    calculated_similarity = []
    for (s1, s2) in sentences:
        embed_s1 = generate_sentence_embedding(s1)
        embed_s2 = generate_sentence_embedding(s2)
        
        embed_s1 = embed_s1.reshape(1, -1)
        embed_s2 = embed_s2.reshape(1, -1)
        
        calculated_similarity.append(cosine_similarity(embed_s1, embed_s2))
    
    scores = np.array(scores)
    calculated_similarity = np.array(calculated_similarity)
    calculated_similarity = calculated_similarity.reshape(scores.shape)
    
    corr, _ = pearsonr(scores, calculated_similarity)
    print tx, corr
    
    scr_file.close()
    inp_file.close()

MSRpar.txt 0.496389136129
MSRvid.txt 0.795938846204
SMTeuroparl.txt 0.642742130816
