In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import nltk
import pickle



In [3]:
# File names of the two Paragram embedding text files. Both are relative
# paths and are passed to load_embeddings() when the tables are read.
PARAGRAM_PHRASE_FNAME = "paragram-phrase-XXL.txt"
PARAGRAM_SL999_FNAME = "paragram_300_sl999.txt"

In [46]:
def load_embeddings(fname):
    """Read a space-delimited word-embedding text file into a dict.

    Each non-blank line is expected to hold a token followed by its
    vector components, separated by single spaces.

    Args:
        fname: path of the embedding file to read.

    Returns:
        dict mapping each token (the first field of a line) to a 1-D
        numpy float array built from the remaining fields. If a token
        appears on several lines, the last occurrence wins.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings = {}

    # `with` releases the handle even when a line fails to parse, and
    # iterating the handle streams the file instead of batching it.
    with open(fname) as handle:
        for line in handle:
            # Strip the line terminator and trailing padding first;
            # otherwise a trailing space leaves a "\n" field that float()
            # rejects, and a token-only line keeps "\n" inside its key.
            fields = line.rstrip().split(" ")
            if fields == [""]:
                # Blank line: nothing to record.
                continue

            word = fields[0]
            embeddings[word] = np.array([float(v) for v in fields[1:]])

    return embeddings

In [47]:
# Read both Paragram tables into memory, reporting progress after each.
phrase_embed = load_embeddings(PARAGRAM_PHRASE_FNAME)
print("Phrase embeddings loaded.")

sl999_embed = load_embeddings(PARAGRAM_SL999_FNAME)
print("SL999 embeddings loaded.")

# Order matters: token lookup walks this list and stops at the first
# table containing the token, so the phrase table takes precedence.
embedding_set = [phrase_embed, sl999_embed]

Phrase embeddings loaded.
SL999 embeddings loaded.


In [48]:
# key for unknown tokens (in Paragram Phrase XXL)
# Its vector is the fallback used by sentence_to_embedding_space() for any
# token that is found in none of the embedding tables.
UNKNOWN_KEY = 'UUUNKKK'
# Raises KeyError here if the phrase table has no entry for UNKNOWN_KEY.
UNKNOWN_EMBED = phrase_embed[UNKNOWN_KEY]

In [49]:
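# NOTE: stop-word exclusion is currently disabled. The list below was a
# candidate set of very common English words to keep from contributing
# to the embedding; no code in this cell references it.
#embed_exclude = ["the", "be", "to", "of", "and", "a", "in"]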

# turns sentence into DxN matrix (one column per token)
def sentence_to_embedding_space(snt, embeddings):
    """Map a sentence to a matrix with one embedding column per token.

    The sentence is lower-cased and split with nltk.wordpunct_tokenize.
    Each token is looked up in the tables of `embeddings` in order; the
    first table that contains it supplies the vector. A token found in
    no table falls back to the module-level UNKNOWN_EMBED.

    Args:
        snt: the sentence string.
        embeddings: sequence of dict-like tables mapping token -> vector,
            in lookup priority order.

    Returns:
        A 2-D numpy array of shape (D, N), where N is the number of
        tokens and D the vector length, or None when the sentence yields
        no tokens. The result never aliases the vectors stored in the
        tables.
    """
    tokens = nltk.wordpunct_tokenize(snt.lower())

    columns = []
    for tk in tokens:
        vec = UNKNOWN_EMBED
        for table in embeddings:
            if tk in table:
                vec = table[tk]
                break
        # Shape each vector as a (D, 1) column so the columns can be
        # joined side by side; -1 avoids hard-coding the dimension.
        columns.append(np.reshape(vec, (-1, 1)))

    if not columns:
        # No tokens (e.g. an empty string): keep the historical None result
        # so existing consumers of the output see no change.
        return None

    # Join once at the end. Calling np.append per token re-copied the
    # whole matrix each iteration (quadratic in sentence length).
    # np.concatenate always allocates a new array, so no explicit copy
    # of the table vectors is needed.
    return np.concatenate(columns, axis=1)

In [50]:
train_files = ['MSRpar.txt', 'MSRvid.txt', 'SMTeuroparl.txt']

score_prefix = "train/STS2012-en-train/STS.gs."
input_prefix = "train/STS2012-en-train/STS.input."

# For every training corpus, pair each input line with the score on the
# same line of the score file and pickle the result as a list of
# (sentence_a, sentence_b, embedding_a, embedding_b, score) tuples.
for tx in train_files:
    # Context managers close the inputs even if reading raises.
    with open(score_prefix + tx) as scr_file:
        scr_content = scr_file.readlines()
    with open(input_prefix + tx) as inp_file:
        inp_content = inp_file.readlines()

    train_data = []

    # The score file drives the iteration; input lines are matched by index.
    for i, score_line in enumerate(scr_content):
        # Each input line holds tab-separated sentences; drop the newline.
        snts = inp_content[i].split("\t")
        snts = [x.replace("\n", "") for x in snts]

        t = (snts[0], snts[1],
             sentence_to_embedding_space(snts[0], embedding_set),
             sentence_to_embedding_space(snts[1], embedding_set),
             float(score_line))
        train_data.append(t)

    # Closing the output explicitly guarantees the pickle is flushed to
    # disk; previously the handle was left open after the dump.
    with open("preprocessed_feats/" + tx + '.features_pkl', 'wb') as output:
        pickle.dump(train_data, output)

In [51]:
test_files = ['answer-answer.txt', 'headlines.txt', 'plagiarism.txt',
              'postediting.txt', 'question-question.txt']

input_prefix = "test/STS2016.input."

# For every test corpus, embed both sentences of each input line and
# pickle the result as a list of
# (sentence_a, sentence_b, embedding_a, embedding_b, -1.0) tuples.
for tx in test_files:
    # Context manager closes the input even if reading raises.
    with open(input_prefix + tx) as inp_file:
        inp_content = inp_file.readlines()

    test_data = []

    for line in inp_content:
        # Each input line holds tab-separated sentences; drop the newline.
        snts = line.split("\t")
        snts = [x.replace("\n", "") for x in snts]

        # -1.0 fills the score slot; no score file is read for test data.
        t = (snts[0], snts[1],
             sentence_to_embedding_space(snts[0], embedding_set),
             sentence_to_embedding_space(snts[1], embedding_set),
             -1.0)
        test_data.append(t)

    # Closing the output explicitly guarantees the pickle is flushed to
    # disk; previously the handle was left open after the dump.
    with open("preprocessed_feats/" + tx + '.features_pkl', 'wb') as output:
        pickle.dump(test_data, output)