In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import nltk
import sys
import os

# requires monolingual word aligner
# The aligner is imported while the process is sitting inside its checkout.
# NOTE(review): presumably this is needed both for `aligner` to be importable
# and for any relative resource paths it uses at import time -- confirm.
# The checkout location can be overridden without editing the notebook by
# setting MONOLINGUAL_WORD_ALIGNER_DIR; the original path stays the default.
aligner_dir = os.environ.get('MONOLINGUAL_WORD_ALIGNER_DIR',
                             '/home/sri/Desktop/monolingual-word-aligner')
old_path = os.getcwd()
os.chdir(aligner_dir)
try:
    from aligner import *
finally:
    # Always return to the notebook's directory, even when the import fails;
    # otherwise every later relative data path would resolve inside the
    # aligner checkout.
    os.chdir(old_path)

In [2]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer built from the regular expression \w+ (runs of word characters).
# NOTE(review): none of the cells shown in this section reference `tokenizer`
# (sentence_pairs_to_scores uses nltk.wordpunct_tokenize instead) -- confirm
# whether a later cell still needs it.
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
def sentence_pairs_to_scores(snts, verbose = False):
    """Score sentence pairs by the share of their words the aligner links up.

    For every pair, both sentences are reduced to ASCII, tokenized with
    ``nltk.wordpunct_tokenize`` and passed to ``align``.  The score is::

        len(align(tokens1, tokens2)[0]) / min(len(tokens1), len(tokens2))

    Parameters
    ----------
    snts : sequence of (s1, s2)
        Sentence pairs.  Each sentence must provide ``.decode`` (i.e. be a
        byte string); non-ASCII characters are dropped before tokenizing.
    verbose : bool, optional
        When true, a ``current / total`` progress counter is rewritten in
        place on stdout after each pair (this needs ``len(snts)``).

    Returns
    -------
    numpy.ndarray
        One score per input pair, in input order.

    Notes
    -----
    If alignment or the division fails for a pair (for example when one
    sentence has no tokens, which makes the denominator zero) the pair gets
    the fallback score 0.5 instead of aborting the whole run.
    """
    calculated_similarity = []

    for position, (s1, s2) in enumerate(snts, 1):
        tkn_s1 = nltk.wordpunct_tokenize(s1.decode('unicode_escape').encode('ascii','ignore'))
        tkn_s2 = nltk.wordpunct_tokenize(s2.decode('unicode_escape').encode('ascii','ignore'))

        try:
            alignments = align(tkn_s1, tkn_s2)
            # NOTE(review): assumes the first element of align()'s result is
            # the collection of aligned word pairs -- confirm with the aligner.
            num_aligned = len(alignments[0])
            num_total = min(len(tkn_s1), len(tkn_s2))

            score = float(num_aligned) / float(num_total)
        except Exception:
            # Best-effort fallback for a single bad pair.  Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit through, so a long run can still be stopped.
            score = 0.5

        calculated_similarity.append(score)

        if verbose:
            # Carriage return rewrites the same console line; flush so the
            # counter is visible while the loop is still running.
            sys.stdout.write("\r%d / %d" % (position, len(snts)))
            sys.stdout.flush()

    return np.array(calculated_similarity)




In [9]:
def compute_scores(inp_fname, out_fname, verbose = False):
    """Score every sentence pair in a tab-separated file and save the scores.

    Parameters
    ----------
    inp_fname : str
        Path of the input file.  Each line must hold at least two
        tab-separated fields; the first two are taken as the sentence pair
        and any further fields are ignored.
    out_fname : str
        Path the scores are written to with ``np.savetxt`` (one per line,
        in input order).
    verbose : bool, optional
        Forwarded to ``sentence_pairs_to_scores`` to enable its progress
        counter.

    Raises
    ------
    IndexError
        If a line contains no tab character.
    """
    sentences = []
    # The context manager closes the file even if reading or parsing raises.
    with open(inp_fname) as inp_file:
        for line in inp_file:
            # Strip the line terminator first so the second sentence does not
            # carry a trailing newline.
            fields = line.rstrip("\r\n").split("\t")
            sentences.append((fields[0], fields[1]))

    calculated_similarity = sentence_pairs_to_scores(sentences, verbose)
    np.savetxt(out_fname, calculated_similarity)

In [10]:
# Write a predictions file for each STS 2016 test input, with progress output
# enabled.  The (input, output) pairs are processed in the order listed.
for test_input, prediction_file in [
    ("test/STS2016.input.answer-answer.txt", "answer-answer.predictions"),
    ("test/STS2016.input.headlines.txt", "headlines.predictions"),
    ("test/STS2016.input.plagiarism.txt", "plagiarism.predictions"),
    ("test/STS2016.input.postediting.txt", "postediting.predictions"),
    ("test/STS2016.input.question-question.txt", "question-question.predictions"),
]:
    compute_scores(test_input, prediction_file, True)

209 / 209


In [None]:
# Evaluate the alignment-based score on the STS 2012 training sets: for each
# set, compute the Pearson correlation between the reference scores and the
# calculated similarities, and keep the per-pair results for inspection.
train_data = ['MSRpar.txt', 'MSRvid.txt', 'SMTeuroparl.txt']

score_prefix = "train/STS2012-en-train/STS.gs."
input_prefix = "train/STS2012-en-train/STS.input."

# NOTE(review): not read by any cell shown in this section; kept in case a
# later cell relies on it.
cnt = 0

# Maps data-set file name -> list of ((s1, s2), calculated score, reference score).
calculated_scores = {}

for tx in train_data:
    # Read both files up front inside context managers so the handles are
    # released even if scoring or the correlation raises further down.
    with open(score_prefix + tx) as scr_file:
        scr_content = scr_file.readlines()
    with open(input_prefix + tx) as inp_file:
        inp_content = inp_file.readlines()

    # Every reference score needs a sentence pair; fail with a clear message
    # instead of silently pairing up a truncated input.
    if len(inp_content) < len(scr_content):
        raise ValueError("%s: %d reference scores but only %d input lines"
                         % (tx, len(scr_content), len(inp_content)))

    sentences = []
    scores = []

    for scr_line, inp_line in zip(scr_content, inp_content):
        snts = [x.replace("\n", "") for x in inp_line.split("\t")]
        sentences.append((snts[0], snts[1]))

        scores.append(float(scr_line))

    scores = np.array(scores)

    calculated_similarity = sentence_pairs_to_scores(sentences, True)
    calculated_similarity = calculated_similarity.reshape(scores.shape)

    # A named variable is used for the second result because assigning to `_`
    # at notebook top level overrides IPython's last-output shortcut.
    corr, p_value = pearsonr(scores, calculated_similarity)
    # Single-argument parenthesised form: prints "<name> <corr>" on both
    # Python 2 and Python 3.
    print("%s %s" % (tx, corr))

    calculated_scores[tx] = list(zip(sentences, calculated_similarity, scores))

In [None]:
import math
idx = 0

# List the MSRpar pairs whose calculated score, multiplied by 5.0, lies within
# 0.1 of the stored reference score.  For each hit, print its index, the
# sentence pair, and both scores in both scalings (x and x*5 / x/5).
sc = calculated_scores['MSRpar.txt']
for i, (snts, calc, actual) in enumerate(sc):
    gap = abs(actual - (calc * 5.0))

    if not (gap > 0.1):
        print(i)
        print(snts)
        print("%s %s" % (actual / 5.0, actual))
        print("%s %s" % (calc, calc * 5.0))
        print("")