In [1]:
import sys
import logging
import warnings
import numpy as np
import pandas as pd

from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# --- Notebook-wide output configuration -------------------------------------
# Silence every log record at WARNING level and below, for all loggers.
logging.disable(level=logging.WARNING)
# Blanket-ignore Python warnings. NOTE(review): this also hides deprecation
# and numerical warnings that may matter when debugging.
warnings.filterwarnings(action='ignore')
# Never summarise arrays with "..." when printing; always show every element.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# CSV locations of the three dataset splits (relative paths).
trainpath, testpath, valpath = (
    'data/train.csv',
    'data/test.csv',
    'data/validation.csv',
)

# Read each split into its own DataFrame, in the order train, test, validation.
traindata, testdata, valdata = (
    pd.read_csv(csv_path) for csv_path in (trainpath, testpath, valpath)
)

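Load the pretrained Word2Vec model (`GoogleNews-vectors-negative300.bin`). <br />
The model file can be downloaded from [this Google Drive link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g) and is expected at `data/GoogleNews-vectors-negative300.bin`.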

In [3]:
# Location of the pretrained vectors on disk.
modelpath = "data/GoogleNews-vectors-negative300.bin"

# binary=True: parse the file as word2vec's binary format instead of text.
model = KeyedVectors.load_word2vec_format(fname=modelpath, binary=True)

In [4]:
def get_sentences_embedding(sentences, keyed_vectors=None, unk_token="unk"):
    """Embed every tokenised sentence as the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of iterables of str
        One token sequence per sentence.
    keyed_vectors : optional
        Any object supporting ``token in keyed_vectors.key_to_index`` and
        ``keyed_vectors[token]``. When omitted, the notebook-level ``model``
        is used, which is exactly what the function did before this
        parameter existed.
    unk_token : str, default "unk"
        Token substituted for every out-of-vocabulary word, and used as the
        sole token of an empty sentence.

    Returns
    -------
    numpy.ndarray
        ``np.array`` built from the per-sentence mean vectors, in input order.

    Notes
    -----
    ``unk_token`` is looked up like any other word, so it must itself be
    resolvable by ``keyed_vectors[...]``; presumably a ``KeyError`` is raised
    otherwise (gensim behaviour -- confirm for other vector stores).
    """
    # Resolve the vector store once; the global is only touched as a fallback.
    vectors = model if keyed_vectors is None else keyed_vectors
    vocab = vectors.key_to_index

    sentence_embedding = []
    for sentence in sentences:
        # Map each out-of-vocabulary word to the unknown token.
        words = [word if word in vocab else unk_token for word in sentence]
        if not words:
            # An empty sentence still needs one vector to average over.
            words = [unk_token]
        word_vectors = [vectors[word] for word in words]
        sentence_embedding.append(np.mean(word_vectors, axis=0))
    return np.array(sentence_embedding)

In [5]:
def pearson_corr(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Thin wrapper around ``scipy.stats.pearsonr`` that keeps only the
    coefficient (first element of the result) and drops the p-value.
    """
    result = pearsonr(y_true, y_pred)
    return result[0]

In [6]:
# NOTE(review): each sentence cell is parsed with eval(), which executes
# whatever Python expression the CSV contains. Only run this on trusted data;
# if the cells are plain list literals, ast.literal_eval would be a safer parser.
def _embed_column(frame, column):
    """Parse one sentence column of `frame` with eval and embed every row."""
    return get_sentences_embedding(frame[column].apply(eval))


# Training split: embeddings for both sentences of each pair, plus scores.
x_train1 = _embed_column(traindata, 'sentence1')
x_train2 = _embed_column(traindata, 'sentence2')
y_train = list(traindata['score'])

# Validation split.
x_val1 = _embed_column(valdata, 'sentence1')
x_val2 = _embed_column(valdata, 'sentence2')
y_val = list(valdata['score'])

# Test split.
test_x1 = _embed_column(testdata, 'sentence1')
test_x2 = _embed_column(testdata, 'sentence2')
y_test = list(testdata['score'])

In [7]:
# Cosine similarity between the two sentence embeddings of every training pair.
# zip() walks both embedding arrays row by row, so pair i of x_train1 is
# compared with pair i of x_train2.
cosine_similarities = [
    cosine_similarity([embedding1], [embedding2])[0][0]
    for embedding1, embedding2 in zip(x_train1, x_train2)
]
# Human-annotated scores, aligned with the pairs above.
human_similarity_scores = list(y_train)

# Keep the coefficient under its own name: assigning it to `pearson_corr`
# would overwrite the helper function of that name defined earlier in the
# notebook. Index 0 of pearsonr's result is the coefficient; the p-value is
# not needed here.
train_pearson = pearsonr(cosine_similarities, human_similarity_scores)[0]
print("Pearson coefficient:", train_pearson)

Pearson coefficient: 0.684437678796524


In [8]:
# Cosine similarity between the two sentence embeddings of every validation
# pair. zip() walks both embedding arrays row by row, so pair i of x_val1 is
# compared with pair i of x_val2.
cosine_similarities = [
    cosine_similarity([embedding1], [embedding2])[0][0]
    for embedding1, embedding2 in zip(x_val1, x_val2)
]
# Human-annotated scores, aligned with the pairs above.
human_similarity_scores = list(y_val)

# Keep the coefficient under its own name: assigning it to `pearson_corr`
# would overwrite the helper function of that name defined earlier in the
# notebook. Index 0 of pearsonr's result is the coefficient; the p-value is
# not needed here.
val_pearson = pearsonr(cosine_similarities, human_similarity_scores)[0]
print("Pearson coefficient:", val_pearson)

Pearson coefficient: 0.7365135183185004


In [9]:
# Cosine similarity between the two sentence embeddings of every test pair.
# zip() walks both embedding arrays row by row, so pair i of test_x1 is
# compared with pair i of test_x2.
cosine_similarities = [
    cosine_similarity([embedding1], [embedding2])[0][0]
    for embedding1, embedding2 in zip(test_x1, test_x2)
]
# Human-annotated scores, aligned with the pairs above.
human_similarity_scores = list(y_test)

# Keep the coefficient under its own name: assigning it to `pearson_corr`
# would overwrite the helper function of that name defined earlier in the
# notebook. Index 0 of pearsonr's result is the coefficient; the p-value is
# not needed here.
test_pearson = pearsonr(cosine_similarities, human_similarity_scores)[0]
print("Pearson coefficient:", test_pearson)

Pearson coefficient: 0.6292495507627769
