In [1]:
import pandas as pd
import numpy as np
from math import sqrt, log
from collections import defaultdict
from itertools import chain, product

In [2]:
def vectorize(sentence, vocabulary):
    result = [sentence.split().count(i) for i in vocabulary]
    return result

def convert_words_to_vectors(sentence):
    vectorized_sentence = []
    vocabulary = sorted(set(chain(*[words.lower().split() for words in sentence])))
    for words in sentence:
        vectorized_sentence.append((words, vectorize(words, vocabulary)))
    return vectorized_sentence, vocabulary

def dot_product_of_vectors(vector_one, vector_two):
    result = np.dot(vector_one, vector_two) / (sqrt(np.dot(vector_one, vector_one)) * sqrt(np.dot(vector_two, vector_two)))
    return result

def cosine_sim(sentence_one, sentence_two):
    sentences = [sentence_one, sentence_two]
    corpus, vocabulary = convert_words_to_vectors(sentences)
    similarity = [dot_product_of_vectors(a[1], b[1]) for a, b in product(corpus, corpus)]
    return similarity[1]

In [4]:
raw_data = pd.read_csv('peer_reviews.csv')
raw_data = raw_data[['abstract', 'review']].copy()
raw_data.head()

Unnamed: 0,abstract,review
0,Neural networks are vulnerable to adversarial ...,This paper proposes a principled methodology t...
1,Neural networks are vulnerable to adversarial ...,This paper applies recently developed ideas in...
2,Neural networks are vulnerable to adversarial ...,"In this very good paper, the objective is to p..."
3,"In this paper, we present a layer-wise learnin...",This paper proposes a learning method (PIB) ba...
4,"In this paper, we present a layer-wise learnin...",This paper presents a new way of training stoc...


In [13]:
raw_data["similarity_score"] = np.nan
raw_data['similarity_score'] = raw_data.apply(lambda x: cosine_sim(x['abstract'], x['review']), axis=1)

In [14]:
raw_data.head()

Unnamed: 0,abstract,review,similarity_score
0,Neural networks are vulnerable to adversarial ...,This paper proposes a principled methodology t...,0.476387
1,Neural networks are vulnerable to adversarial ...,This paper applies recently developed ideas in...,0.403208
2,Neural networks are vulnerable to adversarial ...,"In this very good paper, the objective is to p...",0.441847
3,"In this paper, we present a layer-wise learnin...",This paper proposes a learning method (PIB) ba...,0.737839
4,"In this paper, we present a layer-wise learnin...",This paper presents a new way of training stoc...,0.752842


In [15]:
raw_data.to_csv('reviews_scores.csv', index=False)