In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('peer_reviews.csv', usecols = ['abstract', 'review'])
data.tail()

Unnamed: 0,abstract,review
2743,Traditional recurrent neural network (RNN) or ...,This paper proposes a tree-to-tree model aimin...
2744,Traditional recurrent neural network (RNN) or ...,This paper presents a model to encode and deco...
2745,We study the problem of knowledge base (KB) em...,The paper proposes a unified view of multiple ...
2746,We study the problem of knowledge base (KB) em...,This paper deals with the problem of represent...
2747,We study the problem of knowledge base (KB) em...,The paper proposes a new method to train knowl...


In [3]:
# !pip install transformers
# !wget -O scibert_uncased.tar https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar
# !tar -xvf scibert_uncased.tar
import torch
from transformers import BertTokenizer, BertModel

In [4]:
# conda install -c conda-forge transformers
# conda install -c pytorch pytorch

In [5]:
model_version = 'scibert_scivocab_uncased'
do_lower_case = True
model = BertModel.from_pretrained(model_version)
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def embed_text(text, model):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states 

def get_similarity(em, em2):
    return cosine_similarity(em.detach().numpy(), em2.detach().numpy())

In [7]:
def get_sim(ab, re):
    try:
        abstract = embed_text(ab, model).mean(1)
        review = embed_text(re, model).mean(1)
        return get_similarity(abstract, review)
    except:
        return 999999999

data['semantic_similarity'] = data.apply(lambda row: get_sim(row['abstract'], row['review']), axis=1)
data.tail()

Unnamed: 0,abstract,review,semantic_similarity
2743,Traditional recurrent neural network (RNN) or ...,This paper proposes a tree-to-tree model aimin...,[[0.8362424]]
2744,Traditional recurrent neural network (RNN) or ...,This paper presents a model to encode and deco...,[[0.88113457]]
2745,We study the problem of knowledge base (KB) em...,The paper proposes a unified view of multiple ...,[[0.8258419]]
2746,We study the problem of knowledge base (KB) em...,This paper deals with the problem of represent...,[[0.8553514]]
2747,We study the problem of knowledge base (KB) em...,The paper proposes a new method to train knowl...,999999999


In [8]:
len(data[data['semantic_similarity'] == 999999999])

940

In [9]:
from spacy.lang.en import English
import re
import spacy

def data_preprocessing(text):
    
    nlp = English()
    
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    text = tokenizer(text)
    
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
    prep = []
    
    for token in text:
        token = token.lemma_
        if token.lower() not in spacy_stopwords:
            token = re.sub('<[^>]*>', '', token)
            token = re.sub('[\W]+', '', token.lower())
            prep.append(token)

                
    return ' '.join(prep)
    
        
data['abstract_clean'] = data.apply(lambda row: data_preprocessing(row['abstract']), axis=1)
data['review_clean'] = data.apply(lambda row: data_preprocessing(row['review']), axis=1)
data.tail()

Unnamed: 0,abstract,review,semantic_similarity,abstract_clean,review_clean
2743,Traditional recurrent neural network (RNN) or ...,This paper proposes a tree-to-tree model aimin...,[[0.8362424]],traditional recurrent neural network rnn con...,paper propose tree tree model aim encode inp...
2744,Traditional recurrent neural network (RNN) or ...,This paper presents a model to encode and deco...,[[0.88113457]],traditional recurrent neural network rnn con...,paper present model encode decode tree distrib...
2745,We study the problem of knowledge base (KB) em...,The paper proposes a unified view of multiple ...,[[0.8258419]],study problem knowledge base kb embed usual...,paper propose unify view multiple method learn...
2746,We study the problem of knowledge base (KB) em...,This paper deals with the problem of represent...,[[0.8553514]],study problem knowledge base kb embed usual...,paper deal problem representation learn knowle...
2747,We study the problem of knowledge base (KB) em...,The paper proposes a new method to train knowl...,999999999,study problem knowledge base kb embed usual...,paper propose new method train knowledge base ...


In [10]:
# conda install -c conda-forge spacy

In [11]:
data['semantic_similarity_aftercleaning'] = data.apply(lambda row: get_sim(row['abstract_clean'], row['review_clean']), axis=1)
data.tail()

Unnamed: 0,abstract,review,semantic_similarity,abstract_clean,review_clean,semantic_similarity_aftercleaning
2743,Traditional recurrent neural network (RNN) or ...,This paper proposes a tree-to-tree model aimin...,[[0.8362424]],traditional recurrent neural network rnn con...,paper propose tree tree model aim encode inp...,[[0.8119329]]
2744,Traditional recurrent neural network (RNN) or ...,This paper presents a model to encode and deco...,[[0.88113457]],traditional recurrent neural network rnn con...,paper present model encode decode tree distrib...,[[0.8385515]]
2745,We study the problem of knowledge base (KB) em...,The paper proposes a unified view of multiple ...,[[0.8258419]],study problem knowledge base kb embed usual...,paper propose unify view multiple method learn...,[[0.839614]]
2746,We study the problem of knowledge base (KB) em...,This paper deals with the problem of represent...,[[0.8553514]],study problem knowledge base kb embed usual...,paper deal problem representation learn knowle...,[[0.87139916]]
2747,We study the problem of knowledge base (KB) em...,The paper proposes a new method to train knowl...,999999999,study problem knowledge base kb embed usual...,paper propose new method train knowledge base ...,[[0.7906568]]


In [12]:
len(data[data['semantic_similarity_aftercleaning'] == 999999999])

97

In [13]:
data.to_csv('scores.csv', index = False)

In [32]:
a = """
 authors propose transform code solution extend work balle 2016  define hyperprior entropy coder model
 spatial relation transform coefficient   paper good write  trouble follow  result proposal state
 art  extremely exhaustive comparison method   opinion work good quality present iclr   think excellent improve
 detail think improve      main issues  main concern motivation relate  1 refer hyperprior motivation  clear
 gdn propose eliminate statistical dependency pixel image  main motivation gdn coefficient independent
 confusion resolve broaden explanation figure 2  2 concern clear modify probability distribution entropy
 encoder improve gdn model  think interest issue  outside scope work  far know  theoretical
 solution find right balance complexity transformation entropy encoder   interest discuss main novelty work
 compare method image compression base deep learn      issues  introduction    model optimize end   end
 minimize total expect code length learn balance information expect improvement entropy model    think point
 interest  good number happen result present  train procedure  example  simple comparison numb bit signal
 information depend compression rate numb iteration model train   compression variational models   miss 
 sentence    arithmetic code   transmit     fig1  clear read leave hand scheme  possible include distribution
 specifically  strange tiledy  scheme different conditional dependency  thing symbol  
 appear figure use section 2    easy follow change symbol function parameter like theta theta     distortion
 expect difference    expect  word use      substitute additive uniform noise   phrase correct  author balle
 2016 substitute additive uniform noise    equation  1   1 term zero constant  talk equation  7  author
 1 term constant       sentence  previous work assume   sound strange    example fig  2 extremely important
 understand motivation hyperprior think need little explanation  example important need explain begin work
 real example  model train normalization  specify  gdn able eliminate spatial dependency  dependency eliminate
 normalization apply spatial coefficient  remove dependency layer different parameter gdn   introduction
 scale hyperprior   typo   center pane       propose follow extension model  figure 3    colon  maybe miss
 maybe dot instead colon  lack explanation model   results      probability mass function  need construct
 fly     computationally costly      batch normalization learn rate decay find beneficial effect  local
 normalization property gdn  contain global normalization special case     extremely interest  connection batch 
 normalization  decay learn rate   clarify  mean use gdn instead regular nonlinearity long need use batch 
 normalization  word  think batch normalization useful special case gsn  useful community assess benefit 
 local normalization versus global normalization      combination 8 different value  order cover range rate  
 distortion tradeoff    possible method include lambda input model parameter information    guess include 
 information compute total entropy  numb bit   different way compress image information    metric train evaluate 
 little bite mislead  evaluation plot use different perceptual metric helpful   since ms  ssim yield value
 0  wrong  1    compare method achieve value good 09  convert quantity decibel order improve legibility    
 difference ms  ssim conversion significant  transformation necessary  lose intuition   probably fault able  
 unconvert  db ms  ssim unit  instance  20 curve surpass value       
 result differ substantially depend distortion metric use loss function train    informative understand 
 parameter change depend metric employ  little intuition set parameter adapt     figs 5  8 9  curve aggregate 
 different image  mean rate value  note depend totally mislead    nice include result method  like bpg rippel 
 2017  compare visually   related work  balle et al  publish work include perceptual metric end   
 end train procedure  think main contribution work  include relate work    end   end optimization 
 nonlinear transform code perceptual quality    laparra  ep simoncelli  pcs  picture coding symposium   
 2016   discussion  paragraph discussion section look like 2 section  relate work    think interest author 
 discuss relevance putt effort model hyperprior distribution image  transformation   thing equivalent  
 reason include hyperprior model ga transformation  clear model distribution output  principle   transformation
 enforce  use train procedure  transform datum follow impose distribution  gdn powerful output independent  
 beneficial compression divide problem   references   balle 2016 theis 2017 publish conference year  different 
 year reference confuse    strange reference    j  v laparra  e p simoncelli  2016    density modeling 
 images use generalized  normalization transformation    intl  conf  learning representations  iclr2016   
 url   httpsarxivorgabs151106281  valero laparra  eero p simoncelli  2015    density modeling images gen  
 eralized normalization transformation    arxiv e  print  published conference paper  4th international 
 conference learning representations  san juan  2016  arxiv  1511   06281     2016    end   end optimized 
 image compression    arxiv e  print  5th int  conf  learn  ing representations  
"""

In [33]:
b = """
describe end   end trainable model image compression base variational autoencoders  model incorporate 
hyperprior effectively capture spatial dependency latent representation  hyperprior relate information  concept 
universal virtually modern image codecs  largely unexplore image compression use artificial neural network  
anns   unlike exist autoencoder compression method  model train complex prior jointly underlie autoencoder  
demonstrate model lead state    art image compression measure visual quality use popular ms  ssim index  yield 
rate  distortion performance surpass publish ann  base method evaluate use traditional metric base square error  
psnr   furthermore  provide qualitative comparison model train different distortion metric
"""

In [35]:
get_similarity(a, b)

AttributeError: 'str' object has no attribute 'detach'