In [1]:
import xml.etree.ElementTree as ET

tree1 = ET.parse("/home/siddhant/Projects/have-a-look-at-my-paper/data/output/xml/P19-1106.tei.xml")
root1 = tree1.getroot()

tree2 = ET.parse("/home/siddhant/Projects/have-a-look-at-my-paper/data/output/xml/210603714.tei.xml")
root2 = tree2.getroot()

tree3 = ET.parse("/home/siddhant/Projects/have-a-look-at-my-paper/data/output/xml/210503404.tei.xml")
root3 = tree3.getroot()

In [2]:
from bs4 import BeautifulSoup, NavigableString
parsed_file = BeautifulSoup(open("/home/siddhant/Projects/have-a-look-at-my-paper/data/output/xml/P19-1106.tei.xml"), "lxml")

In [3]:
def parse_authors(article):
    """
    Parse authors from a given BeautifulSoup of an article
    """
    author_names = article.find("sourcedesc").findAll("persname")
    authors = []
    for author in author_names:
        firstname = author.find("forename", {"type": "first"})
        firstname = firstname.text.strip() if firstname is not None else ""
        middlename = author.find("forename", {"type": "middle"})
        middlename = middlename.text.strip() if middlename is not None else ""
        lastname = author.find("surname")
        lastname = lastname.text.strip() if lastname is not None else ""
        if middlename is not "":
            authors.append(firstname + " " + middlename + " " + lastname)
        else:
            authors.append(firstname + " " + lastname)
    authors = "; ".join(authors)
    return authors

In [4]:
auth = parse_authors(parsed_file)
print(auth)

Tirthankar Ghosal; Rajeev Verma; Asif Ekbal; Pushpak Bhattacharyya


In [5]:
def parse_date(article):
    """
    Parse date from a given BeautifulSoup of an article
    """
    pub_date = article.find("publicationstmt")
    year = pub_date.find("date")
    year = year.attrs.get("when") if year is not None else ""
    return year

In [6]:
y = parse_date(parsed_file)
print(y)

2019


In [7]:
def parse_abstract(article):
    """
    Parse abstract from a given BeautifulSoup of an article
    """
    div = article.find("abstract")
    abstract = ""
    for p in list(div.children):
        if not isinstance(p, NavigableString) and len(list(p)) > 0:
            abstract += " ".join(
                [elem.text for elem in p if not isinstance(elem, NavigableString)]
            )
    return abstract


In [8]:
ab = parse_abstract(parsed_file)
print(ab)

Automatically validating a research artefact is one of the frontiers in Artificial Intelligence (AI) that directly brings it close to competing with human intellect and intuition. Although criticized sometimes, the existing peer review system still stands as the benchmark of research validation. The present-day peer review process is not straightforward and demands profound domain knowledge, expertise, and intelligence of human reviewer(s), which is somewhat elusive with the current state of AI. However, the peer review texts, which contains rich sentiment information of the reviewer, reflecting his/her overall attitude towards the research in the paper, could be a valuable entity to predict the acceptance or rejection of the manuscript under consideration. Here in this work, we investigate the role of reviewers sentiments embedded within peer review texts to predict the peer review outcome. Our proposed deep neural architecture takes into account three channels of information: the pap

In [9]:


def parse_sections(article, as_list: bool = False):
    """
    Parse list of sections from a given BeautifulSoup of an article
    Parameters
    ==========
    as_list: bool, if True, output text as a list of paragraph instead
        of joining it together as one single text
    """
    article_text = article.find("text")
    divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"})
    sections = []
    for div in divs:
        div_list = list(div.children)
        if len(div_list) == 0:
            heading = ""
            text = ""
        elif len(div_list) == 1:
            if isinstance(div_list[0], NavigableString):
                heading = str(div_list[0])
                text = ""
            else:
                heading = ""
                text = div_list[0].text
        else:
            text = []
            heading = div_list[0]
            if isinstance(heading, NavigableString):
                heading = str(heading)
                p_all = list(div.children)[1:]
            else:
                heading = ""
                p_all = list(div.children)
            for p in p_all:
                if p is not None:
                    try:
                        text.append(p.text)
                    except:
                        pass
            if not False:
                text = "\n".join(text)
        if heading is not "" or text is not "":
            ref_dict = calculate_number_of_references(div)
            sections.append(
                {
                    "heading": heading,
                    "text": text,
                    "n_publication_ref": ref_dict["n_publication_ref"],
                    "n_figure_ref": ref_dict["n_figure_ref"],
                }
            )
    return sections

In [10]:
def calculate_number_of_references(div):
    """
    For a given section, calculate number of references made in the section
    """
    n_publication_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "bibr"]
    )
    n_figure_ref = len(
        [ref for ref in div.find_all("ref") if ref.attrs.get("type") == "figure"]
    )
    return {"n_publication_ref": n_publication_ref, "n_figure_ref": n_figure_ref}


In [11]:
se = parse_sections(parsed_file)
print(se[0]['text'])

The rapid increase in research article submissions across different venues is posing a significant management challenge for the journal editors and conference program chairs 1 . Among the load of works like assigning reviewers, ensuring timely receipt of reviews, slot-filling against the non-responding reviewer, taking informed decisions, communicating to the authors, etc., editors/program chairs are usually overwhelmed with many such demanding yet crucial tasks. However, the major hurdle lies in to decide the acceptance and rejection of the manuscripts based on the reviews received from the reviewers.
The quality, randomness, bias, inconsistencies in peer reviews is well-debated across the academic community (Bornmann and Daniel, 2010). Due to the rise in article submissions and nonavailability of expert reviewers, editors/program chairs are sometimes left with no other options than to assign papers to the novice, out of domain reviewers which sometimes results in more inconsistencies

In [12]:
def ref_sentences(article):
    article_text = article.find("text")
    divs = article_text.find_all("div", attrs={"xmlns": "http://www.tei-c.org/ns/1.0"})
    sections = []
    for div in divs:
        for p in div.find_all("p"):
            sentences = p.text.replace("al.", " ").split(".")
            for ref in p.find_all("ref"):
                if ref.attrs.get("type") == "bibr":
                    ref_text = ref.text
                    ref_id = ref.attrs.get("target")
                    ref_sentences = []
                    
                    for sentence in sentences:
                        if ref_text[:7] in sentence and len(ref_sentences)<1:
                            ref_sentences.append(sentence)
                            sentences.remove(sentence)
                    sections.append(
                        {
                            "ref_id": ref_id,
                            "ref_text": ref_text,
                            "ref_sentences": ref_sentences,
                        }
                    )



                
    return sections

In [13]:
sec = ref_sentences(parsed_file)
print(len(sec))
for s in sec:
    print("-----------------------------------------------------------")
    print(s)
    print("-----------------------------------------------------------")

23
-----------------------------------------------------------
{'ref_id': '#b0', 'ref_text': '(Bornmann and Daniel, 2010)', 'ref_sentences': ['The quality, randomness, bias, inconsistencies in peer reviews is well-debated across the academic community (Bornmann and Daniel, 2010)']}
-----------------------------------------------------------
-----------------------------------------------------------
{'ref_id': '#b8', 'ref_text': '(Langford and Guzdial, 2015)', 'ref_sentences': [' To study the arbitrariness inherent in the existing peer review system, organisers of the NIPS 2014 conference assigned 10% submissions to two different sets of reviewers and observed that the two committees disagreed for more than quarter of the papers (Langford and Guzdial, 2015)']}
-----------------------------------------------------------
-----------------------------------------------------------
{'ref_id': '#b11', 'ref_text': '(Smith, 2006)', 'ref_sentences': [' Many are of the opinion that the existing

In [14]:
def parse_references(article):
    """
    Parse list of references from a given BeautifulSoup of an article
    """
    reference_list = []
    references = article.find("text").find("div", attrs={"type": "references"})
    references = references.find_all("biblstruct") if references is not None else []
    reference_list = []
    for reference in references:
        title = reference.find("title", attrs={"level": "a"})
        if title is None:
            title = reference.find("title", attrs={"level": "m"})
        title = title.text if title is not None else ""
        journal = reference.find("title", attrs={"level": "j"})
        journal = journal.text if journal is not None else ""
        if journal is "":
            journal = reference.find("publisher")
            journal = journal.text if journal is not None else ""
        year = reference.find("date")
        year = year.attrs.get("when") if year is not None else ""
        authors = []
        for author in reference.find_all("author"):
            firstname = author.find("forename", {"type": "first"})
            firstname = firstname.text.strip() if firstname is not None else ""
            middlename = author.find("forename", {"type": "middle"})
            middlename = middlename.text.strip() if middlename is not None else ""
            lastname = author.find("surname")
            lastname = lastname.text.strip() if lastname is not None else ""
            if middlename is not "":
                authors.append(firstname + " " + middlename + " " + lastname)
            else:
                authors.append(firstname + " " + lastname)
        authors = "; ".join(authors)
        reference_list.append(
            {"title": title, "journal": journal, "year": year, "authors": authors}
        )
    return reference_list

In [15]:
ref = parse_references(parsed_file)
print(ref[0])

{'title': "Reliability of reviewers' ratings when using public peer review: a case study", 'journal': 'Learned Publishing', 'year': '2010-04', 'authors': 'Lutz Bornmann; H-D Daniel'}


In [None]:
for child in root1:
    print(child.tag, child.attrib)

In [None]:
for child in root2:
    print(child.tag, child.attrib)

In [4]:
for child in root3:
    print(child.tag, child.attrib)

{http://www.tei-c.org/ns/1.0}teiHeader {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}
{http://www.tei-c.org/ns/1.0}text {'{http://www.w3.org/XML/1998/namespace}lang': 'en'}


In [5]:
for child in root1.findall("{http://www.tei-c.org/ns/1.0}teiHeader"):
    fileDesc = child.find("{http://www.tei-c.org/ns/1.0}fileDesc")
    title = fileDesc.find("{http://www.tei-c.org/ns/1.0}titleStmt").find("{http://www.tei-c.org/ns/1.0}title").text
    publisher = fileDesc.find("{http://www.tei-c.org/ns/1.0}publicationStmt").find("{http://www.tei-c.org/ns/1.0}publisher").text
    availability = fileDesc.find("{http://www.tei-c.org/ns/1.0}publicationStmt").find("{http://www.tei-c.org/ns/1.0}availability").find("{http://www.tei-c.org/ns/1.0}p").text
    date = fileDesc.find("{http://www.tei-c.org/ns/1.0}publicationStmt").find("{http://www.tei-c.org/ns/1.0}date").text
    names = []
    for author in fileDesc.findall("{http://www.tei-c.org/ns/1.0}sourceDesc"):
        a = author.find("{http://www.tei-c.org/ns/1.0}analytic")
        print(a)

    print(names)
    #print(title)
    #print(publisher)
    #print(availability)
    #print(date)


None
[]


In [79]:
headings, content = [], []
for child in root1.findall("{http://www.tei-c.org/ns/1.0}text"):
    body = child.find("{http://www.tei-c.org/ns/1.0}body")
    for div in body.findall("{http://www.tei-c.org/ns/1.0}div"):
        head = div.find("{http://www.tei-c.org/ns/1.0}head").text
        headings.append(head)
        para = div.findall("{http://www.tei-c.org/ns/1.0}p")
        paraTemp = []
        for p in para:
            paraTemp.append(p)
        content.append(paraTemp)
                
        for p in div.findall("{http://www.tei-c.org/ns/1.0}p/{http://www.tei-c.org/ns/1.0}ref"):
            if p.attrib['type'] == 'bibr':
                print(p.tag, p.attrib)


{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b0'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b8'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b11'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b7'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b10'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b9'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b2'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b12'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b7'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b7'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b7'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b7'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b3'}
{http://www.tei-c.org/ns/1.0}ref {'type': 'bibr', 'target': '#b6'}
{http://w

In [18]:
import pandas as pd


In [19]:
sections = parse_sections(parsed_file)
heads = []
content = []
for section in sections:
    heads.append(section['heading'])
    content.append(section['text'])

paper_sections = pd.DataFrame()
paper_sections['heading'] = heads
paper_sections['content'] = content


In [20]:
paper_sections.head()

Unnamed: 0,heading,content
0,Introduction,The rapid increase in research article submiss...
1,Related Work,Artificial Intelligence in academic peer revie...
2,Data Description and Analysis,"The PeerRead dataset consists of papers, a set..."
3,Methodology,
4,Pre-processing,"At the very beginning, we convert the papers i..."


In [21]:
citations = parse_references(parsed_file)
titles = []
journals = []
authors = []
for cit in citations:
    titles.append(cit['title'])
    journals.append(cit['journal'])
    authors.append(cit['authors'])

citations_data = pd.DataFrame()
citations_data['title'] = titles
citations_data['journal'] = journals
citations_data['authors'] = authors


In [22]:
citations_data.head()

Unnamed: 0,title,journal,authors
0,Reliability of reviewers' ratings when using p...,Learned Publishing,Lutz Bornmann; H-D Daniel
1,Universal Sentence Encoder for English,Association for Computational Linguistics,Daniel Cer; Yinfei Yang; Sheng-Yi Kong; Nan Hu...
2,The toronto paper matching system: an automate...,,Laurent Charlin; Richard Zemel
3,Investigating domain features for scope detect...,,Tirthankar Ghosal; Ravi Sonam; Sriparna Saha; ...
4,Investigating Impact Features in Editorial Pre...,ACM Press,Tirthankar Ghosal; Rajeev Verma; Asif Ekbal; S...


In [23]:

ref_sentence  = []
ref_citation = []
ref_venue = []

sentences = ref_sentences(parsed_file)
for sentence in sentences:
    if sentence['ref_id'] is not None:
        sentence['ref_id'] = int(sentence['ref_id'].replace('#b', ''))


for i in range(len(sentences)):
    ref_sentence.append(sentences[i]['ref_sentences'][0])
    if sentences[i]['ref_id'] is not None:
        index = sentences[i]['ref_id']
        ref_citation.append(citations[index]['title'])
    else:
        ref_citation.append(None)
    ref_venue.append(citations[index]['journal'])
        
ref_sentence_data = pd.DataFrame()
ref_sentence_data['ref sentence'] = ref_sentence
ref_sentence_data['ref citation'] = ref_citation
ref_sentence_data['ref venue'] = ref_venue


In [24]:
sentences

[{'ref_id': 0,
  'ref_text': '(Bornmann and Daniel, 2010)',
  'ref_sentences': ['The quality, randomness, bias, inconsistencies in peer reviews is well-debated across the academic community (Bornmann and Daniel, 2010)']},
 {'ref_id': 8,
  'ref_text': '(Langford and Guzdial, 2015)',
  'ref_sentences': [' To study the arbitrariness inherent in the existing peer review system, organisers of the NIPS 2014 conference assigned 10% submissions to two different sets of reviewers and observed that the two committees disagreed for more than quarter of the papers (Langford and Guzdial, 2015)']},
 {'ref_id': 11,
  'ref_text': '(Smith, 2006)',
  'ref_sentences': [' Many are of the opinion that the existing peer review system is fragile as it only depends on the view of a selected few (Smith, 2006)']},
 {'ref_id': 7,
  'ref_text': '(Kang et al., 2018)',
  'ref_sentences': ['The PeerRead dataset (Kang et  , 2018) is an excellent resource towards research and study on this very impactful and crucial p

In [25]:
ref_sentence_data

Unnamed: 0,ref sentence,ref citation,ref venue
0,"The quality, randomness, bias, inconsistencies...",Reliability of reviewers' ratings when using p...,Learned Publishing
1,To study the arbitrariness inherent in the ex...,"The arbitrariness of reviews, and advice for s...",Communications of the ACM
2,Many are of the opinion that the existing pee...,Peer review: a flawed process at the heart of ...,Journal of the Royal Society of Medicine
3,"The PeerRead dataset (Kang et , 2018) is an e...",A Dataset of Peer Reviews (PeerRead): Collecti...,Association for Computational Linguistics
4,Price and Flach (2017) did a thorough study o...,Computational support for academic peer review...,Commun. ACM
5,Mrowinski et (2017) explored an evolutionar...,Artificial intelligence in peer review: How ca...,PloS one
6,The famous Toronto Paper Matching system (Cha...,The toronto paper matching system: an automate...,
7,"Recently we (Ghosal et , 2018b,a) investigat...",,
8,Wang and Wan (2018) explored a multi-instance...,Sentiment Analysis of Peer Review Texts for Sc...,ACM Press
9,We carry our current investigations on a port...,A Dataset of Peer Reviews (PeerRead): Collecti...,Association for Computational Linguistics


In [26]:
sample_sen = ref_sentence[3]
sample_cit = ref_citation[3]

print(sample_sen)
print(sample_cit)

The PeerRead dataset (Kang et  , 2018) is an excellent resource towards research and study on this very impactful and crucial problem
A Dataset of Peer Reviews (PeerRead): Collection, Insights and NLP Applications


In [27]:
def Jaccard_Similarity(doc1, doc2):
    
    # List the unique words in a document
    words_doc1 = set(doc1.lower().split()) 
    words_doc2 = set(doc2.lower().split())
    
    # Find the intersection of words list of doc1 & doc2
    intersection = words_doc1.intersection(words_doc2)

    # Find the union of words list of doc1 & doc2
    union = words_doc1.union(words_doc2)
        
    # Calculate Jaccard similarity score 
    # using length of intersection set divided by length of union set
    return float(len(intersection)) / len(union)

In [28]:
Jaccard_Similarity(sample_sen, sample_cit)

0.06666666666666667

In [44]:
#from sentence_transformers import SentenceTransformer, util
#import numpy as np

In [46]:
#PATH = '/home/siddhant/dependancies/stsb-roberta-large/'
#model = SentenceTransformer(PATH)

In [50]:
import spacy
nlp = spacy.load('en_core_web_md')


In [30]:
sample_cit

'A Dataset of Peer Reviews (PeerRead): Collection, Insights and NLP Applications'

In [31]:
sample_sen

'The PeerRead dataset (Kang et  , 2018) is an excellent resource towards research and study on this very impactful and crucial problem'

In [41]:
sen1 = nlp(sample_sen)
sen2 = nlp(sample_cit)

print(sen1.similarity(sen2))


0.8528771485274309


In [42]:
def semanticScore(sentence1, sentence2):
    # Tokenize the sentences
    doc1 = nlp(sentence1)
    doc2 = nlp(sentence2)
    
    # Find the similarity score using the cosine similarity function
    return doc1.similarity(doc2)

In [48]:
ref_sentence = ref_sentence
ref_citation = ref_citation
ref_semantic = []
ref_lexical = []

for i in range(len(ref_sentence)):
    if ref_citation[i] is not None:
        ref_semantic.append(semanticScore(ref_sentence[i], ref_citation[i]))
        ref_lexical.append(Jaccard_Similarity(ref_sentence[i], ref_citation[i]))
    else:
        ref_semantic.append(None)
        ref_lexical.append(None)

similarity = pd.DataFrame()
similarity['ref sentence'] = ref_sentence
similarity['ref citation'] = ref_citation
similarity['semantic score'] = ref_semantic
similarity['lexical score'] = ref_lexical




In [49]:
similarity

Unnamed: 0,ref sentence,ref citation,semantic score,lexical score
0,"The quality, randomness, bias, inconsistencies...",Reliability of reviewers' ratings when using p...,0.810094,0.035714
1,To study the arbitrariness inherent in the ex...,"The arbitrariness of reviews, and advice for s...",0.882402,0.128205
2,Many are of the opinion that the existing pee...,Peer review: a flawed process at the heart of ...,0.893017,0.129032
3,"The PeerRead dataset (Kang et , 2018) is an e...",A Dataset of Peer Reviews (PeerRead): Collecti...,0.852877,0.066667
4,Price and Flach (2017) did a thorough study o...,Computational support for academic peer review...,0.87832,0.16
5,Mrowinski et (2017) explored an evolutionar...,Artificial intelligence in peer review: How ca...,0.865005,0.130435
6,The famous Toronto Paper Matching system (Cha...,The toronto paper matching system: an automate...,0.810994,0.238095
7,"Recently we (Ghosal et , 2018b,a) investigat...",,,
8,Wang and Wan (2018) explored a multi-instance...,Sentiment Analysis of Peer Review Texts for Sc...,0.835049,0.3
9,We carry our current investigations on a port...,A Dataset of Peer Reviews (PeerRead): Collecti...,0.80378,0.115385
