# Categorising in-text Citations

In [1]:
!pip install transformers mongoengine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mongoengine
  Downloading mongoengine-0.26.0-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.4/110.4 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Ins

In [2]:
from lxml import etree
import numpy as np
import pandas as pd
import json
from nltk import tokenize
import nltk
import string
import math
from pprint import pprint
import re
from transformers import pipeline
import requests
import traceback
from mongoengine import connect
from mongoengine.errors import NotUniqueError
from mongoengine import Document, StringField, FloatField, DateField, ListField, IntField, EmbeddedDocument, EmbeddedDocumentField

In [3]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### MongoDB Connection

In [4]:
db = connect(
    db='Articles',
    username='Categorising_in_text_Citations',
    password='#Food123',
    host='mongodb+srv://cluster0.gkdshl2.mongodb.net/'
)

In [5]:
class Reference(EmbeddedDocument):
    id = StringField(required=True, primary_key=True)
    ref_doi = StringField()
    ref_author = StringField(required=True)
    ref_text = StringField(required=True)
    ref_article_title = StringField()
    citations = ListField(EmbeddedDocumentField('Citation'))
    is_referenced_count = IntField()
    syntactic_frequency = IntField(required=True)
    syntactic_score = FloatField()
    polarity_score = FloatField(required=True)
    Introduction = IntField(required=True)
    Method = IntField(required=True)
    Results = IntField(required=True)
    Discussion = IntField(required=True)
    score = FloatField(required=True)
    scoring_category = StringField(required=True)

class Citation(EmbeddedDocument):
    reference_id = StringField(required=True)
    citation_mark = StringField()
    citation_section = StringField()
    citation_text = StringField()
    sentiment = FloatField()
    multi_citance = IntField()

class Article(Document):
    doi = StringField(required=True, pk=True, unique=True)
    article_title = StringField(required=True)
    abstract = StringField()
    journal_title = StringField()
    publisher_name = StringField()
    publish_date = StringField()
    article_authors = ListField(StringField(required=True))
    references = ListField(EmbeddedDocumentField(Reference))

In [6]:
#nltk.download('punkt')

In [7]:
#response = requests.get('https://api.crossref.org/works/10.7717/peerj-cs.490').json()

In [8]:
#response['message']['is-referenced-by-count']

### Load Model for Sentiment Analysis

In [9]:
SENTIMENT_MODEL = pipeline(model="cardiffnlp/twitter-roberta-base-sentiment")

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

### Cross-ref Response 

In [10]:
def find_CrossRef_Response(doi):
    response = ''
    try:
      response = requests.get('https://api.crossref.org/works/'+ doi).json()
    except Exception as e:
      print("Crossref Error: ", e)
    return response

### Metadata

In [11]:
def find_DOI(article_meta):
    for article in article_meta:
        try:
            if(article.attrib['pub-id-type'] == 'doi'):
                return article.text
        except:
            continue

In [12]:
def find_Title(front):
    title = front.findall('.//article-title')[0]
    article_title = etree.tostring(title, method='text', encoding='unicode')
    return article_title

In [13]:
def find_Journal(front):
    journal_title= front.findall('.//journal-title')
    return journal_title[0].text

In [14]:
def find_Publisher(front):
    publisher = front.findall('.//publisher-name')[0]
    publisher_name = etree.tostring(publisher, method='text', encoding='unicode')
    return publisher_name

In [15]:
def find_Publish_Date(front):
    pub_date = front.findall('.//history')[0]

    for date in pub_date:
        if(date.attrib['date-type']== 'accepted'):
            pub_date = date
    
    published_date = ''
    for pub in pub_date:
        published_date += pub.text + '-'
        
    return published_date[:-1]

In [16]:
def find_Authors(front):
    article_author= front.findall('.//contrib')
    article_authors = []
    for i in article_author:
        if i.attrib['contrib-type'] == 'author' :
            article_authors.append(i)
            
    author_names = ''
    for name in article_authors:
        author = name.findall(".//name")[0]

        for author_name in author:
            author_names += author_name.text + ' '
        author_names += ', '
        
    author_names = author_names[:-2]
    author_names = author_names.split(", ")
    return author_names

In [17]:
def find_Abstract(front, xml_tree):
    abstract = xml_tree.xpath(".//abstract")[0]
    abstract = etree.tostring(abstract, method='text', encoding='unicode')
    return abstract

In [18]:
def find_Metadata(front, xml_tree):
    doi = find_DOI(front[1])
    title = find_Title(front)
    journal = find_Journal(front)
    publisher = find_Publisher(front)
    publish_date = find_Publish_Date(front)
    authors = find_Authors(front)
    abstract = find_Abstract(front, xml_tree)
    
    return doi, title, journal,publisher, publish_date, authors, abstract

### Article Section Lengths

In [19]:
def get_Article_Length(body):
    full_article_text = etree.tostring(body, method='text', encoding='unicode')
    article_text_length = len(full_article_text.split(" "))

    introduction_length = round(article_text_length/100) * 25
    method_length = round(article_text_length/100) * 50
    result_length = round(article_text_length/100) * 75
    discussion_length = round(article_text_length/100) * 100
    
    return article_text_length, introduction_length, method_length, result_length, discussion_length

In [20]:
def get_Article_Sections(body,introduction_length, method_length, result_length, discussion_length):
    sections = []
    c_sum = 0
    for i in body:
        title = i[0].text

        if (c_sum < introduction_length):
            section = "Introduction"
        elif (c_sum < method_length):
            section = "Method"
        elif (c_sum < result_length):
            section = "Results"
        else:
            section = "Discussion"

        sections.append({
            "title": title,
            "start_point": c_sum,
            "section": section
        })
        c_sum += len(etree.tostring(i, method='text', encoding='unicode').split(" "))
    
    return sections

### Citation Schema

#### Section Name

In [21]:
def verify_section(section_name, sections):
    for i in sections:
        if (i['title'] == section_name):
            return True
    else:
        return False

In [22]:
def check_section_name(section):
    section = section.lower()
    if (section == "introduction" or section == "intro"):
        return "Introduction"
    
    elif (section == "methods" or section == "methodology" or section== "method" or section == "materials & method" 
          or section == "materials and method" or section == "material and method" or section == "materials & methods" ):
        return "Method"
     
    elif (section == "results" or section == "result" or section == "conclusion"):
        return "Results"
    
    elif (section == "discussion" or section== "future work" or section == "future"):
        return "Discussion"
    
    else:
        return ''

In [23]:
def split_section_name(section):
    sec = section.split()
    
    for word in sec:
        section_name = check_section_name(word)
        if(section_name != ''):
            return section_name
    else:
        return ''

In [24]:
def get_section_name(section, sections):
    
    section_name = check_section_name(section)
    
    if(section_name != ''):
        return section_name
    
    elif (split_section_name(section) != ''):
        return split_section_name(section)

    else:
        for i in sections:
            if (i['title'].lower() == section.lower()):
                return i['section']

In [25]:
def get_section(citation, sections):
    section = citation.getparent().getparent()
    while True:
        if (section.tag != 'sec'):
            section = section.getparent()
            
        elif(verify_section(section[0].text, sections) == False):
            section = section.getparent()
            
        else:
            break
    
    section = get_section_name(section[0].text, sections)
    
    return section

#### Extraction of citation Text

In [26]:
def extract_Citation_Text(context, mark, citations_in_one_p):
    text = context.replace('al.', 'al')
    cit_text = nltk.sent_tokenize(text)

    citation_mark_ = mark.replace('al.', 'al')
    for sentence in cit_text:
        sentence_ = sentence.replace('(' , '')
        sentence_ = sentence_.replace(')', '')
        citation_mark_ = citation_mark_.replace('(', '')
        citation_mark_ = citation_mark_.replace(')', '')
        
        try:
            r = re.findall('.*' + citation_mark_ + '.*' , sentence_)
            if(len(r) >= 1):
                check = { 'text': sentence, 'citation': mark }
                if any(d == check for d in citations_in_one_p):
                    continue
                else:
                    #print("found")
                    return sentence
            
        except e:
            print("err")
            continue
    #print(cit_text)
    #print(citation_mark_)        
    #print("not found")

In [27]:
def extract_Citation_Schema(body, sections):
    citations_ref = body.xpath(".//xref")
    citations = []
    
    for i in citations_ref:
        if(i.attrib['ref-type'] == 'bibr'):
            citations.append(i)
    print(len(citations))
    citation_schema = []
    
    citations_in_one_p = []
    prev_context= None
    for citation in citations:            
        citation_mark = etree.tostring(citation, method='text', encoding='unicode')
        # try:
        #     citation_style = re.findall('(.+\d{4})', citation_mark)[0]
            
        # except:
        #     citation_style = citation_mark[:4]
            
        #Extracting the citation context
        context = etree.tostring(citation.getparent(), method='text', encoding='unicode') #citation paragraph
    
        #Extracting the citation section
        section = get_section(citation, sections) 
        
        #Extracting the full citation text
        text = extract_Citation_Text(context, citation.text, citations_in_one_p)
        
        #Checking the citations contain in one paragraph
        if (prev_context == None) or prev_context != context:
            prev_context = context
            citations_in_one_p = []
        else:
            citations_in_one_p.append({
                'text': text,
                'citation': citation.text
            })
            
        citation_id = citation.attrib['rid'].split()
        if len(citation_id) != 1:
            for ref in citation_id:
                citation_schema.append({
                    'reference_id': ref,
                    'citation_mark': citation.text,
                    'citation_section': section,
                    #'citation_context': context,
                    'citation_text': text,
                    'multi_citance': len(citation_id)
                })
        else:
            citation_schema.append({
                'reference_id': citation_id[0],
                'citation_mark': citation.text,
                'citation_section': section,
                #'citation_context': context,
                'citation_text': text,
                'multi_citance': 1
            })
            
    return citation_schema

In [28]:
def get_Citance_Count(citation_schema):
    counts = {}

    # Iterate through the list and update the counts dictionary
    for item in citation_schema:
        if item['citation_text'] != '':
            if item['citation_text'] in counts:
                counts[item['citation_text']] += 1
            else:
                counts[item['citation_text']] = 1

    # Iterate through the list and update the multi_citance field for each item based on the count in the counts dictionary
    for item in citation_schema:
        item['multi_citance'] = counts[item['citation_text']]

    return citation_schema

### Reference Schema

In [29]:
def get_Reference_DOI(reference):
    pub_ids = reference.findall(".//pub-id")

    doi = ''
    for id in pub_ids:
      if id.attrib['pub-id-type'] == 'doi':
        doi = id.text
        break
    # try:
    #     doi = etree.tostring(doi[0], method='text', encoding='unicode')
    # except:
    #     doi = ''
    
    reference_count = None
    if doi != '':
        crossref_response = requests.get('https://api.crossref.org/works/' + doi)
        if crossref_response.status_code != 404:
            reference_count = crossref_response.json()['message']['is-referenced-by-count']
    
    return reference_count, doi

In [30]:
def extract_Reference_Schema(body):
    references= body.xpath("//ref")
    reference_schema=[]
    for reference in references:
        try:
            article_title = reference.findall(".//article-title")
            article_title = etree.tostring(article_title[0], method='text', encoding='unicode')

        except:
            article_title = 'None'

        article_title = ' '.join(article_title.split())

        ref_author_name = ''
        ref_author = reference.findall('.//surname')
        for name in ref_author:
            try:
                ref_author_name += name.text + ', '
            except:
                break;
        
        reference_text = ''
        try:
            for ref_text in reference[1]:
                reference_text += etree.tostring(ref_text, method='text', encoding='unicode') + ' '
        except:
            reference_text = ''

        referenced_count, ref_doi = get_Reference_DOI(reference)
        #print(referenced_count)

        reference_text = re.sub(r"(\w)([A-Z])", r"\1 \2 ", reference_text)
        reference_full_text_split = ' '.join(reference_text.split())

        reference_schema.append({
        'id': reference.attrib['id'],
        'ref_doi': ref_doi,
        'ref_author': ref_author_name[:-2],
        'ref_text': reference_full_text_split,
        'ref_article_title': article_title,
        'is_referenced_count': referenced_count
    })
        
    return reference_schema

### Syntactic Analysis

In [31]:
def merge_Reference_Schema(citation_schema, reference_schema):
    for reference in reference_schema:
        reference['citations'] = []
        reference['syntactic_frequency'] = 0
        reference['polarity_score'] = 0
        reference['Introduction'] = 0
        reference['Method'] = 0
        reference['Results'] = 0
        reference['Discussion'] = 0
        reference['score'] = 0

        for citation in citation_schema:
            if(reference['id'] == citation['reference_id']):
                reference['citations'].append(citation)
                
                reference[citation['citation_section']] += 1

    references_having_citation = []            
    for reference in reference_schema:
      if len(reference['citations']) != 0:
        references_having_citation.append(reference)
        
    return reference_schema

In [32]:
def find_Reference_Frequency(reference_schema):
    reference_frequencies = []
    for i in reference_schema:
        i['syntactic_frequency'] = len(i['citations'])
        reference_frequencies.append(len(i['citations']))
    
    ref_freq_median = np.median(reference_frequencies)
    ref_freq_3rd_quarter =  np.percentile(reference_frequencies, 75)
    
    return reference_schema, ref_freq_median, ref_freq_3rd_quarter

## Sentiment Analysis

In [33]:
def find_Sentiment(reference_schema):
    for reference in reference_schema:
        for citation in reference['citations']:
            if citation['citation_text'] is not None:
                sentiment = SENTIMENT_MODEL(citation['citation_text'])
                sentiment_score = 0
                if (sentiment[0]['label'] == 'LABEL_1'):
                    sentiment_score = 1
                elif (sentiment[0]['label'] == 'LABEL_2'):
                    sentiment_score = 2 * sentiment[0]['score']
                citation['sentiment'] = sentiment_score
            else:
                citation['sentiment'] = 0
            
            
    return reference_schema

### Scoring

In [34]:
def category(reference, quartile_one_third, quartile_two_third):
    if (reference['score'] <= quartile_one_third ):
        reference['scoring_category'] = "Least Important"

    elif (reference['score'] <= quartile_two_third):
        reference['scoring_category'] = "Important"

    else:
        reference['scoring_category'] = "Most Important"
    
    return reference['scoring_category']

In [35]:
def scoring(reference_schema, ref_freq_median, ref_freq_3rd_quarter):
    scores = []
    #Semantic Scoring
    for reference in reference_schema:
        total_citations = len(reference['citations'])
        if total_citations == 0:
          reference_schema.remove(reference)
          continue
        sentiment_score = 0
        for citation in reference['citations']:
            sentiment_score += citation['sentiment']
        
        reference['polarity_score'] = sentiment_score / total_citations
        reference['score'] = reference['polarity_score']
        
    
    #Syntactic Scoring
    for reference in reference_schema:
        syntactic = 0
        for citation in reference['citations']:
            syntactic += 1 / citation['multi_citance']
        if (syntactic < ref_freq_median):
            reference['score'] += 1

        elif (reference['syntactic_frequency'] < ref_freq_3rd_quarter):
            reference['score'] += 2

        else:
            reference['score'] += 3
        
        reference['syntactic_score'] = syntactic
            
    #IMRAD Scoring 
    for reference in reference_schema:
        imrad_score = reference['Introduction'] + (reference['Method'] * 2)  + (reference['Results'] * 1.5) + (reference['Discussion'] * 1.25)
       
        imrad_score /= reference['syntactic_frequency']
        reference['score'] += imrad_score
        
        scores.append(reference['score'])
    
    quartile_one_third = np.percentile(scores, 40)
    quartile_two_third = np.percentile(scores, 75)
    
    #Merging all scores    
    for reference in reference_schema:
        reference['scoring_category'] = category(reference, quartile_one_third, quartile_two_third)
    
    return reference_schema

### Open Article

In [1]:
def check_Article_Uploaded(doi):
  try:
    article = Article.objects.get(doi=doi)
    return True
  except Article.DoesNotExist:
    return False

In [2]:
def open_article(file_path):
  article = open(file_path, 'r', encoding='utf-8')
  xml_parser = etree.XMLParser(remove_blank_text=True)
  xml_tree = etree.parse(file_path, xml_parser)
  front = xml_tree.xpath("//front")
  front = front[0]
  try:
      doi, title, journal, publisher, publish_date, authors, abstract = find_Metadata(front, xml_tree)

      if check_Article_Uploaded(doi) == True:
        return ("Article with doi '{}' already exists".format(doi))

      body = xml_tree.xpath("//body")
      body = body[0]

      article_text_length, introduction_length, method_length, result_length, discussion_length = get_Article_Length(body)
      print("Article length found")

      sections = get_Article_Sections(body, introduction_length, method_length, result_length, discussion_length)

      citation_schema = extract_Citation_Schema(body, sections)
      print("Extracted Citation Schema")

      citation_schema = get_Citance_Count(citation_schema)

      reference_schema = extract_Reference_Schema(body)
      print("Extracted Reference Schema")

      reference_schema = merge_Reference_Schema(citation_schema, reference_schema)

      reference_schema, ref_freq_median, ref_freq_3rd_quarter = find_Reference_Frequency(reference_schema)

      print("Sentiment...")
      reference_schema = find_Sentiment(reference_schema)

      reference_schema = scoring(reference_schema, ref_freq_median, ref_freq_3rd_quarter)
      print("Scoring done")

      schema = {
        "doi": doi,
        "article_title": title,
        "abstract": abstract,
        "journal_title": journal,
        "publisher_name": publisher,
        "publish_date": publish_date,
        "article_authors": authors,
        #"total_citations": len(citation_schema),
        #"total_references": len(reference_schema),
        "references": reference_schema
      }

        # Start the session
      try:
          # Save the article
          article = Article(**schema)
          article.save()
          
          print("Successful: ", file_path)

      except NotUniqueError:
          # Handle the error
          print("Article with doi '{}' already exists".format(article.doi))

      except Exception as e:
          # Roll back the changes
          print(e)

      return schema 
  

  except Exception as e:
      print(e)
  #crossref_response = find_CrossRef_Response(doi)

In [None]:
%%time
import os

folder_path = 'dataset/'

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    if os.path.isfile(file_path):
        print("Article open: ", file_name)
        try:
          open_article(file_path)
          print("Article succesfully uploaded.")
          print("-----------------------------")
        except:
          print("Article failed: ", file_path)
          print(traceback.format_exc())
          continue
#root = 'peerj-cs-490.xml'


Article open:  PMC2978390.xml
Article length found
86
Extracted Citation Schema
Extracted Reference Schema
Sentiment...
float division by zero
Article succesfully uploaded.
-----------------------------
Article open:  PMC2978344.xml
Article length found
84
Extracted Citation Schema
Extracted Reference Schema
Sentiment...
Scoring done
Successful:  dataset/PMC2978344.xml
Article succesfully uploaded.
-----------------------------
Article open:  PMC2975984.xml
list index out of range
Article succesfully uploaded.
-----------------------------
Article open:  PMC2975999.xml
Article length found
8
Extracted Citation Schema
Extracted Reference Schema
Sentiment...
Scoring done
Successful:  dataset/PMC2975999.xml
Article succesfully uploaded.
-----------------------------
Article open:  PMC2978317.xml
Article length found
35
Extracted Citation Schema
Extracted Reference Schema
Sentiment...
Scoring done
Successful:  dataset/PMC2978317.xml
Article succesfully uploaded.
---------------------------

In [None]:

try:
  schema = open_article('peerj-cs-490.xml')
  print("Article succesfully uploaded.")
  print("-----------------------------")
except:
  print("Article failed:")
  print(traceback.format_exc())

In [None]:
df = pd.DataFrame(schema['references'])

In [None]:
df.groupby('score').mean()

In [None]:
df['score'].value_counts()

In [None]:
df['scoring_category'].value_counts()