## **Nature_Extract_Author_Contribution**

In [None]:
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from google.colab import files
import xml.etree.ElementTree as ET
from google.colab import drive
import zipfile
import io
import os

In [None]:
def get_soup(u, timeout=60):
  """
  Download a page and parse it with Beautiful Soup.

  Args:
    u: URL of the page to fetch.
    timeout: seconds handed to requests.get. Without a timeout a single
      stalled connection would block the caller indefinitely.

  Returns:
    A BeautifulSoup object built with the 'html.parser' parser.

  Raises:
    requests.exceptions.RequestException: on connection failure or timeout.
    UnicodeDecodeError: if the response body is not valid UTF-8.
  """
  response = requests.get(u, timeout=timeout)
  html = response.content.decode('utf-8')
  return BeautifulSoup(html, 'html.parser')

def get_highest_page_count(soup):
  """
  Return the largest page number among the pagination links of a listing page.

  Every <a class="c-pagination__link"> is rendered to a string and the first
  "page=<number>" occurrence in it is read as that link's page number.

  Raises:
    IndexError: if a pagination link contains no "page=<number>".
    ValueError: if the page has no pagination links at all.
  """
  page_number_re = r'page=(\d+)'
  page_numbers = []
  for link in soup.find_all('a', attrs={'class': 'c-pagination__link'}):
    found = re.findall(page_number_re, str(link))
    page_numbers.append(int(found[0]))
  return max(page_numbers)

def extract_title_urls_contribution(topics,type_url):
  """
  Scrape article metadata from nature.com subject listings and save it as JSON.

  For every topic, each listing page is fetched and every link whose href
  starts with "/articles/" is followed. Per article the function collects:
  1. The publication year (span with data-test="article-publication-year").
  2. The author contributions: the first <p> after the h3 sub-heading
     "Contributions".
  3. The author addresses (p.c-article-author-affiliation__address).
  4. The author names (p.c-article-author-affiliation__authors-list).

  One file "<topic>.json" is written per topic and passed to files.download().
  It is also written and downloaded after every 30th page, so a long run
  leaves intermediate results behind. The saved structure is
      {page number: {'contribution': [...], 'url': [...], 'title': [...],
                     'authors': [...], 'address': [...], 'year': [...]}}
  where the six lists are parallel (one entry per article on that page).
  Missing year or contribution text is stored as ''.

  Args:
    topics: iterable of subject names as they appear in the listing URL.
    type_url: 'Nature Communication' selects the "ncomms" listing; any other
      value selects the "srep" listing.
  """

  def save_and_download(topic, pages):
    # Write everything collected so far for this topic and hand it to Colab.
    with open(topic+".json", 'w') as json_file:
      json.dump(pages, json_file, indent=4)
    files.download(topic+".json")

  #all the article urls show up in this pattern
  pattern_url = r'href="(.*?)"\s*itemprop="(.*?)"'

  for topic in topics:
    dict_all={}
    if type_url=='Nature Communication':
      journal='ncomms'
    else:
      # "srep" journal listing (presumably Scientific Reports)
      journal='srep'
    u = "https://www.nature.com/subjects/"+topic+"/"+journal+"?searchType=journalSearch&sort=PubDate&page="

    soup=get_soup(u+str(1))
    try:
      max_page=get_highest_page_count(soup)
    except (IndexError, ValueError):
      # no usable pagination links: treat the listing as a single page
      max_page=1

    for i in range(1,max_page+1):
      dict1={'contribution':[],'url':[],'title':[],'authors':[],"address":[],'year':[]}
      print(i)
      if i>1:
        # page 1 was already fetched to read the page count
        soup=get_soup(u+str(i))
      l = soup.find_all('a', href=lambda href: href and href.startswith("/articles/"))
      urls=[re.findall(pattern_url, str(link))[0][0] for link in l]
      titles=[link.text.strip() for link in l]

      for url,title in zip(urls,titles):
        url_article="https://www.nature.com/"+topic+url
        soup_article=get_soup(url_article)

        year_tag=soup_article.find('span', {'data-test': 'article-publication-year'})
        year=year_tag.text.strip() if year_tag is not None else ''

        #contribution is under the class c-article__sub-heading under h3 tag and shows after <p .....</p>
        heading=soup_article.find('h3', class_='c-article__sub-heading', string='Contributions')
        paragraph=heading.find_next('p') if heading is not None else None
        authour_contribution=paragraph.text.strip() if paragraph is not None else ''

        address_elements = soup_article.find_all('p', class_='c-article-author-affiliation__address')
        author_elements=soup_article.find_all('p', class_='c-article-author-affiliation__authors-list')
        authors=[author.text.strip() for author in author_elements]
        address=[add.text.strip() for add in address_elements]

        dict1['contribution'].append(authour_contribution)
        dict1['url'].append(url)
        dict1['title'].append(title)
        dict1['authors'].append(authors)
        dict1['address'].append(address)
        dict1['year'].append(year)
      dict_all[i]=dict1
      #after 30 pages download the file. Because the running is over 20 hours, so I divided to batches to save the results
      if i%30==0:
        save_and_download(topic, dict_all)

    save_and_download(topic, dict_all)

In [None]:
# Add the subject names to scrape to the `topics` list; each one is inserted
# into the listing URL built by extract_title_urls_contribution.
topics=['ecology']
extract_title_urls_contribution(topics,'Nature Communication')

## **OpenPlus Corpus**

### **Extract Zip File From Drive**

In [None]:
# Mount Google Drive at /content/drive so the corpus zip below can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the corpus archive inside the mounted Drive
zip_file_path = '/content/drive/MyDrive/data_mining_project/allofplos.zip'

# Folder the archive is unpacked into (read later via os.listdir)
extracted_folder_path = '/content/extracted_folder/'

# Open the zip file read-only; the `with` block closes it afterwards
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all contents to the specified folder
    zip_ref.extractall(extracted_folder_path)

### **Title:**

In [None]:
def get_title(root):
  '''
  Return the full article title as plain text.

  The title is the text of <article-title> inside <title-group>, including
  the text of nested markup such as <italic>. The text is taken from the
  parsed tree rather than from serialized XML, so characters like "&" and
  "<" come back as themselves and not as "&amp;" / "&lt;".

  Raises AttributeError if <title-group> or <article-title> is missing.
  '''
  title_group = root.find('.//title-group')
  article_title = title_group.find('.//article-title')
  return ''.join(article_title.itertext()).strip()

### **Year:**

In [None]:
def get_year(root):
  '''
  Return the publication year of the article as a string.

  The year is the text of the first <year> found inside the first <pub-date>
  element below root.

  Raises AttributeError if either element is missing.
  '''
  return root.find('.//pub-date').find('.//year').text

### **Contribution:**

In [None]:
#type1:
def get_contribution(root):
    '''
    Return the author-contribution statement of an article.

    The statement is the full text of the first element carrying the
    attribute fn-type="con", with surrounding whitespace removed.

    Returns the text as a string, or an empty list when there is no such
    element (or root is None). Callers compare the result with [].
    '''
    try:
        statement = root.findall(".//*[@fn-type='con']")[0]
    except (AttributeError, IndexError):
        # AttributeError: root is None; IndexError: no fn-type="con" element
        return []
    return ET.tostring(statement, encoding='utf-8', method='text').decode('utf-8').strip()

### **Address:**

In [None]:
def get_address(root):
  '''
  Map affiliation labels to address text for one article.

  With no <addr-line> at all the result is {}. With one or two <addr-line>
  elements only the first is kept, under the label '1'. With more, the i-th
  <addr-line> is paired with the i-th <aff>; the label is the leading run of
  digits in the text of the <aff>'s first child (or, if that text is empty,
  in the text of the whole <aff>). Entries whose label does not start with a
  digit are left out, and pairing stops when the <aff> elements run out.

  The whole leading number is used as the label, so affiliations "1" and
  "10" get separate keys.

  Returns a dict {label string: address string}.
  '''
  def _text(elem):
    return ET.tostring(elem, encoding='utf-8', method='text').decode('utf-8')

  address = root.findall('.//addr-line')
  if not address:
    return {}
  if len(address) <= 2:
    return {'1': _text(address[0]).strip()}

  aff = root.findall('.//aff')
  addr_line_texts = {}
  for elem, aff_elem in zip(address, aff):
    label_text = _text(aff_elem[0]) if len(aff_elem) else ''
    if not label_text:
      # no child, or an empty one: read the label from the <aff> text itself
      label_text = _text(aff_elem)

    aff_num = ''
    for ch in label_text:
      if not ch.isdigit():
        break
      aff_num += ch

    if aff_num:
      addr_line_texts[aff_num] = _text(elem).strip()

  return addr_line_texts

### **Names:**


In [None]:
def get_names_address(root,contrib_found):
  '''
  Collect name, addresses and (when present) roles for every author.

  Authors are the elements with contrib-type="author". The first author
  decides the layout of the article: if it has <role> descendants the
  layout flag is 1 and every entry gets a 'roles' list, otherwise the flag
  is 0. With flag 0 and contrib_found == [] nothing is collected.

  For each author with a <surname>:
    * 'full name' is the surname followed by the given names (if any).
    * 'address' is built from the author's <sup> labels looked up in
      get_address(root); the label '*' adds the list of all addresses, and
      labels without a matching address are skipped. An author without any
      <sup> gets the address labelled '1' when there is one.
    * 'roles' holds the text of each <role> (flag 1 only).

  Args:
    root: element to search (the caller passes <article-meta>).
    contrib_found: result of get_contribution for the same article.

  Returns:
    (list of per-author dicts, layout flag 0 or 1).
  '''
  def _text(elem):
    return ET.tostring(elem, encoding='utf-8', method='text').decode('utf-8').strip()

  lst_names_affs=[]
  authors=root.findall(".//*[@contrib-type='author']")
  if not authors:
    return lst_names_affs,0

  type_xml=1 if authors[0].findall('.//role') else 0
  if type_xml==0 and contrib_found==[]:
    return lst_names_affs,type_xml

  # Looked up once, on the first author that needs it (same for all authors).
  addr_line_texts=None
  for elem in authors:
    surnames=elem.findall('.//surname')
    if not surnames:
      continue
    given_names=elem.findall('.//given-names')
    if addr_line_texts is None:
      addr_line_texts=get_address(root)

    affs=[]
    labels=elem.findall('.//sup')
    if labels:
      for label in labels:
        aff_num=_text(label)
        if aff_num=='*':
          affs.append(list(addr_line_texts.values()))
        elif aff_num in addr_line_texts:
          affs.append(addr_line_texts[aff_num])
    elif '1' in addr_line_texts:
      affs.append(addr_line_texts['1'])

    full_name=_text(surnames[0])
    if given_names:
      full_name+=' '+_text(given_names[0])

    author_entry={'full name':full_name,'address':affs}
    if type_xml==1:
      author_entry['roles']=[_text(role) for role in elem.findall('.//role')]
    lst_names_affs.append(author_entry)
  return lst_names_affs,type_xml

### **Create Json With Relevant Features:**

In [None]:
# Build plosone.json: one entry per XML file that has contribution information.
path='/content/extracted_folder/'
# NOTE: this rebinds `files`, which the import cell bound to google.colab's
# `files` module; files.download(...) no longer works after this cell has run.
files = os.listdir(path)
dict_files_meta_data={}
for xml_name in files:
  file_name = os.path.join(path, xml_name)
  with open(file_name, 'r', encoding='utf-8') as file:
    content = file.read()

  try:
    root = ET.fromstring(content)
  except ET.ParseError:
    # not well-formed XML: skip the file
    continue
  # All metadata used below lives under <article-meta>
  root = root.find('.//article-meta')
  if root is None:
    continue
  contribution=get_contribution(root)
  dict_name_address_role=get_names_address(root,contribution)
  names_addresses,type_xml=dict_name_address_role
  if contribution==[] and (type_xml==0 or names_addresses==[]):
    #there is no contribution in this article
    continue

  # Title and year are read before any entry is built so that every entry
  # carries the values of the current file.
  year=get_year(root)
  title=get_title(root)
  if names_addresses==[]:
    # contribution text only, no per-author information
    dict_file={'title':title,'year':year,'contribution':contribution}
  elif 'roles' in names_addresses[0]:
    dict_file={'title':title,'year':year,'name_address_contribution':names_addresses}
  else:
    dict_file={'title':title,'year':year,'name_address':names_addresses,'contribution':contribution}

  dict_files_meta_data[file_name]=dict_file

with open("plosone.json", 'w') as json_file:
    json.dump(dict_files_meta_data, json_file, indent=4)


In [None]:
# Load one file from the directory listing (index 43039) for manual inspection.
file_name = os.path.join(path, files[43039])
with open(file_name, 'r') as file:
    content = file.read()


In [None]:
# Show the raw XML text that was just read.
print(content)

## Checking:

In [None]:
import xml.etree.ElementTree as ET
# Parse the XML text held in `content`; fromstring returns the root Element
root = ET.fromstring(content)
# Narrow 'root' to the <article-meta> element, which the helpers below search
root = root.find('.//article-meta')

### Title:

In [None]:
def get_title(root):
  '''
  Return the full article title as plain text.

  The title is the text of <article-title> inside <title-group>, including
  the text of nested markup such as <italic>. The text is taken from the
  parsed tree rather than from serialized XML, so characters like "&" and
  "<" come back as themselves and not as "&amp;" / "&lt;".

  Raises AttributeError if <title-group> or <article-title> is missing.
  '''
  title_group = root.find('.//title-group')
  article_title = title_group.find('.//article-title')
  return ''.join(article_title.itertext()).strip()

In [None]:
# Check get_title on the sample article loaded above.
get_title(root)

'Common Mental Disorders Associated with Tuberculosis: A Matched Case-Control Study'

### Year:

In [None]:
def get_year(root):
  '''
  Return the publication year of the article as a string.

  The year is the text of the first <year> found inside the first <pub-date>
  element below root.

  Raises AttributeError if either element is missing.
  '''
  return root.find('.//pub-date').find('.//year').text

In [None]:
# Check get_year on the sample article loaded above.
get_year(root)

'2014'

### Contribution:

In [None]:
#type1:
def get_contribution(root):
    '''
    Return the author-contribution statement of an article.

    The statement is the full text of the first element carrying the
    attribute fn-type="con", with surrounding whitespace removed.

    Returns the text as a string, or an empty list when there is no such
    element (or root is None). Callers compare the result with [].
    '''
    try:
        statement = root.findall(".//*[@fn-type='con']")[0]
    except (AttributeError, IndexError):
        # AttributeError: root is None; IndexError: no fn-type="con" element
        return []
    return ET.tostring(statement, encoding='utf-8', method='text').decode('utf-8').strip()

In [None]:
# Check get_contribution on the sample article loaded above.
get_contribution(root)

'Conceived and designed the experiments: SMP DNS MLB LCR. Performed the experiments: GSA SMP DNS JMM LCR MLB. Analyzed the data: GSA SMP DNS JMM LCR MLB. Contributed reagents/materials/analysis tools: GSA SMP DNS JMM LCR MLB. Wrote the paper: GSA SMP DNS LCR MLB.'

### Names:


In [None]:
def get_names_address(root,contrib_found):
  '''
  Collect name, addresses and (when present) roles for every author.

  Authors are the elements with contrib-type="author". The first author
  decides the layout of the article: if it has <role> descendants the
  layout flag is 1 and every entry gets a 'roles' list, otherwise the flag
  is 0. With flag 0 and contrib_found == [] nothing is collected.

  For each author with a <surname>:
    * 'full name' is the surname followed by the given names (if any).
    * 'address' is built from the author's <sup> labels looked up in
      get_address(root); the label '*' adds the list of all addresses, and
      labels without a matching address are skipped. An author without any
      <sup> gets the address labelled '1' when there is one.
    * 'roles' holds the text of each <role> (flag 1 only).

  Args:
    root: element to search (the caller passes <article-meta>).
    contrib_found: result of get_contribution for the same article.

  Returns:
    (list of per-author dicts, layout flag 0 or 1).
  '''
  def _text(elem):
    return ET.tostring(elem, encoding='utf-8', method='text').decode('utf-8').strip()

  lst_names_affs=[]
  authors=root.findall(".//*[@contrib-type='author']")
  if not authors:
    return lst_names_affs,0

  type_xml=1 if authors[0].findall('.//role') else 0
  if type_xml==0 and contrib_found==[]:
    return lst_names_affs,type_xml

  # Looked up once, on the first author that needs it (same for all authors).
  addr_line_texts=None
  for elem in authors:
    surnames=elem.findall('.//surname')
    if not surnames:
      continue
    given_names=elem.findall('.//given-names')
    if addr_line_texts is None:
      addr_line_texts=get_address(root)

    affs=[]
    labels=elem.findall('.//sup')
    if labels:
      for label in labels:
        aff_num=_text(label)
        if aff_num=='*':
          affs.append(list(addr_line_texts.values()))
        elif aff_num in addr_line_texts:
          affs.append(addr_line_texts[aff_num])
    elif '1' in addr_line_texts:
      affs.append(addr_line_texts['1'])

    full_name=_text(surnames[0])
    if given_names:
      full_name+=' '+_text(given_names[0])

    author_entry={'full name':full_name,'address':affs}
    if type_xml==1:
      author_entry['roles']=[_text(role) for role in elem.findall('.//role')]
    lst_names_affs.append(author_entry)
  return lst_names_affs,type_xml

In [None]:
# Check get_names_address on the sample article, passing its contribution text.
get_names_address(root,get_contribution(root))


### Address:

In [None]:
def get_address(root):
  '''
  Map affiliation labels to address text for one article.

  With no <addr-line> at all the result is {}. With one or two <addr-line>
  elements only the first is kept, under the label '1'. With more, the i-th
  <addr-line> is paired with the i-th <aff>; the label is the leading run of
  digits in the text of the <aff>'s first child (or, if that text is empty,
  in the text of the whole <aff>). Entries whose label does not start with a
  digit are left out, and pairing stops when the <aff> elements run out.

  The whole leading number is used as the label, so affiliations "1" and
  "10" get separate keys.

  Returns a dict {label string: address string}.
  '''
  def _text(elem):
    return ET.tostring(elem, encoding='utf-8', method='text').decode('utf-8')

  address = root.findall('.//addr-line')
  if not address:
    return {}
  if len(address) <= 2:
    return {'1': _text(address[0]).strip()}

  aff = root.findall('.//aff')
  addr_line_texts = {}
  for elem, aff_elem in zip(address, aff):
    label_text = _text(aff_elem[0]) if len(aff_elem) else ''
    if not label_text:
      # no child, or an empty one: read the label from the <aff> text itself
      label_text = _text(aff_elem)

    aff_num = ''
    for ch in label_text:
      if not ch.isdigit():
        break
      aff_num += ch

    if aff_num:
      addr_line_texts[aff_num] = _text(elem).strip()

  return addr_line_texts

In [None]:
# Check get_address on the sample article loaded above.
get_address(root)

{'2': 'Departamento de Medicina Interna, School of Medicine and Public Health, Bahia, Brazil',
 '3': 'Department of Epidemiology and Population Health, London School of Hygiene and Tropical Medicine, London, United Kingdom'}