In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so that files stored
# on Drive can be reached through the local file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/InfoQualityLab/

/content/drive/MyDrive/InfoQualityLab


In [None]:
# `os` and `folder` are needed here, before the parsing cell further down
# defines them, so they are set up locally; otherwise this cell raises
# NameError on a fresh kernel (Restart & Run All).
import os

# Folder (relative to the working directory) that holds the article XML files.
# The parsing cell below assigns the same value.
folder = 'JCDLPaperDataset'

# Names of all XML files in the dataset folder; the count is printed as a
# sanity check of how many articles are available.
xmls = [file for file in os.listdir(folder) if file.endswith('.xml')]
print(len(xmls))

63


In [None]:
import os, pandas as pd
from lxml import etree

# Prefix -> namespace URI map handed to every XPath query in this notebook.
namespaces = dict(
    elsevier='http://www.elsevier.com/xml/svapi/article/dtd',
    ce='http://www.elsevier.com/xml/common/dtd',
    sb='http://www.elsevier.com/xml/common/struct-bib/dtd',
)

# Surname of the cited author of interest (the author lookup in the parsing
# loop searches for this same surname).
author_surname = 'Willoughby'

def is_root(element):
    """Tell whether `element` is the root element of the tree it belongs to.

    The check is by identity: the element's owning tree is asked for its root
    and that object is compared with `element` itself.
    """
    document_root = element.getroottree().getroot()
    return document_root is element

folder = 'JCDLPaperDataset'

# Phrase from the title of the cited paper; used to pick the right bibliography
# entry when a file contains several authors with the searched surname.
target_title_fragment = 'guide to small-molecule structure assignment'

# Column order of the output table. Passing it explicitly keeps the schema
# stable even if no citation is found in any file.
result_columns = ['file', 'citation_marker', 'paragraph', 'section', 'is_multicitation']


def _ancestor(node, levels):
    """Return the element `levels` steps above `node`, or None if the tree is shallower."""
    for _ in range(levels):
        if node is None:
            return None
        node = node.getparent()
    return node


def _enclosing(node, local_name):
    """Yield the ancestors of `node` tagged ce:<local_name>, nearest first.

    The walk stops when the document root is reached; the root element itself
    is never yielded.
    """
    wanted_tag = f'{{{namespaces["ce"]}}}{local_name}'
    parent = node.getparent()
    while parent is not None and not is_root(parent):
        if parent.tag == wanted_tag:
            yield parent
        parent = parent.getparent()


def _find_citations(tree, ref_id):
    """Return `(elements, is_multicitation)` for the in-text citations of `ref_id`.

    Single citations (ce:cross-ref with exactly this refid) are preferred. Only
    when none exist are grouped citations (ce:cross-refs, whose refid attribute
    is a whitespace-separated list of ids) searched.
    """
    single = tree.xpath(f'.//*/ce:cross-ref[@refid="{ref_id}"]', namespaces=namespaces)
    if single:
        return single, False

    # if we can't find the reference, it's probably part of a list
    candidates = tree.xpath(f'.//*/ce:cross-refs[contains(@refid, "{ref_id}")]',
                            namespaces=namespaces)
    # contains() is a substring test (e.g. "bib1" also matches "bib10"), so each
    # candidate is re-checked against its OWN id list, not the first candidate's.
    grouped = [candidate for candidate in candidates
               if ref_id in candidate.attrib.get('refid', '').split()]
    return grouped, True


records = []
# Sorted so that the row order of the output does not depend on the arbitrary
# order in which os.listdir returns directory entries.
for file in sorted(os.listdir(folder)):
    if not file.endswith('.xml'):
        continue

    root = etree.parse(os.path.join(folder, file))

    # Elements that have a ce:surname child equal to the searched surname.
    author_xml = root.xpath(f'.//*[ce:surname="{author_surname}"]', namespaces=namespaces)

    # if the author was not found at all
    if len(author_xml) == 0:
        print(f'Unable to find author in: {file}')

    # if there are multiple matches, find the correct reference using the title
    if len(author_xml) > 1:
        for axml in author_xml:
            container = _ancestor(axml, 2)
            if container is None:
                continue
            title = container.xpath('./sb:title/sb:maintitle', namespaces=namespaces)
            if len(title) == 1:
                # .text is None when the title starts with a child element
                if target_title_fragment in (title[0].text or ''):
                    author_xml = [axml]
                    break
                else:
                    print(title[0].text)

    if len(author_xml) != 1:
        print(f'Error processing file: {file}')
        continue

    author = author_xml[0]

    # The id that in-text citations point to is read from the elements three
    # and four levels above the author element; levels that do not exist or
    # carry no id are skipped.
    ref_ids = []
    for levels in (3, 4):
        holder = _ancestor(author, levels)
        if holder is not None and holder.attrib.get('id'):
            ref_ids.append(holder.attrib.get('id'))

    found = False
    for ref_id in ref_ids:
        ref_xmls, is_multicitation = _find_citations(root, ref_id)

        for ref_xml in ref_xmls:
            citation_marker = ref_xml.xpath('string()')

            # Full text of the nearest enclosing paragraph ('' if there is none).
            paragraph_xml = next(_enclosing(ref_xml, 'para'), None)
            paragraph = paragraph_xml.xpath('string()') if paragraph_xml is not None else ''

            # Title of the nearest enclosing section that has a title; a section
            # without ce:section-title is skipped instead of raising IndexError.
            section = ''
            for section_xml in _enclosing(ref_xml, 'section'):
                titles = section_xml.xpath('./ce:section-title', namespaces=namespaces)
                if titles:
                    section = titles[0].xpath('string()')
                    break

            records.append({'file': file,
                            'citation_marker': citation_marker,
                            'paragraph': paragraph,
                            'section': section,
                            'is_multicitation': is_multicitation})
            found = True

    if not found:
        print(f'Cannot find reference in the text: {file}')

# One row per in-text citation of the target reference.
result = pd.DataFrame(records, columns=result_columns)
result.head()

Cannot find reference in the text: file53.xml
Unable to find author in: file57.xml
Error processing file: file57.xml


Unnamed: 0,file,citation_marker,paragraph,section,is_multicitation
0,file52.xml,Willoughby et al. (2014),"Recently, the application of DFT to a complex ...",Nuclear Magnetic Resonance,False
1,file54.xml,58–60,To better understand the fluorescence performa...,The density functional theory (DFT) calculation,True
2,file55.xml,\n 15\n ...,NMR calculation has been commonly used in stru...,Structural elucidation of three novel polyketides,False
3,file56.xml,[17–19],Geometrically optimized conformers for possibl...,Chemical shifts calculation and DP4+ analysis,True
4,file58.xml,[19],The conformational search was undertaken using...,ECD and NMR chemical shifts calculation,False


In [None]:
# Number of distinct files that contributed at least one row to `result`.
result['file'].nunique(dropna=False)

61

In [None]:
# Total number of rows in `result` (one per extracted in-text citation).
result.shape[0]

77

In [None]:
# Persist the extracted citation contexts; the DataFrame index is not written.
result.to_csv(path_or_buf='XMLParseOutput_v3.csv', index=False)

In [None]:
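# NOTE: file53.xml is the file reported above as "Cannot find reference in the text".
# Issue: "rawtext" - presumably the article body is only available as raw text
# without ce:cross-ref markup, so no citation element can be located. TODO: confirm.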

Compare with Heng's manual extraction

In [None]:
import pandas as pd

# Load the manually extracted citation contexts. The comparison cell below
# reads the columns 'Unnamed: 0' (matched against the file number minus one)
# and '1: Quote' (the manually extracted text) from this table.
manual = pd.read_csv('WH_protocol_Heng_check.csv')

In [None]:
!pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.7.0 thefuzz-0.22.1


In [None]:
from thefuzz import fuzz

# Pair every automatically extracted paragraph with each manually extracted
# quote recorded for the same article. The manual table is keyed by the file
# number minus one ('fileNN.xml' -> NN - 1). 'difference' is 'None' when the
# two texts are exactly equal and '' otherwise.
comparison_rows = []
for _, auto_row in result.iterrows():
    file_id = int(auto_row['file'][4:-4]) - 1
    same_file = manual['Unnamed: 0'] == file_id
    quotes = manual.loc[same_file, '1: Quote'].tolist()

    for quote in quotes:
        comparison_rows.append({
            'id': file_id,
            'manual': quote,
            'automated': auto_row['paragraph'],
            'difference': 'None' if quote == auto_row['paragraph'] else '',
        })

comparison = pd.DataFrame(comparison_rows)
comparison.head()

Unnamed: 0,id,manual,automated,difference
0,51,"Recently, the application of DFT to a complex ...","Recently, the application of DFT to a complex ...",
1,53,To better understand the fluorescence performa...,To better understand the fluorescence performa...,
2,54,NMR calculation has been commonly used in stru...,NMR calculation has been commonly used in stru...,
3,55,Geometrically optimized conformers for possibl...,Geometrically optimized conformers for possibl...,
4,57,The conformational search was undertaken using...,The conformational search was undertaken using...,


In [None]:
# Persist the manual-vs-automated pairs; the DataFrame index is not written.
comparison.to_csv(path_or_buf='manual_vs_automated_comparison.csv', index=False)