In [None]:
# Script para la extracción de frases y referencias de un PDF a un CSV
# Versión 1.0

In [4]:
from lxml import etree
import re
import csv
import os

def format_reference(ref):
    """Return the ``(title, year)`` pair for one bibliography reference element.

    The title is the first ``article-title`` text node; when there is none,
    the first ``source`` text node is used instead. The year is the first
    ``year`` text node and is only looked up once a title has been found.
    Anything that is missing comes back as an empty string.

    Per the original author's note, these two fields are later used as the
    inputs of an API that resolves DOIs.

    Args:
        ref: Element exposing an ``xpath()`` method (an lxml ``<ref>`` node).

    Returns:
        A ``(title, year)`` tuple of strings.
    """
    article_titles = ref.xpath('.//article-title/text()')
    source_names = ref.xpath('.//source/text()')

    # Prefer the article title; fall back to the source (journal/book) name.
    title_candidates = article_titles or source_names
    if not title_candidates:
        return '', ''

    found_years = ref.xpath('.//year/text()')
    return title_candidates[0], (found_years[0] if found_years else '')


def normalize_whitespace(text):
    """Collapse every run of whitespace (spaces, tabs, line breaks) into one space.

    Leading and trailing whitespace is collapsed as well, not removed.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)


def split_sentences_with_tags(paragraph_text):
    """Split a paragraph (plain text or serialized markup) into sentences.

    A sentence boundary is whitespace that follows ``.``, ``!`` or ``?``,
    unless the next non-space character is ``)`` or ``[``. Periods that
    belong to a known abbreviation (``e.g.``, ``i.e.``, ``et al.``, ``fig.``,
    ``vs.``, ``p.``; matched case-insensitively and only as whole words) never
    end a sentence.

    The abbreviations are matched with an escaped pattern, so ordinary words
    such as "edge" or "figs" are left alone, and the text is returned exactly
    as it was written (original capitalisation is preserved).

    Args:
        paragraph_text: The paragraph to split.

    Returns:
        A list of sentence strings; ``['']`` for an empty input.
    """
    # Stand-in for the period of a protected abbreviation while splitting.
    # NUL is not a legal character in XML/HTML text, so it cannot collide
    # with real content coming from a parsed document.
    dot_sentinel = '\x00'

    abbreviation_re = re.compile(
        r'\b(?:e\.g|i\.e|et\s+al|fig|vs|p)\.',
        flags=re.IGNORECASE,
    )

    # Hide only the periods; every other character of the match stays as is,
    # so restoring them later reproduces the original text verbatim.
    protected_text = abbreviation_re.sub(
        lambda match: match.group(0).replace('.', dot_sentinel),
        paragraph_text,
    )

    # Split on whitespace after sentence-ending punctuation, except when a
    # closing parenthesis or an opening bracket comes next.
    pieces = re.split(r'(?<=[.!?])(?!\s*[\)\[])\s+', protected_text)

    return [piece.replace(dot_sentinel, '.') for piece in pieces]


def split_sentences(sentences):
    """Re-join sentence fragments that were split right after an abbreviation.

    When a fragment ends with ``e.g.``, ``et al.``, ``i.e.``, ``fig.`` or
    ``vs.`` (whole word, any capitalisation) and the next fragment starts with
    a lowercase letter, a digit or ``(``, the two are merged with a single
    space. The merged fragment is checked again, so chains of such splits
    collapse into one sentence.

    An empty following fragment is never treated as a continuation (it used
    to raise ``IndexError``).

    Args:
        sentences: List of sentence strings; it is modified in place.

    Returns:
        The same list object, after merging.
    """
    # \b keeps words like "config." from being mistaken for "fig.";
    # \Z anchors at the true end of the string.
    trailing_abbreviation = re.compile(
        r'\b(?:e\.g|et al|i\.e|fig|vs)\.\Z',
        flags=re.IGNORECASE,
    )

    i = 0
    while i < len(sentences) - 1:
        current, following = sentences[i], sentences[i + 1]
        looks_like_continuation = bool(following) and (
            following[0].islower()
            or following[0].isdigit()
            or following.startswith('(')
        )
        if looks_like_continuation and trailing_abbreviation.search(current):
            sentences[i] = current + ' ' + following
            del sentences[i + 1]
            # Stay on the same index: the merged text may need another merge.
        else:
            i += 1
    return sentences


def extract_sentences_with_references_correctly(root, references):
    """Walk the document sentence by sentence and attach the cited references.

    Every ``<p>`` below every ``<body>`` of ``root`` is serialized, split into
    sentences with ``split_sentences_with_tags`` / ``split_sentences``, and
    each sentence is reduced to whitespace-normalized plain text. For
    sentences containing a ``<xref ref-type="bibr">`` tag, the ``rid`` values
    are resolved to ``(title, year)`` pairs.

    Args:
        root: Root element of the parsed XML document (lxml).
        references: Mapping of reference id -> ``(title, year)``. It is used
            as the primary lookup; ids missing from it are searched in
            ``root`` as a fallback. ``None`` or an empty mapping is accepted.

    Returns:
        A list of ``(sentence_text, titles, years)`` tuples. ``titles`` and
        ``years`` are newline-joined strings with one entry per distinct
        cited reference; both are ``""`` for sentences without citation tags,
        and ``"No title found"`` / ``"No year found"`` when a citing sentence
        has no resolvable reference.
    """
    known_references = references or {}
    html_parser = etree.HTMLParser()
    context_with_refs = []

    for body in root.xpath('.//body'):
        for p in body.xpath('.//p'):
            paragraph_html = etree.tostring(p, method='html', encoding='unicode')
            sentences = split_sentences(split_sentences_with_tags(paragraph_html))

            for sentence_html in sentences:
                # A sentence is only a fragment of the paragraph markup, so
                # its tags may be unbalanced. Wrapping it in <div> gives the
                # lenient HTML parser a non-empty root to recover into.
                fragment = etree.fromstring(f'<div>{sentence_html}</div>', parser=html_parser)
                sentence_text = normalize_whitespace(fragment.xpath('string()')).strip()

                if not has_reference(sentence_html):
                    context_with_refs.append((sentence_text, "", ""))
                    continue

                cited = []  # distinct (title, year) pairs, in citation order
                for xref in fragment.xpath('.//xref[@ref-type="bibr"]'):
                    # One xref may cite several references: rid="ref1 ref2".
                    for ref_id in (xref.get('rid') or '').split():
                        entry = known_references.get(ref_id)
                        if entry is None:
                            # Passing the id as an XPath variable avoids
                            # quoting problems with unusual id values.
                            matches = root.xpath('.//ref[@id=$ref_id]', ref_id=ref_id)
                            if not matches:
                                # Dangling citation: skip it, do not crash.
                                continue
                            entry = format_reference(matches[0])
                        title, year = entry
                        if (title, year) not in cited:
                            cited.append((title, year))

                if cited:
                    titles_str = '\n'.join(title for title, _ in cited)
                    years_str = '\n'.join(year for _, year in cited)
                else:
                    titles_str = "No title found"
                    years_str = "No year found"

                context_with_refs.append((sentence_text, titles_str, years_str))

    return context_with_refs

def has_reference(sentence_xml):
    """Tell whether the markup contains a bibliographic citation tag.

    Looks for the literal opening ``<xref ref-type="bibr"`` used by the
    NLM JATS standard anywhere in ``sentence_xml``.

    Returns:
        ``True`` when the tag opening is present, ``False`` otherwise.
    """
    return re.search(r'<xref ref-type="bibr"', sentence_xml) is not None


# Directory that holds the XML files
xml_directory = 'XMLs'


# Running count of the XML files processed so far
file_count = 0


# Process every '.cermxml' file found in the directory
for xml_file in os.listdir(xml_directory):
    if not xml_file.endswith('.cermxml'):
        continue

    # Load and parse the XML document
    file_path = os.path.join(xml_directory, xml_file)
    tree = etree.parse(file_path)
    root = tree.getroot()

    # Map each reference id to its (title, year) pair
    references = dict(
        (ref.get('id'), format_reference(ref))
        for ref in root.xpath('.//ref-list/ref')
    )

    # One (sentence, titles, years) row per sentence of the document
    sentences_with_references = extract_sentences_with_references_correctly(root, references)

    # The CSV is written next to its XML: <xml name without extension>_output.csv
    csv_file_name = f"{os.path.splitext(xml_file)[0]}_output.csv"
    csv_file_path = os.path.join(xml_directory, csv_file_name)

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Context", "Title", "Year"])
        writer.writerows(sentences_with_references)

    # Report progress after each file
    file_count += 1
    print(f"Processed XML file {file_count}: {xml_file}")

print("Extraction and CSV creation completed successfully for all XML files.")

Processed XML file 1: 2406.04353v1.cermxml
Processed XML file 2: 2209.04206v1.cermxml
Processed XML file 3: 2009.09310v1.cermxml
Processed XML file 4: 2402.03122v3.cermxml
Processed XML file 5: 2106.11168v1.cermxml
Processed XML file 6: 2111.13436v2.cermxml
Processed XML file 7: 2407.08406v1.cermxml
Processed XML file 8: 1909.10018v1.cermxml
Processed XML file 9: 2406.09966v1.cermxml
Processed XML file 10: 2009.12883v1.cermxml
Processed XML file 11: 2402.00066v1.cermxml
Processed XML file 12: 2111.06116v1.cermxml
Processed XML file 13: 2201.10636v2.cermxml
Processed XML file 14: 2302.03778v1.cermxml
Processed XML file 15: 2301.07676v1.cermxml
Processed XML file 16: 1909.07438v1.cermxml
Processed XML file 17: 1911.03240v1.cermxml
Processed XML file 18: 2110.10053v1.cermxml
Processed XML file 19: 2004.07354v1.cermxml
Processed XML file 20: 2111.05605v1.cermxml
Processed XML file 21: 2406.04354v1.cermxml
Processed XML file 22: 0911.1759v1.cermxml
Processed XML file 23: 2002.08811v2.cermxm