In [53]:
#import dependencies
import os 
from xml.etree import ElementTree as ET
import json
from lxml import etree as ET
from collections import defaultdict
import pandas as pd
import re

In [54]:
# Prefix -> namespace URI mapping passed to the XPath lookups on TEI
# (an XML markup vocabulary) documents.
ns = dict(
    tei='http://www.tei-c.org/ns/1.0',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
    xlink='http://www.w3.org/1999/xlink',
)

In [55]:
# Directory configuration.

# Import settings: True selects the sample directories, False the full-run ones.
sample = True

# Set the input, output and error-log directories for the selected mode.
if sample:
    input_dir = 'sample/grobid_full_text'
    output_dir = '../data/sample/1_json'
    error_log_dir = 'sample/error'
else:
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    input_dir = '/home/deallab/jisulee/CCE/data/grobid_full_text/'
    output_dir = '../data/result/1_json'
    error_log_dir = '../data/result/error'

# Create the output and error-log directories if they do not exist yet.
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the directory between the check and makedirs().
for directory in (output_dir, error_log_dir):
    os.makedirs(directory, exist_ok=True)

In [56]:
# Collects the names of the files that ran into an error.
error_files_list = list()

In [57]:
# File tallies, all starting at zero: files seen, files processed, files counted as errors.
total_files = files_processed = error_files_count = 0

In [58]:
# Normalise bibliographic <ref> tags to the type="single" form.
def convert_ref_tags(text):
    """Rewrite every ``<ref type="bibr">`` tag in ``text`` as ``<ref type="single">``.

    Tags without a ``target`` attribute receive the placeholder target
    ``#n999``; tags that already have a ``#...`` target keep it.

    Returns the rewritten string; text without such tags is returned unchanged.
    """
    # Target-less citations first: give them the placeholder id.
    with_placeholder = re.sub(
        r'<ref type=\"bibr\">(.*?)<\/ref>',
        r'<ref type="single" target="#n999">\1</ref>',
        text,
    )
    # Then the citations that already carry a target.
    return re.sub(
        r'<ref type=\"bibr\" target=\"#(.*?)\">(.*?)<\/ref>',
        r'<ref type="single" target="#\1">\2</ref>',
        with_placeholder,
    )


def replace_with_gref(match):
    """Collapse the ``<ref type="single">`` tags in a regex match into one group tag.

    ``match`` is a regex match object; its full matched text is scanned for
    single refs whose target looks like ``#`` + one lowercase letter + digits.

    Returns a ``<ref type="group">`` tag whose ``target`` is the ';'-joined
    targets and whose body is the space-joined contents of those refs.
    """
    matched_text = match.group(0)
    targets = []
    labels = []
    for target, label in re.findall(r'<ref type="single" target="(#[a-z]\d+)">(.*?)</ref>', matched_text):
        targets.append(target)
        labels.append(label)
    ids = ';'.join(targets)
    content = ' '.join(labels)
    return f'<ref type="group" target="{ids}">{content}</ref>'
# convert paragraphs
def process_paragraphs(paragraphs):
    """Normalise the citation markup of each paragraph string.

    Each paragraph goes through these steps, in order:
      1. ``convert_ref_tags`` rewrites bibliographic refs to type="single".
      2. Runs of two or more single refs (optionally separated by one space)
         are merged into one group ref by ``replace_with_gref``.
      3. A ``<p ...>`` wrapper that carries attributes is replaced by its content.
      4. Any opening tag that does not start with ``<ref`` is removed together
         with the text after it and the next tag.

    Returns the processed strings as a new list, in input order.
    """
    # (pattern, replacement) pairs applied in sequence after the ref conversion.
    steps = (
        (r'(<ref type="single"[^\>]+>[^\<]+<\/ref> ?)+<ref type="single"[^\>]+>[^\<]+<\/ref>', replace_with_gref),
        (r'<p[^\>]+>(.*?)<\/p>', r'\1'),
        (r'<(?=(?!\/))(?!ref)[^\>]+>[^\<]+<[^\>]+>', ''),
    )

    def _clean(paragraph):
        cleaned = convert_ref_tags(paragraph)
        for pattern, replacement in steps:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned

    return [_clean(paragraph) for paragraph in paragraphs]

In [59]:
def clean_div(div):
    """Extract the paragraph bodies from a serialized ``<div>`` string.

    Non-bibliographic ``<ref>`` tags are unwrapped (their text is kept) and
    ``<formula>`` elements are replaced by their text and folded into the
    neighbouring paragraph.

    Returns a list with the inner content of every ``<p>...</p>`` in the
    result; anything outside those paragraphs is dropped.
    """
    substitutions = (
        # Unwrap <ref> tags whose type does not start with "bibr".
        (r'<ref type="(?!bibr)[^\>]+>([^\<]+)<\/ref>', r'\1'),
        # Formula between two paragraphs: keep its text and join the paragraphs.
        (r'<\/p><formula[^\>]+>([^\<]+)<\/formula><p>', r'\1'),
        # Remaining formulas: keep the text and move an adjacent </p> or <p>
        # to the other side, so the text ends up inside that paragraph.
        (r'(<\/p>)?<formula[^\>]+>([^\<]+)<\/formula>(<p>)?', r'\3\2\1'),
    )

    markup = div
    for pattern, replacement in substitutions:
        markup = re.sub(pattern, replacement, markup)

    # Only the bare <p>...</p> bodies survive.
    return re.findall(r'<p>(.*?)<\/p>', markup)

In [60]:
# Convert every '.xml' file in input_dir into one JSON document in output_dir.

# Reset the tallies here so that re-running this cell does not double count.
total_files = 0
files_processed = 0
error_files_count = 0
error_files_list = []

# Clark-notation prefix for attributes in the reserved xml: namespace (xml:id, xml:lang).
XML_NS_PREFIX = '{http://www.w3.org/XML/1998/namespace}'


def _find_text(parent, path):
    """Return the text of the first element under ``parent`` matching ``path``, or None."""
    element = parent.find(path, ns)
    return element.text if element is not None else None


def _person_names(parent, path):
    """Return one ``{'first_name', 'last_name'}`` dict per element matching ``path``."""
    return [
        {
            'first_name': _find_text(person, 'tei:forename'),
            'last_name': _find_text(person, 'tei:surname'),
        }
        for person in parent.findall(path, ns)
    ]


def _parse_reference(bibl_struct):
    """Build the reference dict (id, title, authors, pub_year, optional IDNO) for one biblStruct."""
    # Year of publication: first 4-digit match in the 'when' attribute.
    # A missing 'when' attribute yields None instead of raising inside re.search.
    pub_year = None
    date_element = bibl_struct.find('.//tei:date[@type="published"]', ns)
    if date_element is not None:
        # The class was '[0,9]', which also accepted a literal comma.
        year_match = re.search(r'[12][09]\d{2}', date_element.get('when') or '')
        if year_match:
            pub_year = int(year_match.group(0))

    reference = {
        'id': bibl_struct.get(XML_NS_PREFIX + 'id', "Unknown ID"),
        'title': _find_text(bibl_struct, ".//tei:title[@type='main']"),
        'authors': _person_names(bibl_struct, './/tei:author/tei:persName'),
        'pub_year': pub_year,
    }

    # Optional identifier: only stored when both its type and its text are non-empty.
    idno_element = bibl_struct.find('.//tei:idno', ns)
    if idno_element is not None:
        idno_type = idno_element.get('type')
        # An empty <idno/> has text None; guard before stripping.
        idno = (idno_element.text or '').strip()
        if idno_type and idno:
            reference['IDNO'] = {'type': idno_type, 'content': idno}

    return reference


for filename in os.listdir(input_dir):
    if not filename.endswith('.xml'):
        continue

    total_files += 1
    xml_file_path = os.path.join(input_dir, filename)

    try:
        # Parsing XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # Title and authors of the document itself (taken from the TEI header only).
        title = _find_text(root, ".//tei:teiHeader//tei:title[@type='main']")
        authors = _person_names(root, './/tei:teiHeader//tei:author/tei:persName')

        # Year of publication: the part of the filename before the first '.', if numeric.
        doc_pub_year_element = filename.split('.')[0]
        doc_pub_year = int(doc_pub_year_element) if doc_pub_year_element.isnumeric() else None

        # Abstract: text of the first <p> inside the header's <abstract>.
        abstract = _find_text(root, './/tei:teiHeader//tei:abstract//tei:p')

        # Language: xml:lang of the top-level <text> element (None if there is no <text>).
        text_element = root.find('tei:text', ns)
        text_lang = text_element.get(XML_NS_PREFIX + 'lang') if text_element is not None else None

        # Sections: one entry per top-level <div> of the body.
        sections = []
        for div in root.findall('.//tei:body/tei:div', ns):
            paragraphs = clean_div(ET.tostring(div, encoding='unicode', method='xml'))
            sections.append({
                'section_name': _find_text(div, './/tei:head'),
                'paragraphs': process_paragraphs(paragraphs),
            })

        # References: one entry per <biblStruct> directly under a <listBibl>.
        references = [
            _parse_reference(bibl_struct)
            for bibl_struct in root.findall('.//tei:listBibl/tei:biblStruct', ns)
        ]

        document_data = {
            'title': title,
            'authors': authors,
            'pub_year': doc_pub_year,
            'lang': text_lang,
            'abstract': abstract,
            # Drops the last 8 characters of the filename -- presumably a
            # '.tei.xml' suffix; TODO confirm against the actual input names.
            'IDNO': {'type': 'acl', 'content': 'aclanthology.org/' + filename[0:-8]},
            'sections': sections,
            'references' : references
        }

        json_output_path = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.json")
        with open(json_output_path, 'w', encoding='utf-8') as json_file:
            json.dump(document_data, json_file, indent=4, ensure_ascii=False)

        files_processed += 1

    except ET.ParseError:
        error_files_count += 1
        error_files_list.append(filename)
    except Exception as e:
        # Report unexpected failures and record them as well, so that
        # loaded == processed + errors and the file shows up in the error log.
        print(f"Unexpected error processing {filename}: {e}")
        error_files_count += 1
        error_files_list.append(filename)

In [61]:
# Make sure the error-log directory exists.
os.makedirs(error_log_dir, exist_ok=True)

# Build the path with os.path.join: plain concatenation dropped the separator
# and wrote e.g. 'sample/errorlog.txt' next to the directory instead of inside it.
error_log_path = os.path.join(error_log_dir, 'log.txt')

# One problematic file name per line.
with open(error_log_path, 'w', encoding='utf-8') as f:
    for file in error_files_list:
        f.write(f"{file}\n")

# Point the reader at the file that was actually written.
print(f"Finished processing. Check '{error_log_path}' for list of problematic files.")

Finished processing. Check 'error_files.txt' for list of problematic files.


In [62]:
# Summary of the run: one tally per line.
print(
    f"Total XML files loaded: {total_files}",
    f"Total XML files processed: {files_processed}",
    f"Total error files: {error_files_count}",
    sep="\n",
)

Total XML files loaded: 82
Total XML files processed: 80
Total error files: 2
