In [9]:
import os
import xml.etree.ElementTree as ET
import json
from tqdm import tqdm

# Path to the folder containing your XML files.
# The default is the original author's local data folder; set the PMC_XML_DIR
# environment variable to point the notebook at a different folder without
# editing this cell.
FOLDER_PATH = os.environ.get(
    'PMC_XML_DIR',
    '/Users/jacobhessels/KU/bachelor/src/comm/PMC011_600/refids/data',
)

# Function to extract article pmid
def get_article_pmid(root):
    """Return the citing article's PMID.

    Looks up the first ``<article-id pub-id-type="pmid">`` descendant of
    ``root`` and returns its text. Returns ``None`` when no such element
    exists (an existing but empty element also yields ``None``, because its
    ``text`` is ``None``).
    """
    node = root.find('.//article-id[@pub-id-type="pmid"]')
    if node is None:
        return None
    return node.text

# Function to mask resolvable <xref ref-type="bibr"> citations with "OTHERCIT"
# and create context ids
def replace_references(root, pmid, reference_count, context_data):
    """Mask bibliography citations and return the flattened text of the tree.

    An ``<xref ref-type="bibr">`` with non-empty text is masked when its
    ``rid`` resolves to a ``<ref id=...>`` that contains a
    ``<pub-id pub-id-type="pmid">`` with text. Masking overwrites the xref's
    text with ``"OTHERCIT"`` (the tree is modified in place) and appends one
    record to ``context_data``.

    Citations that cannot be resolved to a PMID produce no record and their
    own marker text is left out of the output, but the text that follows them
    (their ``tail``) is kept so the surrounding sentence stays intact.

    Args:
        root: Root element of the parsed article.
        pmid: PMID of the citing article; used as the context-id prefix.
        reference_count: Dict mapping ``rid`` to the number of masked
            citations seen so far for that ``rid``. Updated in place.
        context_data: List that receives one
            ``{"context_id": "<pmid>_<cited pmid>_<n>", "rid": <cited pmid>}``
            dict per masked citation. Note that the ``"rid"`` value is the
            cited article's PMID, not the XML ``rid`` attribute. Records are
            appended in the same order as the ``"OTHERCIT"`` markers appear
            in the returned text.

    Returns:
        The text of every element in the tree concatenated in document order
        (text, then children, then tail).
    """
    # Index the <ref> elements once instead of re-scanning the whole tree for
    # every citation. setdefault keeps the first element per id, which is the
    # one a './/ref[@id=...]' search would have returned.
    refs_by_id = {}
    for ref in root.iter('ref'):
        ref_id = ref.get('id')
        if ref_id is not None and ref is not root:
            refs_by_id.setdefault(ref_id, ref)

    text_parts = []

    def enter(elem):
        """Mask ``elem`` if it is a resolvable citation, then emit its text."""
        emit_text = True
        if elem.tag == 'xref' and elem.get('ref-type') == 'bibr' and elem.text:
            rid = elem.get('rid', 'unknown')
            ref_element = refs_by_id.get(rid)
            ref_pmid_element = (
                ref_element.find('.//pub-id[@pub-id-type="pmid"]')
                if ref_element is not None else None
            )
            ref_pmid = ref_pmid_element.text if ref_pmid_element is not None else None
            if ref_pmid is None:
                # No PMID for this reference: drop only the citation marker
                # itself. Children and tail are still emitted by the caller.
                emit_text = False
            else:
                occurrence = reference_count.setdefault(rid, 0)
                context_id = f"{pmid}_{ref_pmid}_{occurrence}"
                reference_count[rid] = occurrence + 1

                context_data.append({"context_id": context_id, "rid": ref_pmid})
                elem.text = "OTHERCIT"

        if emit_text and elem.text:
            text_parts.append(elem.text)

    # Explicit-stack depth-first walk. An element's tail is emitted only after
    # all of its children, which is where it sits in the source document.
    enter(root)
    stack = [(root, iter(root))]
    while stack:
        elem, children = stack[-1]
        child = next(children, None)
        if child is not None:  # identity test: childless Elements are falsy
            enter(child)
            stack.append((child, iter(child)))
        else:
            stack.pop()
            if elem.tail:
                text_parts.append(elem.tail)
    return ''.join(text_parts)

# Function to extract context around each occurrence of "OTHERCIT"
def extract_all_contexts(full_text, context_length=201):
    """Return one context window per "OTHERCIT" marker found in ``full_text``.

    Each window spans up to ``context_length`` characters on either side of a
    marker (clipped at the start and end of the text). Inside its own window
    the marker is rewritten to "TARGETCIT"; any other "OTHERCIT" markers that
    fall inside the window are left untouched. Windows are returned in the
    order the markers occur in the text.
    """
    marker = "OTHERCIT"
    marker_len = len(marker)
    windows = []

    position = full_text.find(marker, 0)
    while position != -1:
        marker_end = position + marker_len
        window_start = max(0, position - context_length)
        window = full_text[window_start:marker_end + context_length]
        # Offset of the marker relative to the start of the window.
        offset = position - window_start
        windows.append(window[:offset] + "TARGETCIT" + window[offset + marker_len:])
        # Resume the search right after the marker that was just handled.
        position = full_text.find(marker, marker_end)
    return windows

# Process all XML files in the folder
all_context_data = []

# os.listdir() returns entries in arbitrary order; sort the file names so the
# order of the contexts in the output file is the same on every run.
xml_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(('.xml', '.nxml')))

# Iterating over tqdm(...) advances the progress bar on every iteration,
# including the ones that bail out early with `continue`.
for file_name in tqdm(xml_files, desc="Processing XML files"):
    file_path = os.path.join(FOLDER_PATH, file_name)

    # Load and parse the XML file. A single malformed file should not abort
    # the whole batch, so report it and move on.
    try:
        tree = ET.parse(file_path)
    except ET.ParseError as exc:
        tqdm.write(f"Skipping {file_name}: XML parse error ({exc})")
        continue
    root = tree.getroot()

    # Extract article pmid; articles without one cannot be given context ids
    pmid = get_article_pmid(root)
    if pmid is None:
        continue

    # Per-article bookkeeping filled in by replace_references()
    reference_count = {}
    context_data = []

    # Replace all references in the XML and get the modified full text
    full_text_with_replacements = replace_references(root, pmid, reference_count, context_data)

    # Extract all contexts for "OTHERCIT"
    contexts = extract_all_contexts(full_text_with_replacements)

    # Contexts and records are paired purely by position. zip() would silently
    # truncate on a mismatch (e.g. if the article text itself contains the
    # literal marker), so make that situation visible.
    if len(contexts) != len(context_data):
        tqdm.write(
            f"Warning: {file_name}: {len(contexts)} markers found in text but "
            f"{len(context_data)} citations recorded; contexts may be misaligned"
        )

    # Combine context data with extracted contexts
    for context, context_info in zip(contexts, context_data):
        context_info["masked_text"] = context
        context_info["citing_id"] = pmid
        all_context_data.append(context_info)

# Prepare the data to be written to the JSON file, keyed by context id
data = {
    context_info["context_id"]: {
        "masked_text": context_info["masked_text"],
        "context_id": context_info["context_id"],
        "citing_id": context_info["citing_id"],
        "refid": context_info["rid"]
    }
    for context_info in all_context_data
}

# Write the modified contexts to contexts.json (relative to the current
# working directory)
with open("contexts.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

print("Contexts have been extracted and saved to contexts.json.")


Processing XML files: 100%|██████████| 3524/3524 [00:59<00:00, 58.90it/s]


Contexts have been extracted and saved to contexts.json.


## Sort contexts.json according to sorted_articles.csv

In [3]:
import json
import pandas as pd

# Paths to the files
json_file_path = '/Users/jacobhessels/KU/bachelor/src/contexts.json'
csv_file_path = '/Users/jacobhessels/KU/bachelor/src/sorted_articles.csv'
# NOTE: this is the same file as json_file_path, so the input is overwritten
# with its sorted version.
sorted_json_file_path = '/Users/jacobhessels/KU/bachelor/src/contexts.json'

# Read the JSON data from the input file
with open(json_file_path, 'r') as file:
    data = json.load(file)

# Read the CSV file to get the sorted order of PMIDs
sorted_df = pd.read_csv(csv_file_path)

# Extract the PMIDs column as a list of strings so they compare equal to the
# 'citing_id' values in the JSON.
# NOTE(review): assumes the PMID column has no missing values; with NaNs
# pandas would read it as float and astype(str) would yield e.g. "123.0".
sorted_pmids = sorted_df['PMID'].astype(str).tolist()

# Group the context keys by citing article in a single pass, preserving their
# original relative order, instead of re-scanning every context per PMID.
keys_by_citing_id = {}
for key, value in data.items():
    keys_by_citing_id.setdefault(value['citing_id'], []).append(key)

# Populate the sorted_data dictionary based on the sorted PMIDs order
sorted_data = {}
for pmid in sorted_pmids:
    for key in keys_by_citing_id.get(pmid, ()):
        sorted_data[key] = data[key]

# Contexts whose citing PMID is not listed in the CSV are not carried over.
# Because the input file is overwritten, say so explicitly.
dropped_count = len(data) - len(sorted_data)
if dropped_count:
    print(f"Note: {dropped_count} contexts had a citing PMID not listed in the CSV and were left out.")

# Write the sorted JSON data to the output file
with open(sorted_json_file_path, 'w') as file:
    json.dump(sorted_data, file, indent=4)

# Report the path that was actually written.
print(f"Contexts have been sorted according to sorted_articles.csv and saved to {sorted_json_file_path}.")


Contexts have been sorted according to sorted_articles.csv and saved to contexts_sorted.json.
