<div style="display: flex; justify-content: flex-end; align-items: center;">
    <div style="width: 30%; text-align: right; margin-right: 20px;">
        <img src="https://www.juntadeandalucia.es/datosabiertos/portal/uploads/group/2022-09-06-135504.979247fps.png" alt="Web Fundación Progreso y Salud" style="width: 100%;"/>
    </div>
    <div style="width: 60%; margin-right: 1em;">
        <p style="text-align: right; font-weight: bold; font-size: 2em; margin-top: 30px;">
            Extract MeSH Tags from XML file
        </p>
        <p style="text-align: right; color: #666">
            Project: MedicoderMESH
        </p>
        <p style="text-align: right; color: #666">
            Juan Luis González Rodríguez
        </p>
    </div>
</div>

# 0.Libraries

In [1]:
from xml.etree import ElementTree
import os
import json
import shutil
import sys
from tqdm import tqdm

# 1.Functions

In [2]:
def extractdate_xml(xml_date_elelemt):
    """
    Build a date string from an XML date element.

    Args:
    xml_date_elelemt (ElementTree.Element): element whose ``Year``, ``Month``
        and ``Day`` children hold the date parts

    Returns:
    str: the three child texts joined as day/month/year (dd/mm/yyyy when the
        XML values are zero-padded; no padding is applied here)

    Raises:
    AttributeError: if any of the three child elements is missing
    """
    # Look the parts up in Year -> Month -> Day order, then emit them reversed.
    year, month, day = (
        xml_date_elelemt.find(part).text for part in ("Year", "Month", "Day")
    )

    return f"{day}/{month}/{year}"

In [3]:
def parse_mesh_xml(mesh_element_xml):
    """
    Parse xml element containing mesh data.

    Only the direct children of ``mesh_element_xml`` are inspected. A key is
    added to the result only when the corresponding child tag is present, so
    the returned dictionary may be missing keys for absent tags.

    Args:
    mesh_element_xml (ElementTree.Element): xml element containing mesh data

    Returns:
    dict: dictionary containing mesh data. Possible keys:
        DescriptorUI (str), DescriptorName (str), TreeNumberList (list),
        Creation_date / Revision_date / Established_date (str, "" when the
        date element is incomplete), Annotation (str), ScopeNote (str),
        Terms (list), AllowableQualifiers (list)
    """
    # XML date tags and the dictionary key each one is stored under.
    date_keys = {
        "DateCreated": "Creation_date",
        "DateRevised": "Revision_date",
        "DateEstablished": "Established_date",
    }

    mesh_data = {}
    for element in mesh_element_xml:
        tag = element.tag

        if tag == "DescriptorUI":
            # An empty element has text None; store "" so the value is always a str.
            mesh_data["DescriptorUI"] = element.text or ""

        elif tag == "DescriptorName":
            # findtext returns the default when <String> is missing and "" when it is empty.
            mesh_data["DescriptorName"] = element.findtext("String", default="")

        elif tag == "TreeNumberList":
            mesh_data["TreeNumberList"] = [tree_number.text for tree_number in element]

        elif tag in date_keys:
            try:
                mesh_data[date_keys[tag]] = extractdate_xml(element)
            except AttributeError:
                # Year, Month or Day child missing
                mesh_data[date_keys[tag]] = ""

        elif tag == "Annotation":
            mesh_data["Annotation"] = (element.text or "").strip()

        elif tag == "ConceptList":
            # Only the first Concept in the list is used.
            concept = element.find("Concept")
            if concept is None:
                mesh_data["ScopeNote"] = ""
                mesh_data["Terms"] = []
            else:
                # ScopeNote and Terms are resolved independently: a concept
                # without a ScopeNote must still keep its terms.
                mesh_data["ScopeNote"] = concept.findtext("ScopeNote", default="").strip()
                term_list = concept.find("TermList")
                term_strings = (
                    term.find("String")
                    for term in (term_list if term_list is not None else ())
                )
                mesh_data["Terms"] = [
                    string.text for string in term_strings if string is not None
                ]

        elif tag == "AllowableQualifiersList":
            # Qualifiers lacking a name are skipped instead of emptying the whole list.
            qualifier_names = (
                qualifier.find("QualifierReferredTo/QualifierName/String")
                for qualifier in element.findall("AllowableQualifier")
            )
            mesh_data["AllowableQualifiers"] = [
                name.text for name in qualifier_names if name is not None
            ]

    return mesh_data


In [4]:
def check_save_path(save_path):
    """
    Check if the output directory already exists and prompt the user to either delete it or stop the execution.

    If ``save_path`` exists, the user is asked on stdin whether it may be
    deleted. Answering "yes" (case-insensitive, surrounding whitespace
    ignored) removes the directory tree; any other answer stops execution.
    On return, ``save_path`` exists as a directory.

    Args:
    save_path (str): path to the output directory

    Returns:
    None

    Raises:
    SystemExit: if the directory exists and the user does not answer "yes"
    """
    # Check if the output directory already exists
    if os.path.exists(save_path):
        # Prompt the user to either delete the existing directory or stop the execution
        response = input(f"The directory {save_path} already exists. Do you want to delete it and continue with the execution? (yes/no): ")
        # Strip before comparing so stray whitespace around "yes" does not abort the run.
        if response.strip().lower() == 'yes':
            shutil.rmtree(save_path)
        else:
            sys.exit("Execution stopped by the user.")

    # Create the output directory; exist_ok avoids a separate existence check.
    os.makedirs(save_path, exist_ok=True)

    return None


In [5]:
def ensure_keys(mesh_data):
    """
    Fill in every expected field of a mesh data dictionary.

    The dictionary is modified in place: any field that is absent is added
    with an empty default; fields already present are left untouched.

    Args:
    mesh_data (dict): Mesh data dictionary

    Returns:
    dict: the same dictionary object, now containing all expected keys
    """
    # (field, default) pairs; built on every call so each dictionary gets
    # its own fresh list objects.
    field_defaults = (
        ("DescriptorUI", ""),
        ("DescriptorName", ""),
        ("TreeNumberList", []),
        ("Creation_date", ""),
        ("Revision_date", ""),
        ("Established_date", ""),
        ("Annotation", ""),
        ("ScopeNote", ""),
        ("Terms", []),
        ("AllowableQualifiers", []),
    )
    for field, fallback in field_defaults:
        mesh_data.setdefault(field, fallback)

    return mesh_data


# 2.Load Data

Let's load the xml file and parse it using ElementTree.

In [6]:
# Path to the MeSH descriptor file, relative to the notebook's working directory
xml_path = os.path.join("..", "data", "desc2024.xml")

# Read the whole document into an in-memory element tree
tree = ElementTree.parse(xml_path)

The structure of the MeSH XML file is described in the <a href="https://www.nlm.nih.gov/mesh/xmlmesh.html">MeSH XML Structure</a> documentation, which is helpful for understanding the file layout.

In [7]:
# len() of the root element counts its direct children, reported below as MeSH terms
meshs = tree.getroot()

print(
    f"There are {len(meshs)} MeSH terms in the file."
)

There are 30764 MeSH terms in the file.


# 3.Parse the mesh and save it as a json file

In [8]:
save_path = os.path.join("..", "data", "meshs_json")
prefixes_to_check = ["C", "F03"]

# Check savepath content and existence
check_save_path(save_path)

for mesh in tqdm(meshs, desc="Processing MeSH terms", total=len(meshs), unit="MeSH term"):
    # Bind the ensure_keys result explicitly rather than relying on its
    # in-place mutation; it guarantees every key read below is present.
    mesh_data = ensure_keys(parse_mesh_xml(mesh))
    # Keep the record if any tree number starts with one of the prefixes
    # (str.startswith accepts a tuple of alternatives).
    if any(tree_number.startswith(tuple(prefixes_to_check)) for tree_number in mesh_data["TreeNumberList"]):
        # One JSON file per descriptor, named after its DescriptorUI
        with open(os.path.join(save_path, f"{mesh_data['DescriptorUI']}.json"), "w", encoding="utf-8") as file:
            json.dump(mesh_data, file, indent=4)

Processing MeSH terms: 100%|██████████| 30764/30764 [00:02<00:00, 12439.59MeSH term/s]


In [9]:
# Report how many entries the output directory holds and where it is
print(
    f"There are {len(os.listdir(save_path))} MeSH terms with TreeNumber starting with {', '.join(prefixes_to_check)}"
    f"\nThe data has been saved in {save_path}."
)

There are 5155 MeSH terms with TreeNumber starting with C, F03
The data has been saved in {save_path}.
