In [5]:
import json
import warnings
import xml.etree.ElementTree as ET

In [6]:
def _stripped_text(element):
    """Return ``element.text`` without surrounding whitespace.

    Returns ``None`` when ``element`` is ``None`` (nothing was found) or when
    the element has no text, so ``.strip()`` is never called on ``None``.
    """
    if element is None or element.text is None:
        return None
    return element.text.strip()


def _topic_title(topic):
    """Return the unmodified text of ``topic``'s direct ``<title>`` child, or ``None`` if it has none."""
    title = topic.find('./title')
    return None if title is None else title.text


def _add_glossentries(root, graph):
    """Add one ``"glossentry"`` node to ``graph`` for every ``<glossentry>`` below ``root``.

    Entries that lack an ``id`` attribute or a ``<glossterm>`` child cannot be
    turned into a node; they are skipped with a warning.
    """
    for entry in root.findall(".//glossentry"):
        key = entry.get('id')
        term = entry.find('glossterm')
        # Explicit ``is None`` checks: an Element without children is falsy.
        if key is None or term is None:
            warnings.warn("skipping <glossentry> without an id attribute or <glossterm> child")
            continue
        graph[key] = {
            # Text fragments of the term and its nested elements, joined by single spaces.
            "title": " ".join(term.itertext()),
            "parents": [],
            "type": "glossentry",
        }


def _link_glossrefs(cycle, cycle_id, graph):
    """Append ``cycle_id`` to the parents of every node referenced by a glossary ``<term>`` below ``cycle``.

    References whose ``keyref`` is missing or does not resolve to a node with
    a ``"parents"`` list are skipped with a warning.
    """
    for term in cycle.findall(".//term[@type='glossentry']"):
        keyref = term.get('keyref')
        target = graph.get(keyref)
        if target is None or "parents" not in target:
            warnings.warn(
                f"cycle {cycle_id!r}: ignoring glossary reference with unresolved keyref {keyref!r}"
            )
            continue
        # One entry per reference: a term used several times inside a cycle
        # lists that cycle several times.
        # NOTE(review): confirm whether consumers want these duplicates.
        target["parents"].append(cycle_id)


def extract_graph_description(ditamap, dita):
    """Build a flat graph description of a course from its DITA map and DITA content.

    Args:
        ditamap: Source of the ``.ditamap`` document, passed to ``ET.parse``.
        dita: Source of the ``.dita`` document, passed to ``ET.parse``.

    Returns:
        A dict mapping node ids to node dicts, in this insertion order:

        * the course, keyed by the stripped ``<bookid>/<booknumber>`` text, with
          ``"title"`` (``<mainbooktitle>``), ``"subtitle"`` (``<booktitlealt>``)
          and ``"type": "course"``; title and subtitle are ``None`` when the
          element is missing or empty;
        * one ``"glossentry"`` node per ``<glossentry>``, keyed by its ``id``;
        * one ``"section"`` node per direct child ``<topic>`` of the content
          root whose ``id`` starts with ``"UNIT"``, each followed by one
          ``"cycle"`` node per direct child ``<topic>`` whose ``id`` starts with
          ``"CYCLE"``.

        Sections list the course as parent, cycles list their section, and
        glossentries list every cycle that references them.

    Raises:
        KeyError: If the map has no ``<bookid>/<booknumber>`` text, because the
            course node cannot be keyed without it.
    """
    ditamap_root = ET.parse(ditamap)
    dita_root = ET.parse(dita)

    course_code = _stripped_text(ditamap_root.find('.//bookid/booknumber'))
    if course_code is None:
        raise KeyError("course_code: no <bookid>/<booknumber> text found in the ditamap")

    graph_base = {
        course_code: {
            "title": _stripped_text(ditamap_root.find('.//mainbooktitle')),
            "subtitle": _stripped_text(ditamap_root.find('.//booktitlealt')),
            "type": "course",
        }
    }

    # Glossary nodes must exist before the cycles that reference them are visited.
    _add_glossentries(dita_root, graph_base)

    for section in dita_root.findall("./topic"):
        # Topics without an id can be neither a unit nor a cycle.
        section_id = section.get('id', '')
        if not section_id.startswith("UNIT"):
            continue
        graph_base[section_id] = {
            "title": _topic_title(section),
            "parents": [course_code],
            "type": "section",
        }

        for cycle in section.findall("./topic"):
            cycle_id = cycle.get('id', '')
            if not cycle_id.startswith("CYCLE"):
                continue
            graph_base[cycle_id] = {
                "title": _topic_title(cycle),
                "parents": [section_id],
                "type": "cycle",
            }
            _link_glossrefs(cycle, cycle_id, graph_base)

    return graph_base

In [12]:
# Name of the JSON file the graph description is saved under.
filename = 'DLBSAESA01.json'

# Both input paths share one directory and base name; only the extension differs.
ditamap, dita = (
    'data/dita/' + 'book_DLBSAESA01_001-2022-0322_2022-03-22T08-45-07-654Z' + extension
    for extension in ('.ditamap', '.dita')
)

# Parse both documents and build the graph description dict.
graph_description = extract_graph_description(ditamap=ditamap, dita=dita)

In [13]:
# Save the graph description as UTF-8 JSON; non-ASCII characters are written unescaped.
with open(filename, mode='w', encoding='utf8') as f:
    f.write(json.dumps(graph_description, ensure_ascii=False))