In [1]:
import json
import os

from bs4 import BeautifulSoup
from bs4.element import NavigableString



# Base directory of this notebook's input data (relative path).
basedir = 'chefdata/Sample2/Success_Mindset_English'

# The 'course' subdirectory of basedir.
coursedir = os.path.join(basedir, 'course')

In [2]:
def parse_xml_file(coursedir, kind, name, ext='xml'):
    """
    Parse the XML file at {coursedir}/{kind}/{name}.{ext}
    and return the json tree representation.
    References are not resolved --- see `parse_xml_file_refusive` for that.

    Args:
        coursedir: path of the directory that contains the XML files.
        kind: subdirectory holding the file; a falsy value (e.g. '') means
            the file sits directly inside `coursedir`.
        name: file name without its extension.
        ext: file extension without the leading dot (default 'xml').

    Returns:
        A dict with the keys 'kind' (tag name of the root element) and
        'children' (list of unresolved reference dicts), merged with all
        attributes of the root element. Each child reference has the form
        {'kind': ..., 'slug': ...} for `wiki` elements,
        {'kind': 'html', 'url_name': ..., 'ext': 'html'} for `html` elements,
        and {'kind': ..., 'url_name': ...} for everything else.

    Raises:
        ValueError: if the XML file does not exist.
        AssertionError: if the document does not have exactly one top-level
            node, or a child element does not carry exactly one attribute.
        KeyError: if a child element lacks the expected `slug`/`url_name`
            attribute.
    """

    # Build path to XML file
    path = os.path.join(coursedir, kind) if kind else coursedir
    path = os.path.join(path, name + '.' + ext)
    if not os.path.exists(path):
        raise ValueError('XML file not found: ' + path)

    # Load XML; the context manager closes the file handle even when
    # parsing raises.
    with open(path, 'r') as xml:
        doc = BeautifulSoup(xml, "xml")
    doc_children = list(doc.children)
    assert len(doc_children) == 1, 'Found more than one root element!'
    doc_root = doc_children[0]

    # JSON data object
    data = {
        'kind': doc_root.name,
        'children': [],
    }
    # Root attributes are merged last, so an attribute named 'kind' or
    # 'children' would replace the entry set above.
    data.update(doc_root.attrs)

    # Add children as unresolved references
    for child in doc_root.children:
        # Skip every string-like node: whitespace text between tags, but also
        # NavigableString subclasses (e.g. comments), which have no `.attrs`.
        if isinstance(child, NavigableString):
            continue
        assert len(child.attrs) == 1, 'Assumption failed: encountered more than one attr'
        # Use a separate local name so the `kind` parameter is not shadowed.
        child_kind = child.name
        child_ref = {
            'kind': child_kind,
        }
        if child_kind == 'wiki':
            child_ref['slug'] = child.attrs['slug']
        else:
            child_ref['url_name'] = child.attrs['url_name']
            if child_kind == 'html':
                # html references also record their extension
                child_ref['ext'] = 'html'
        data['children'].append(child_ref)

    return data

In [3]:
# An empty `kind` means no subdirectory: this reads {coursedir}/course.xml.
data = parse_xml_file(coursedir, '', 'course')
data

{'kind': 'course',
 'children': [],
 'url_name': 'course',
 'org': 'hp-life-e-learning',
 'course': '6638hpl-en30'}

In [4]:
# Reads {coursedir}/course/course.xml; children stay as unresolved references.
data = parse_xml_file(coursedir, 'course', 'course')
data

{'kind': 'course',
 'children': [{'kind': 'chapter',
   'url_name': '23c14c940ed44b7ab9a5c682d443aec7'},
  {'kind': 'chapter', 'url_name': '535f768b2de84b6b9bdf92a8bfbc9f29'},
  {'kind': 'chapter', 'url_name': '4ae59da79d394d6cafa136077809b5bf'},
  {'kind': 'chapter', 'url_name': '6285d0bd66cd4433b039828fa21b6f49'},
  {'kind': 'chapter', 'url_name': '64d3a001ad73451e87c9ca6e1956578e'},
  {'kind': 'chapter', 'url_name': 'fe2f4e5367dc4531bf0d78f886acdac2'},
  {'kind': 'wiki', 'slug': 'hp-life.2266hp-102.fall-2015'}],
 'display_name': 'Success mindset',
 'enrollment_start': '"2018-10-11T00:00:00+00:00"',
 'graceperiod': '',
 'language': 'en',
 'minimum_grade_credit': '0.8',
 'start': '"2018-10-11T00:00:00+00:00"'}

In [5]:
def parse_xml_file_refusive(coursedir, kind, name, ext='xml'):
    """
    Recursively parse the XML file at {coursedir}/{kind}/{name}.{ext}.

    The file itself is loaded with `parse_xml_file`; every child reference of
    the form {kind: AAA, url_name: BBB} is then replaced by the recursively
    parsed contents of {coursedir}/AAA/BBB.xml. References of kind `wiki`,
    `html` and `problem` are kept as they are and not followed.

    Returns a json tree representation (nested dicts).
    """
    # Reference kinds that are passed through without loading a file.
    passthrough_kinds = ('wiki', 'html', 'problem')

    tree = parse_xml_file(coursedir, kind, name, ext=ext)
    resolved = []
    for ref in tree['children']:
        if ref['kind'] in passthrough_kinds:
            resolved.append(ref)
            continue
        # Everything else points at another XML file: descend into it.
        resolved.append(
            parse_xml_file_refusive(coursedir, ref['kind'], ref['url_name'], ext='xml')
        )
    tree['children'] = resolved
    return tree


In [6]:
# Resolve the whole tree starting from {coursedir}/course/course.xml.
data = parse_xml_file_refusive(coursedir, 'course', 'course')
# data  (uncomment to display the full resolved tree)

In [7]:
from libedx import print_course


In [8]:
# Show the resolved course tree using the print_course helper from libedx.
print_course(data)

 - Success mindset kind= course 	 
    - Start Course kind= chapter 	 
       - Success mindset kind= sequential 	 
          - Unit kind= vertical 	 
             -  kind= html 	  url_name=5e7e7ad5290f46cbbba0642d913f13c9
    - Story kind= chapter 	 
       - Story Slides kind= sequential 	 
          - Unit kind= vertical 	 
             -  kind= problem 	  url_name=a7677ba614c6480c8dd357f4a793480a
    - Business Concept kind= chapter 	 
       - Activity kind= sequential 	 
          - Unit kind= vertical 	 
             -  kind= problem 	  url_name=22a03eac7559497f8a00507d80f19331
    - Technology Skill kind= chapter 	 
       - Activity kind= sequential 	 
          - Unit kind= vertical 	 
             -  kind= problem 	  url_name=96ef66f259e648f6a57032a25b49e5e3
       - Downloadable Resources kind= sequential 	 
          - Unit kind= vertical 	 
             -  kind= html 	  url_name=d51950e67be341ae98838f0d2ef9e867
    - Course Feedback kind= chapter 	 
       - Survey kind= 