In [1]:
from pathlib import Path
from lxml import etree
import pandas as pd

In [2]:
# Folder that is searched recursively for the XML documents.
directory = '../data/dataset'
# Column labels, in the same order as the values of each collected correction row.
headers = ['language', 'age', 'type', 'i', 'c', 'prompt_key', 'overall_score', 'question_number', 'question_score', 'file_name']
pathlist = Path(directory).glob('**/*.xml')
# glob() yields Path objects in filesystem-dependent order; convert them to
# strings and sort so the processing order (and therefore the row order of the
# exported table) is the same on every run and every machine.
file_list = sorted(str(path) for path in pathlist)

In [3]:
def process_ns(data, corrections, q_metadata, a_metadata):
    """Recursively flatten an annotated element into corrected and original text.

    The element tree below ``data`` is walked depth-first. The text of ``<c>``
    children contributes only to the corrected string, the text of ``<i>``
    children only to the incorrect string, and everything else (the element's
    own text, child tails, any other child) contributes to both. Every element
    whose tag is ``NS`` additionally appends one row to ``corrections``; nested
    ``NS`` elements are appended before the ``NS`` element that contains them.

    Args:
        data: Element to process. Only ``tag``, ``text``, ``tail``, ``get`` and
            iteration over children are used, so both lxml and
            ``xml.etree.ElementTree`` elements are accepted.
        corrections: List that is mutated in place. Each ``NS`` element adds a
            list ``[language, age, type, i, c, prompt_key, overall_score,
            question_number, question_score, file_name]`` where ``type`` is
            the element's ``type`` attribute and ``i`` / ``c`` are the text of
            its last ``<i>`` / ``<c>`` child ('' when absent).
        q_metadata: Mapping with the keys ``language``, ``age``,
            ``prompt_key``, ``overall_score`` and ``file_name``.
        a_metadata: Mapping with the keys ``question_number`` and
            ``question_score``.

    Returns:
        A ``(correct, wrong)`` tuple of strings with runs of whitespace
        collapsed to single spaces and no leading/trailing whitespace.
    """
    correct_parts, wrong_parts = [], []
    if data.text is not None:
        correct_parts.append(data.text)
        wrong_parts.append(data.text)
    tup_c, tup_i = '', ''

    # Every child of `data` has `data` as its parent, so the parent's tag can
    # be read once here instead of calling the lxml-only getparent() per child.
    parent_tag = data.tag

    # Iterating the element directly replaces the deprecated getchildren().
    for child in data:
        # Recurse exactly once per child; nested NS rows are recorded here.
        child_correct, child_wrong = process_ns(child, corrections, q_metadata, a_metadata)

        # NOTE(review): lower-case 'ns' is compared here while rows are emitted
        # for upper-case 'NS' below -- confirm whether a lower-case tag really
        # occurs in the data; the comparison is kept unchanged.
        if child.tag == 'i' or (child.tag == 'ns' and parent_tag == 'i'):
            # Incorrect variant: only the "wrong" reading flows upwards.
            wrong_parts.append(child_wrong)
            tup_i = child_wrong
        elif child.tag == 'c' or (child.tag == 'ns' and parent_tag == 'c'):
            # Corrected variant: only the "correct" reading flows upwards.
            correct_parts.append(child_correct)
            tup_c = child_correct
        else:
            correct_parts.append(child_correct)
            wrong_parts.append(child_wrong)

        # Text following the child belongs to both readings.
        if child.tail is not None:
            correct_parts.append(child.tail)
            wrong_parts.append(child.tail)

    if data.tag == 'NS':
        corrections.append([
            q_metadata['language'],
            q_metadata['age'],
            data.get('type'),
            tup_i,
            tup_c,
            q_metadata['prompt_key'],
            q_metadata['overall_score'],
            a_metadata['question_number'],
            a_metadata['question_score'],
            q_metadata['file_name'],
        ])

    # Join the pieces with spaces, then collapse all whitespace runs.
    correct = ' '.join(' '.join(correct_parts).split())
    wrong = ' '.join(' '.join(wrong_parts).split())
    return correct, wrong

In [4]:
def check_and_extract(doc, key):
    """Return the text of the element found by ``doc.find(key)``, or ''.

    The empty string is returned whenever no text is available: when ``doc``
    itself is ``None``, when ``doc.find(key)`` finds nothing, or when the
    element that was found has no text.

    Args:
        doc: Element to search, or ``None``.
        key: Tag name / path passed to ``doc.find``.

    Returns:
        The element's text as a string, '' when unavailable.
    """
    # A missing parent element is treated like a missing child.
    if doc is None:
        return ''
    element = doc.find(key)
    # Empty elements report text=None; normalise that to '' as well so the
    # function always returns a string.
    if element is None or element.text is None:
        return ''
    return element.text

def parse_file(file_name, corrections):
    """Parse one XML document and append its NS corrections to ``corrections``.

    The first child of the root is used as ``head`` and the first child of
    ``head`` as ``candidate``. Document-level metadata is read from them, then
    every child of ``head``'s ``text`` element is treated as one answer whose
    ``coded_answer`` children are passed to ``process_ns``.

    Args:
        file_name: Path of the XML file. The name of its parent folder is
            recorded as ``prompt_key`` and the path itself as ``file_name``.
        corrections: List that ``process_ns`` extends in place.

    Returns:
        None. Results are delivered through ``corrections``.
    """
    # Hand the raw bytes to the parser so it can apply the document's own
    # encoding, instead of decoding with the platform default and re-encoding.
    with open(file_name, 'rb') as fobj:
        root = etree.XML(fobj.read())

    head = root[0]
    candidate = head[0]
    personnel = candidate.find('personnel')
    text = head.find('text')

    metadata = {
        'prompt_key': Path(file_name).parent.name,
        'sort_key': head.get('sortkey'),
        'overall_score': check_and_extract(candidate, 'score'),
        # A document without <personnel> yields empty values instead of failing.
        'language': check_and_extract(personnel, 'language') if personnel is not None else '',
        'age': check_and_extract(personnel, 'age') if personnel is not None else '',
        'file_name': file_name,
    }

    # Nothing to collect when the document has no <text> element.
    if text is None:
        return

    # Iterating the element directly replaces the deprecated getchildren().
    for item in text:
        coded_answer = item.find('coded_answer')
        answer = {
            'tag': item.tag,
            'question_number': check_and_extract(item, 'question_number'),
            'question_score': check_and_extract(item, 'exam_score'),
            'coded_answer': coded_answer,
        }
        # Skip answers that carry no <coded_answer>; there is nothing to walk.
        if coded_answer is None:
            continue
        for p in coded_answer:
            process_ns(p, corrections, metadata, answer)

In [5]:
# Shared accumulator: parse_file() appends the rows found in each document.
corrections = []
for xml_path in file_list:
    parse_file(xml_path, corrections)

In [6]:
# Tabulate the collected rows, labelling the columns with `headers`.
df = pd.DataFrame(data=corrections, columns=headers)

In [7]:
# Number of rows in the table.
df.shape[0]

55895

In [83]:
# Write the table to disk without the DataFrame index column.
# NOTE(review): this cell's execution count (83) is far ahead of the previous
# cell's (7) -- confirm the notebook still reproduces with Restart & Run All.
df.to_csv('../data/parsed.csv', index=False)