In [19]:
from pathlib import Path
from lxml import etree
import pandas as pd

In [20]:
# Root folder that is searched recursively for annotated XML scripts.
directory = '../data/dataset'

# Column order of the per-correction rows that process_ns appends.
headers = [
    'language', 'age', 'type', 'i', 'c', 'prompt_key',
    'overall_score', 'question_number', 'question_score', 'file_name',
]

# glob yields Path objects; downstream code works with plain strings.
pathlist = Path(directory).glob('**/*.xml')
file_list = [str(xml_path) for xml_path in pathlist]

In [21]:
def process_ns(data, corrections, q_metadata, a_metadata):
    """Flatten an annotated element into its corrected and original wording.

    The element is walked recursively. Text coming from ``<i>`` children only
    contributes to the original ("wrong") wording, text coming from ``<c>``
    children only to the corrected wording, and everything else (the element's
    own text, child tails, any other child element) contributes to both.

    When ``data`` itself is an ``<NS>`` element, one row describing it is
    appended to ``corrections``.

    Args:
        data: Element to flatten. It must expose ``text``, ``tail``, ``tag``,
            ``get`` and iteration over its children.
        corrections: List mutated in place. Each ``<NS>`` element adds a row
            ``[language, age, type, i, c, prompt_key, overall_score,
            question_number, question_score, file_name]``.
        q_metadata: Mapping read for 'language', 'age', 'prompt_key',
            'overall_score' and 'file_name' (only when an ``<NS>`` is logged).
        a_metadata: Mapping read for 'question_number' and 'question_score'
            (only when an ``<NS>`` is logged).

    Returns:
        A ``(correct, wrong)`` tuple of strings with every run of whitespace
        collapsed to a single space and no leading/trailing whitespace.
    """
    correct_parts, wrong_parts = [], []
    if data.text is not None:
        correct_parts.append(data.text)
        wrong_parts.append(data.text)

    # Text of the last <c> / <i> child seen directly under this element;
    # only reported when this element is an <NS>.
    tup_c, tup_i = '', ''

    # Iterate the element directly instead of the deprecated getchildren().
    for child in data:
        # The recursive call is the same for every kind of child; only which
        # half of its result is kept differs.
        child_correct, child_wrong = process_ns(child, corrections, q_metadata, a_metadata)

        # Every child's parent is `data`, so the parent tag is data.tag.
        # NOTE(review): 'ns' is lowercase here while the element logged below
        # is matched as 'NS' -- confirm which casing the corpus really uses.
        if child.tag == 'i' or (child.tag == 'ns' and data.tag == 'i'):
            wrong_parts.append(child_wrong)
            tup_i = child_wrong
        elif child.tag == 'c' or (child.tag == 'ns' and data.tag == 'c'):
            correct_parts.append(child_correct)
            tup_c = child_correct
        else:
            correct_parts.append(child_correct)
            wrong_parts.append(child_wrong)

        # Text following the child belongs to both versions of the sentence.
        if child.tail is not None:
            correct_parts.append(child.tail)
            wrong_parts.append(child.tail)

    if data.tag == 'NS':
        corrections.append([
            q_metadata['language'],
            q_metadata['age'],
            data.get('type'),
            tup_i,
            tup_c,
            q_metadata['prompt_key'],
            q_metadata['overall_score'],
            a_metadata['question_number'],
            a_metadata['question_score'],
            q_metadata['file_name'],
        ])

    # Pieces are joined with a space, then split/join collapses all whitespace.
    correct = ' '.join(' '.join(correct_parts).split())
    wrong = ' '.join(' '.join(wrong_parts).split())
    return correct, wrong

In [22]:
def check_and_extract(doc, key):
    """Return the text of the first ``key`` child of ``doc``, or ``''``.

    An empty string is returned whenever there is nothing to extract: ``doc``
    itself is None (e.g. the parent lookup found nothing), no child matches
    ``key``, or the matching child has no text.

    Args:
        doc: Element to search (anything with a ``find`` method), or None.
        key: Tag name / path passed to ``doc.find``.

    Returns:
        The child's text as a string; never None.
    """
    if doc is None:
        return ''
    element = doc.find(key)
    if element is None or element.text is None:
        return ''
    return element.text

def parse_file(file_name, corrections):
    """Parse one annotated XML script and log every correction it contains.

    Script-level metadata is read from the first child of the root (``head``)
    and its first child (``candidate``). Each child of ``<text>`` is treated as
    one answer, and every element under its ``<coded_answer>`` is handed to
    ``process_ns``, which appends the correction rows.

    Args:
        file_name: Path of the XML file (string). The name of its parent
            folder is recorded as ``prompt_key``.
        corrections: List mutated in place by ``process_ns``.

    Returns:
        None.
    """
    # Feed the parser raw bytes so it honours the encoding declared in the XML
    # itself, rather than decoding with the platform default and re-encoding.
    with open(file_name, 'rb') as fobj:
        root = etree.XML(fobj.read())

    head = root[0]
    candidate = head[0]
    personnel = candidate.find('personnel')
    text = head.find('text')

    metadata = {
        'prompt_key': Path(file_name).parent.name,
        'sort_key': head.get('sortkey'),
        'overall_score': check_and_extract(candidate, 'score'),
        'language': check_and_extract(personnel, 'language'),
        'age': check_and_extract(personnel, 'age'),
        'file_name': file_name,
    }

    # Iterate the element directly instead of the deprecated getchildren().
    answers = [
        {
            'tag': item.tag,
            'question_number': check_and_extract(item, 'question_number'),
            'question_score': check_and_extract(item, 'exam_score'),
            'coded_answer': item.find('coded_answer'),
        }
        for item in text
    ]

    for answer in answers:
        coded_answer = answer['coded_answer']
        if coded_answer is None:
            # No <coded_answer> under this answer: nothing to scan.
            continue
        for paragraph in coded_answer:
            # Return value (the flattened text) is not needed here; only the
            # rows process_ns appends to `corrections` matter.
            process_ns(paragraph, corrections, metadata, answer)

In [5]:
# Shared accumulator: parse_file appends the correction rows of each XML file.
corrections = []
for xml_file in file_list:
    parse_file(xml_file, corrections)

In [6]:
# Build the per-correction table; each row list is labelled with `headers`.
df = pd.DataFrame(corrections, columns=headers)

In [7]:
# Number of correction rows collected across all parsed files.
len(df)

55895

In [83]:
# Persist the per-correction table; index=False keeps the row index out of the CSV.
df.to_csv('../data/parsed.csv', index = False)

In [31]:
# Column order of the per-answer rows appended by extract_answers.
# Note: the 'prompt_key' column receives the prompt text returned by get_prompt.
answer_headers = [
    'language', 'age', 'i', 'c', 'prompt_key',
    'overall_score', 'question_number', 'question_score', 'file_name',
]
def get_prompt(prompt_key, question_key):
    """Return the question text for an answer from ``../data/prompts/<prompt_key>.xml``.

    The first character of ``question_key`` selects the ``<q n="...">`` element
    and the optional second character selects a ``<qq nn="...">`` inside it.
    The result is the ``<q>`` text, followed (when a sub-question is given) by
    the matching ``<qq>`` text and the tail of the ``<q>`` element.

    Args:
        prompt_key: Base name (without ``.xml``) of the prompt file.
        question_key: Question identifier such as ``'1'`` or ``'5a'``. An
            empty or None key yields ``''`` without touching the file system.

    Returns:
        The stripped question text, or ``''`` when nothing matches.

    Raises:
        OSError: If the prompt file cannot be opened.
    """
    # A missing question number arrives as '' (check_and_extract's fallback);
    # indexing it would raise, and there is nothing to look up anyway.
    if not question_key:
        return ''

    # NOTE(review): only the first two characters are used, so a key such as
    # '10' would be read as question '1', sub-question '0' -- confirm that
    # question numbers never have two digits.
    q = question_key[0]
    qq = question_key[1] if len(question_key) > 1 else None

    file_name = '../data/prompts/' + prompt_key + '.xml'

    # Feed the parser raw bytes so it honours the encoding declared in the XML
    # itself, rather than decoding with the platform default and re-encoding.
    with open(file_name, 'rb') as fobj:
        root = etree.XML(fobj.read())
    exam = root[0]

    question = ''
    for qitem in exam.findall('q'):
        if qitem.get('n') != q:
            continue
        # text / tail are None when absent; treat that as empty text.
        question += qitem.text or ''
        if qq is not None:
            for qqitem in qitem.findall('qq'):
                if qqitem.get('nn') == qq:
                    question += ' ' + (qqitem.text or '')
            question += qitem.tail or ''

    return question.strip()
def extract_answers(file_name, dataset):
    """Parse one annotated XML script and append one row per answer to ``dataset``.

    For every child of ``<text>`` that has a ``<coded_answer>``, the elements
    under it are flattened with ``process_ns`` into the learner's original
    wording and the corrected wording, and a row
    ``[language, age, wrong, correct, prompt text, overall_score,
    question_number, question_score, file_name]`` is appended.

    Args:
        file_name: Path of the XML file (string). The name of its parent
            folder is used as the prompt key passed to ``get_prompt``.
        dataset: List mutated in place; receives the per-answer rows.

    Returns:
        None.
    """
    # Feed the parser raw bytes so it honours the encoding declared in the XML
    # itself, rather than decoding with the platform default and re-encoding.
    with open(file_name, 'rb') as fobj:
        root = etree.XML(fobj.read())

    head = root[0]
    candidate = head[0]
    personnel = candidate.find('personnel')
    text = head.find('text')

    metadata = {
        'prompt_key': Path(file_name).parent.name,
        'sort_key': head.get('sortkey'),
        'overall_score': check_and_extract(candidate, 'score'),
        'language': check_and_extract(personnel, 'language'),
        'age': check_and_extract(personnel, 'age'),
        'file_name': file_name,
    }

    # Iterate the element directly instead of the deprecated getchildren().
    answers = [
        {
            'tag': item.tag,
            'question_number': check_and_extract(item, 'question_number'),
            'question_score': check_and_extract(item, 'exam_score'),
            'coded_answer': item.find('coded_answer'),
        }
        for item in text
    ]

    # process_ns needs a list to log individual corrections into; those rows
    # are not used here and are discarded with this local list.
    corrections = []
    for answer in answers:
        coded_answer = answer['coded_answer']
        if coded_answer is None:
            # No <coded_answer> under this answer: no text to record.
            continue

        corrected_paragraphs, original_paragraphs = [], []
        for paragraph in coded_answer:
            c, i = process_ns(paragraph, corrections, metadata, answer)
            corrected_paragraphs.append(c)
            original_paragraphs.append(i)

        # Every paragraph is prefixed with one space (so the result starts
        # with a space), matching the text layout this function has always
        # written.
        correct = ''.join(' ' + piece for piece in corrected_paragraphs)
        wrong = ''.join(' ' + piece for piece in original_paragraphs)

        dataset.append([
            metadata['language'],
            metadata['age'],
            wrong,
            correct,
            get_prompt(metadata['prompt_key'], answer['question_number']),
            metadata['overall_score'],
            answer['question_number'],
            answer['question_score'],
            metadata['file_name'],
        ])
            

In [32]:
# Shared accumulator: extract_answers appends the per-answer rows of each XML file.
dataset = []
for xml_file in file_list:
    extract_answers(xml_file, dataset)

In [33]:
# Rebinds `df` (previously the per-correction table) to the per-answer table.
df = pd.DataFrame(dataset, columns=answer_headers)

In [35]:
# Preview the first ten per-answer rows.
df.head(10)

Unnamed: 0,language,age,i,c,prompt_key,overall_score,question_number,question_score,file_name
0,Russian,16-20,"Dear Ms Jane Clark, Recently I visited an art...","Dear Ms Clark, Recently I visited an arts fes...",You recently spent two days at an annual inter...,25.0,1,3.2,../data/dataset/0102_2000_12/doc624.xml
1,Russian,16-20,"Dear Ana, Thanks for your letter and postcard...","Dear Ana, Thanks for your letter and postcard...",This is part of a letter which you receive fro...,25.0,4,2.3,../data/dataset/0102_2000_12/doc624.xml
2,Turkish,16-20,16.12.00 Dear Jane Firstly I want to tell you...,"16.12.00 Dear Jane Firstly , I want to tell y...",You recently spent two days at an annual inter...,32.0,1,4.3,../data/dataset/0102_2000_12/doc1562.xml
3,Turkish,16-20,"16.12.00 Dear Andrew, I just received your le...","16.12.00 Dear Andrew, I just received your le...",This is part of a letter which you receive fro...,32.0,4,4.2,../data/dataset/0102_2000_12/doc1562.xml
4,German,,"Dear Jane, I have read the advertisement for ...","Dear Jane, I have read the advertisement for ...",You recently spent two days at an annual inter...,25.0,1,3.1,../data/dataset/0102_2000_12/doc180.xml
5,German,,"Dear John, Here in Brazil the rules at school...","Dear John, Here in Brazil the rules at school...",This is part of a letter which you receive fro...,25.0,4,3.2,../data/dataset/0102_2000_12/doc180.xml
6,Turkish,<16,"Dec. 16th, 2000 Dear Ms Clark, I am writing t...","Dec. 16th, 2000 Dear Ms Clark, I am writing t...",You recently spent two days at an annual inter...,38.0,1,5.3,../data/dataset/0102_2000_12/doc2135.xml
7,Turkish,<16,"Dec 16th, 2000 Dear Mary, I'm glad to have re...","Dec 16th, 2000 Dear Mary, I'm glad to have re...",This is part of a letter which you receive fro...,38.0,4,5.3,../data/dataset/0102_2000_12/doc2135.xml
8,Japanese,26-30,"Dear Jane Clark, the organiser of Internation...","Dear Jane Clark, the organiser of the Interna...",You recently spent two days at an annual inter...,22.0,1,2.3,../data/dataset/0102_2000_12/doc2121.xml
9,Japanese,26-30,Thank you for your letter. And I'd like to an...,Thank you for your letter. And I'd like to an...,This is part of a letter which you receive fro...,22.0,4,3.2,../data/dataset/0102_2000_12/doc2121.xml


In [36]:
# Persist the per-answer table; index=False keeps the row index out of the CSV.
df.to_csv('../data/answers_parsed.csv', index = False)


Your English class is going to make a short video about daily life at your school.
Your teacher has asked you to write a report, suggesting which lessons and other activities should be filmed, and why.

Write your report.


Answer one of the following two questions based on your reading of one of these set books.
Write (a) or (b) as well as the number 5 in the question box, and the title of the book next to the box.
Your answer must be about one of the books below.

Best Detective Stories of Agatha Christie --- Longman Fiction
The Old Man and the Sea --- Ernest Hemingway
Cry Freedom --- John Briley
Wuthering Heights --- Emily Brontë
A Window on the Universe --- Oxford Bookworms Collection

Either
 
'This is such a marvellous book you will want to read it again.'
Write an article for your college magazine, saying whether you think this statement is true of the book or one of the short stories you have read.


