# Complexity of dataset for semantic parsing

The measures are taken from the paper ["Sources of Complexity in Semantic Frame Parsing for Information Extraction"](https://hal.archives-ouvertes.fr/hal-01731385/document).

The goal is to analyze the complexity of the HuRIC corpus with respect to these measures.

In [1]:
import os
from collections import defaultdict
from pathlib import Path
import xml.etree.ElementTree as ET

# Folder holding the HuRIC XML documents to analyze (handed to `load_xmls`).
# The path is relative, so it is resolved against the current working directory.
DATASET_LOCATION = '../data/huric_eb/modern/source'

def load_xmls(folder):
    """Parse every regular file found directly inside `folder` as XML.

    Files are visited in sorted path order, so the result is deterministic.
    Sub-folders are skipped (they are not descended into).

    Args:
        folder: path (str or Path) of the directory holding the XML documents.

    Returns:
        A list with the root `Element` of each parsed document.

    Raises:
        xml.etree.ElementTree.ParseError: if a file is not well-formed XML.
    """
    path = Path(folder)
    files_list = sorted(el for el in path.iterdir() if el.is_file())
    file_contents = []
    for file in files_list:
        # Binary mode lets the XML parser choose the encoding from the XML
        # declaration, instead of decoding the bytes with the platform default
        # first (which breaks on documents that are not in that encoding).
        with open(file, 'rb') as file_in:
            tree = ET.parse(file_in)
        file_contents.append(tree.getroot())
    return file_contents

def get_lu_pos(root, verbose=False):
    """Collect the part-of-speech tag of each lexical unit token of a document.

    The tag of a lexical unit is looked up, by token id, among the entries
    found under `tokens/token`.

    Args:
        root: root element of the parsed document.
        verbose: when true, print the document id followed by the tags.

    Returns:
        The list of `pos` values, one per lexical unit token, in document order.
    """
    pos_by_token = {}
    for token in root.findall('tokens/token'):
        pos_by_token[token.attrib['id']] = token.attrib['pos']

    lu_pos = []
    for lu_token in root.findall('semantics/frameSemantics/frame/lexicalUnit/token'):
        lu_pos.append(pos_by_token[lu_token.attrib['id']])

    if verbose:
        print(root.attrib['id'], lu_pos)
    return lu_pos

def get_lu_are_roots(root, verbose=False):
    """Tell, for each lexical unit token, whether it is a dependency root.

    A token counts as a root when its id is the `to` end of a `dependencies/dep`
    entry whose `type` is 'root'.

    Args:
        root: root element of the parsed document.
        verbose: when true, print the document id followed by the flags.

    Returns:
        A list of booleans, one per lexical unit token, in document order.
    """
    root_targets = []
    for dep in root.findall('dependencies/dep'):
        if dep.attrib['type'] == 'root':
            root_targets.append(dep.attrib['to'])

    lu_are_roots = []
    for lu_token in root.findall('semantics/frameSemantics/frame/lexicalUnit/token'):
        lu_are_roots.append(lu_token.attrib['id'] in root_targets)

    if verbose:
        print(root.attrib['id'], lu_are_roots)
    return lu_are_roots

def get_lengths(root, verbose=False):
    """Return the length of the command, measured in tokens.

    Args:
        root: root element of the parsed document.
        verbose: when true, print the document id followed by the result, in
            the same way as the other measures do.

    Returns:
        A one-element list holding the number of `tokens/token` entries. A list
        is returned so that the result can be aggregated like the per-lexical-unit
        measures.
    """
    # TODO shouldn't be frame-based? find min and max token id and do difference
    lengths = [len(root.findall('tokens/token'))]
    if verbose:
        print(root.attrib.get('id'), lengths)
    return lengths

def get_lu_depths(root, verbose=False):
    """Return the depth in the dependency tree of each lexical unit token.

    The depth of a token is the number of edges between it and the token
    attached to the artificial root (head id '0'), which therefore has depth 0.

    Args:
        root: root element of the parsed document.
        verbose: when true, print the document id followed by the depths.

    Returns:
        A list of depths, one per lexical unit token, in document order. The
        value -1 marks a broken annotation: the chain of heads contains a cycle,
        reaches a head that has no incoming edge, or the lexical unit token does
        not appear in the dependencies at all.
    """
    # TODO the depth should be relative to the frame
    # token id -> id of its head
    to_father = {d.attrib['to']: d.attrib['from'] for d in root.findall('dependencies/dep')}
    depths = {}
    for el in to_father:
        depth = 0
        visited = {el}
        f = to_father[el]
        # the root has id == '0'
        while f != '0':
            if f not in to_father or f in visited:
                # broken annotations: dangling head or cycle
                depth = -1
                break
            visited.add(f)
            depth += 1
            f = to_father[f]
        depths[el] = depth
    lu_idxs = [lu.attrib['id'] for lu in root.findall('semantics/frameSemantics/frame/lexicalUnit/token')]
    # a lexical unit without any dependency edge is a broken annotation too
    lu_depths = [depths.get(lu_id, -1) for lu_id in lu_idxs]
    if verbose:
        print(root.attrib['id'], lu_depths)
    return lu_depths

def get_lu_positions(root, verbose=False):
    """Return the position of each lexical unit token in the command.

    The position is the token id converted to an integer.

    Args:
        root: root element of the parsed document.
        verbose: when true, print the document id followed by the positions,
            in the same way as the other measures do.

    Returns:
        A list of integer positions, one per lexical unit token, in document
        order.

    Raises:
        ValueError: if a lexical unit token id is not an integer literal.
    """
    # TODO the position should be relative to the frame
    lu_tokens = root.findall('semantics/frameSemantics/frame/lexicalUnit/token')
    lu_positions = [int(lu.attrib['id']) for lu in lu_tokens]
    if verbose:
        print(root.attrib.get('id'), lu_positions)
    return lu_positions

In [2]:
xml_docs = load_xmls(DATASET_LOCATION)

# One frequency table per measure: observed value -> number of occurrences.
lu_pos_all = defaultdict(int)
lu_are_roots_all = defaultdict(int)
lengths_all = defaultdict(int)
lu_depths_all = defaultdict(int)
lu_positions_all = defaultdict(int)

# Each measure is paired with the table that accumulates its values.
measures = [
    (get_lu_pos, lu_pos_all),
    (get_lu_are_roots, lu_are_roots_all),
    (get_lengths, lengths_all),
    (get_lu_depths, lu_depths_all),
    (get_lu_positions, lu_positions_all),
]
for doc in xml_docs:
    for measure, table in measures:
        for value in measure(doc):
            table[value] += 1

# POS tags are listed from the most to the least frequent; the other measures
# are listed by increasing value.
pos_by_frequency = sorted(lu_pos_all.items(), key=lambda item: item[1], reverse=True)
print('lu_pos', pos_by_frequency)
print('lu_are_roots', sorted(lu_are_roots_all.items()))
print('lengths', sorted(lengths_all.items()))
print('lu_depths', sorted(lu_depths_all.items()))
print('lu_positions', sorted(lu_positions_all.items()))

lu_pos [('VB', 537), ('VBZ', 31), ('EX', 13), ('VBD', 12), ('VBN', 5), ('RP', 3), ('NN', 1)]
lu_are_roots [(False, 114), (True, 488)]
lengths [(2, 1), (3, 41), (4, 71), (5, 40), (6, 77), (7, 105), (8, 51), (9, 51), (10, 27), (11, 18), (12, 11), (13, 10), (14, 10), (15, 6), (16, 1), (17, 3), (18, 1), (19, 1), (20, 1)]
lu_depths [(0, 488), (1, 100), (2, 13), (3, 1)]
lu_positions [(1, 342), (2, 77), (3, 55), (4, 50), (5, 11), (6, 18), (7, 11), (8, 13), (9, 8), (10, 4), (11, 6), (12, 3), (14, 3), (15, 1)]
