In [2]:
from lxml import etree

In [15]:
class BuNaMoWrongDocument(Exception):
    """Error signalling that a document's root element is not the required one.

    Attributes:
        expected: the root element name the caller required.
        got: the root element name that was actually found.
        message: the formatted description, also passed to ``Exception``.
    """

    def __init__(self, expected, got):
        self.expected, self.got = expected, got
        text = f"Expected root element <{self.expected}> but got <{self.got}>"
        super().__init__(text)
        # Kept as an attribute as well, so callers can read it without str().
        self.message = text

In [44]:
def _read_bunamo(file, root_tag, root_attrs, valid_tags):
    """Parse one XML file and validate its root element and child tags.

    Shared by all ``read_*`` functions so the parse/validate steps exist once.

    Args:
        file: passed unchanged to ``etree.parse``.
        root_tag: the tag the root element must have.
        root_attrs: names of the root attributes to collect.
        valid_tags: child tags that are allowed under the root.

    Returns:
        ``(attribs, children)``: ``attribs`` maps every name in ``root_attrs``
        to ``root.get(name)`` (``None`` when the attribute is absent);
        ``children`` is the list of child elements in document order.

    Raises:
        BuNaMoWrongDocument: the root element's tag is not ``root_tag``.
        ValueError: a child element's tag is not in ``valid_tags``.
    """
    root = etree.parse(file).getroot()
    if root.tag != root_tag:
        raise BuNaMoWrongDocument(root_tag, root.tag)
    attribs = {name: root.get(name) for name in root_attrs}
    children = []
    for child in root:
        # lxml also yields comments and processing instructions when iterating
        # an element; their .tag is not a string, so they are skipped instead
        # of breaking the error-message concatenation below.
        if not isinstance(child.tag, str):
            continue
        if child.tag not in valid_tags:
            # ValueError is a subclass of Exception, so handlers written for
            # the previously raised bare Exception still catch it.
            raise ValueError('Unexpected tag ' + child.tag)
        children.append(child)
    return attribs, children


def _noun_form(child):
    """Build the form record shared by nouns and noun phrases."""
    return {
        'props': child.tag,
        'form': child.get('default'),
        'gender': child.get('gender'),
        'strength': child.get('strength'),
    }


def read_adjective(file):
    """Read an ``<adjective>`` document.

    Args:
        file: passed unchanged to ``etree.parse``.

    Returns:
        ``(attribs, forms)``: ``attribs`` holds the root's ``default``,
        ``declension``, ``disambig`` and ``isPre`` attributes; ``forms`` is a
        list of ``{'props': <child tag>, 'form': <child 'default'>}`` dicts in
        document order.

    Raises:
        BuNaMoWrongDocument: the root element is not ``<adjective>``.
        ValueError: a child element has an unexpected tag.
    """
    attribs, children = _read_bunamo(
        file,
        'adjective',
        ('default', 'declension', 'disambig', 'isPre'),
        ('sgNom', 'sgGenMasc', 'sgGenFem', 'plNom', 'graded', 'abstractNoun'),
    )
    forms = [{'props': child.tag, 'form': child.get('default')} for child in children]
    return attribs, forms


def read_noun(file):
    """Read a ``<noun>`` document.

    Args:
        file: passed unchanged to ``etree.parse``.

    Returns:
        ``(attribs, forms)``: ``attribs`` holds the root's ``default``,
        ``declension``, ``disambig``, ``isProper``, ``isDefinite`` and
        ``allowArticledGenitive`` attributes; ``forms`` is a list of dicts with
        keys ``props`` (child tag), ``form``, ``gender`` and ``strength``.

    Raises:
        BuNaMoWrongDocument: the root element is not ``<noun>``.
        ValueError: a child element has an unexpected tag.
    """
    attribs, children = _read_bunamo(
        file,
        'noun',
        ('default', 'declension', 'disambig', 'isProper', 'isDefinite',
         'allowArticledGenitive'),
        ('sgNom', 'sgGen', 'plNom', 'plGen'),
    )
    return attribs, [_noun_form(child) for child in children]


def read_nounphrase(file):
    """Read a ``<nounPhrase>`` document.

    Args:
        file: passed unchanged to ``etree.parse``.

    Returns:
        ``(attribs, forms)``: ``attribs`` holds the root's ``default``,
        ``declension``, ``disambig``, ``isProper``, ``isDefinite``,
        ``allowArticledGenitive`` and ``forceNominative`` attributes; ``forms``
        is a list of dicts with keys ``props`` (child tag), ``form``,
        ``gender`` and ``strength``.

    Raises:
        BuNaMoWrongDocument: the root element is not ``<nounPhrase>``.
        ValueError: a child element has an unexpected tag.
    """
    attribs, children = _read_bunamo(
        file,
        'nounPhrase',
        ('default', 'declension', 'disambig', 'isProper', 'isDefinite',
         'allowArticledGenitive', 'forceNominative'),
        ('sgNom', 'sgGen', 'plNom', 'plGen',
         'sgNomArt', 'sgGenArt', 'plNomArt', 'plGenArt'),
    )
    return attribs, [_noun_form(child) for child in children]


def read_possessive(file):
    """Read a ``<possessive>`` document.

    Args:
        file: passed unchanged to ``etree.parse``.

    Returns:
        A dict with the root's ``default``, ``disambig`` and ``mutation``
        attributes, plus an ``apos`` key (the ``default`` attribute of the
        ``<apos>`` child) when such a child exists. ``<full>`` children are
        accepted but not stored.

    Raises:
        BuNaMoWrongDocument: the root element is not ``<possessive>``.
        ValueError: a child element has an unexpected tag.
    """
    attribs, children = _read_bunamo(
        file,
        'possessive',
        ('default', 'disambig', 'mutation'),
        ('full', 'apos'),
    )
    for child in children:
        if child.tag == 'apos':
            attribs['apos'] = child.get('default')
    return attribs


def read_preposition(file):
    """Read a ``<preposition>`` document.

    Args:
        file: passed unchanged to ``etree.parse``.

    Returns:
        A dict with the root's ``default`` attribute plus one entry per child
        element, keyed by the child's tag and holding its ``default``
        attribute.

    Raises:
        BuNaMoWrongDocument: the root element is not ``<preposition>``.
        ValueError: a child element has an unexpected tag.
    """
    attribs, children = _read_bunamo(
        file,
        'preposition',
        ('default',),
        ('sg1', 'sg2', 'sg3Masc', 'sg3Fem', 'pl1', 'pl2', 'pl3'),
    )
    for child in children:
        attribs[child.tag] = child.get('default')
    return attribs


In [46]:
# Directory holding the BuNaMo XML files; defined once so the three sample
# paths below cannot drift apart.
BUNAMO_DIR = '../input/bunamo-bunachar-naisiunta-moirfeolaiochta'

# Read one sample file per reader. a/b receive the root attributes and f/g the
# per-form lists; c receives the single dict read_preposition returns.
a, f = read_adjective(f'{BUNAMO_DIR}/adjective/beag_adj1.xml')
b, g = read_noun(f'{BUNAMO_DIR}/noun/Afganastánach_masc1.xml')
c = read_preposition(f'{BUNAMO_DIR}/preposition/ag_prep.xml')
# Disabled helpers kept for reference: the first would print the raw adjective
# file; the second passes a noun file to read_adjective (presumably to exercise
# the BuNaMoWrongDocument path — confirm before re-enabling).
#!cat '../input/bunamo-bunachar-naisiunta-moirfeolaiochta/adjective/beag_adj1.xml'
#read_adjective('../input/bunamo-bunachar-naisiunta-moirfeolaiochta/noun/Acádach_masc1.xml')

In [47]:
# Display the dict that read_preposition returned for ag_prep.xml.
c

{'default': 'ag',
 'sg1': 'agam',
 'sg2': 'agat',
 'sg3Masc': 'aige',
 'sg3Fem': 'aici',
 'pl1': 'againn',
 'pl2': 'agaibh',
 'pl3': 'acu'}