In [1]:
from xml.sax import parse
from xml.sax.handler import ContentHandler
from collections import namedtuple
import os
import psutil

In [73]:
# Resident set size of the current kernel process, scaled by 1024 twice
# (bytes -> MiB, given that psutil reports rss in bytes).
# NOTE(review): this cell's execution count (73) is far ahead of the cells below,
# so the figure shown was taken after the data had been loaded, not before.
psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024

281.578125

In [3]:
# Move the working directory one level up and echo the result.
# NOTE(review): the path is relative, so every re-run of this cell climbs one
# more level; the relative paths used below only resolve after exactly one run.
os.chdir('..')
os.getcwd()

'/home/ec2-user/SageMaker/AmazonSageMaker-bluewater-nlp'

In [56]:
# Directory with the XML files parsed below (relative to the current working directory).
input_dir = '../data/rutez-lite'
# Only concepts whose <domain> text is one of these values are kept when
# concept_idx is built.
include_domains = {'GL', 'SOC-POL', 'GEO'}

In [5]:
# Sanity check: show which files are present in the input directory.
os.listdir(input_dir)

['relations.xml', 'concepts.xml', 'text_entry.xml', 'synonyms.xml']

In [6]:
# Relation names used as keys of Concept.relations.
# 'ВЫШЕ' and 'НИЖЕ' are the ones this notebook walks for hypernyms and
# hyponyms respectively; the others are presumably association / whole / part
# links (going by the Russian words) -- TODO confirm against the data set docs.
REL_ENUM = ['АСЦ', 'АСЦ1', 'АСЦ2', 'ВЫШЕ', 'НИЖЕ', 'ЦЕЛОЕ', 'ЧАСТЬ']


class Concept:
    """Mutable record for one concept.

    Attributes:
        id: identifier passed to the constructor (stored as given).
        name, gloss, domain: text fields, empty until ConceptsHandler fills them.
        relations: dict mapping every name in REL_ENUM to its own set;
            RelationsHandler adds target concept ids to these sets.
        texts: set that SynonymsHandler fills with text-entry ids.
    """

    def __init__(self, id):
        self.id = id
        # Strings are immutable, so sharing one '' object between fields is safe.
        self.name = self.gloss = self.domain = ''
        # A fresh set per relation name -- the sets must not be shared.
        self.relations = dict((rel, set()) for rel in REL_ENUM)
        self.texts = set()
        
class TextEntry:
    """Mutable record for one text entry.

    Attributes:
        id: identifier passed to the constructor (stored as given).
        name, lemma, main_word, synt_type, pos_string: text fields, empty
            until TextEntriesHandler fills them.
        concepts: set that SynonymsHandler fills with concept ids.
    """

    def __init__(self, id):
        self.id = id
        # Every textual field starts out as an empty string.
        for field in ('name', 'lemma', 'main_word', 'synt_type', 'pos_string'):
            setattr(self, field, '')
        self.concepts = set()

In [7]:
class ConceptsHandler(ContentHandler):
    """SAX handler that collects each <concept> element into ``self.concepts``.

    Accepted elements are ``concepts``, ``concept``, ``name``, ``gloss`` and
    ``domain``; any other element name raises ValueError. Character data seen
    inside ``name`` / ``gloss`` / ``domain`` is appended to the same-named
    attribute of the concept currently being built; text elsewhere is ignored.
    """

    # Child elements whose text is accumulated on the current concept.
    _TEXT_FIELDS = ('name', 'gloss', 'domain')

    def __init__(self):
        """Nothing to set up here; parsing state is created in startDocument."""

    def startDocument(self):
        """Reset the result list and the per-element parsing state."""
        self.concepts = []
        self._current_concept = None
        self._char_handler = None

    def endDocument(self):
        """Report how many concepts were collected."""
        print("Parsed %s concepts" % len(self.concepts))

    def _fill_current_concept(self, attr_name, attr_value):
        """Append ``attr_value`` to attribute ``attr_name`` of the current concept."""
        target = self._current_concept
        setattr(target, attr_name, getattr(target, attr_name) + attr_value)

    def startElement(self, name, attrs):
        """Open a concept or arm text accumulation for a text field.

        Raises:
            ValueError: on an unknown element, or on a <concept> opened while
                another one is still in progress.
        """
        if name in self._TEXT_FIELDS:
            # Bind the field name now; characters() may be called several times.
            self._char_handler = (
                lambda text, field=name: self._fill_current_concept(field, text)
            )
            return
        if name == 'concept':
            if self._current_concept:
                raise ValueError()
            self._current_concept = Concept(attrs['id'])
            return
        if name != 'concepts':
            raise ValueError("Unknown element %s" % name)

    def characters(self, content):
        """Forward character data to the active field handler, if any."""
        handler = self._char_handler
        if handler:
            handler(content)

    def endElement(self, name):
        """Close a text field or store the finished concept.

        Raises:
            ValueError: on an unknown element, or on </concept> with no
                concept in progress.
        """
        if name in self._TEXT_FIELDS:
            self._char_handler = None
        elif name == 'concept':
            if not self._current_concept:
                raise ValueError()
            finished = self._post_process(self._current_concept)
            self.concepts.append(finished)
            self._current_concept = None
        elif name != 'concepts':
            raise ValueError("Unknown element %s" % name)

    def _post_process(self, concept):
        """Convert the concept id (read as a string attribute) to int in place."""
        concept.id = int(concept.id)
        return concept

In [12]:
class RelationsHandler(ContentHandler):
    """SAX handler that records <rel> edges on the concepts of ``concept_idx``.

    For every ``<rel from=... to=... name=...>`` whose two endpoints are both
    present in the index, the target id is added to
    ``concept.relations[name]`` of the source concept. Edges with a missing
    endpoint are skipped and the missing id is remembered for the summary
    printed at the end of the document.

    Args:
        concept_idx: mapping of integer concept id to an object exposing a
            ``relations`` dict of sets.
    """

    def __init__(self, concept_idx):
        self.concept_idx = concept_idx

    def startDocument(self):
        """Reset the edge counter and the set of ids not found in the index."""
        self._counter = 0
        self._missed_concepts = set()

    def endDocument(self):
        """Print how many edges were stored and how many ids were unknown."""
        print("Parsed %s relations" % self._counter)
        print("Missed %s concepts" % len(self._missed_concepts))

    def startElement(self, name, attrs):
        """Handle the ``relations`` wrapper and individual ``rel`` elements.

        Raises:
            ValueError: on any other element name.
            KeyError: if the relation name is not a key of ``concept.relations``.
        """
        if name == 'relations':
            return
        if name != 'rel':
            raise ValueError("Unknown element %s" % name)

        from_id = int(attrs['from'])
        concept = self.concept_idx.get(from_id)
        if concept is None:
            self._missed_concepts.add(from_id)
            return

        to_id = int(attrs['to'])
        # Look the target up in the index given to this handler, not in a
        # notebook-level global that merely happens to share the name.
        if to_id not in self.concept_idx:
            self._missed_concepts.add(to_id)
            return

        concept.relations[attrs['name']].add(to_id)
        self._counter += 1

In [9]:
class TextEntriesHandler(ContentHandler):
    """SAX handler that collects each <entry> element into ``self.text_entries``.

    Accepted elements are ``entries``, ``entry``, ``name``, ``lemma``,
    ``main_word``, ``synt_type`` and ``pos_string``; any other element name
    raises ValueError. Character data seen inside one of the text fields is
    appended to the same-named attribute of the entry currently being built;
    text elsewhere is ignored.
    """

    # Child elements whose text is accumulated on the current entry.
    _TEXT_FIELDS = ('name', 'lemma', 'main_word', 'synt_type', 'pos_string')

    def __init__(self):
        """Nothing to set up here; parsing state is created in startDocument."""

    def startDocument(self):
        """Reset the result list and the per-element parsing state."""
        self.text_entries = []
        self._current_entry = None
        self._char_handler = None

    def endDocument(self):
        """Report how many text entries were collected."""
        print("Parsed %s text entries" % len(self.text_entries))

    def _fill_current(self, attr_name, attr_value):
        """Append ``attr_value`` to attribute ``attr_name`` of the current entry."""
        target = self._current_entry
        setattr(target, attr_name, getattr(target, attr_name) + attr_value)

    def startElement(self, name, attrs):
        """Open an entry or arm text accumulation for a text field.

        Raises:
            ValueError: on an unknown element, or on an <entry> opened while
                another one is still in progress.
        """
        if name in self._TEXT_FIELDS:
            # Bind the field name now; characters() may be called several times.
            self._char_handler = (
                lambda text, field=name: self._fill_current(field, text)
            )
            return
        if name == 'entry':
            if self._current_entry:
                raise ValueError()
            self._current_entry = TextEntry(int(attrs['id']))
            return
        if name != 'entries':
            raise ValueError("Unknown element %s" % name)

    def characters(self, content):
        """Forward character data to the active field handler, if any."""
        handler = self._char_handler
        if handler:
            handler(content)

    def endElement(self, name):
        """Close a text field or store the finished entry.

        Raises:
            ValueError: on an unknown element, or on </entry> with no entry
                in progress.
        """
        if name in self._TEXT_FIELDS:
            self._char_handler = None
        elif name == 'entry':
            if not self._current_entry:
                raise ValueError()
            self.text_entries.append(self._current_entry)
            self._current_entry = None
        elif name != 'entries':
            raise ValueError("Unknown element %s" % name)
    

In [16]:
class SynonymsHandler(ContentHandler):
    """SAX handler that links concepts and text entries in both directions.

    For every ``<entry_rel concept_id=... entry_id=...>`` whose concept and
    text entry are both known, the entry id is added to ``concept.texts`` and
    the concept id to ``text_entry.concepts``. Unknown concept ids are
    collected for the final summary; unknown entry ids are reported at once.

    Args:
        concept_idx: mapping of integer concept id to concept object.
        te_idx: mapping of integer text-entry id to text-entry object.
    """

    def __init__(self, concept_idx, te_idx):
        self.concept_idx = concept_idx
        self.te_idx = te_idx

    def startDocument(self):
        """Reset the link counter and the set of unknown concept ids."""
        self._counter = 0
        self._missed_concepts = set()

    def endDocument(self):
        """Print how many links were stored and how many concept ids were unknown."""
        print("Parsed %s relations" % self._counter)
        print("Missed %s concepts" % len(self._missed_concepts))

    def startElement(self, name, attrs):
        """Handle the ``synonyms`` wrapper and individual ``entry_rel`` elements.

        Raises:
            ValueError: on any other element name.
        """
        if name == 'synonyms':
            return
        if name != 'entry_rel':
            raise ValueError("Unknown element %s" % name)

        concept_id = int(attrs['concept_id'])
        concept = self.concept_idx.get(concept_id)
        if not concept:
            self._missed_concepts.add(concept_id)
            return

        # The entry id is only parsed once the concept is known to exist.
        te_id = int(attrs['entry_id'])
        te = self.te_idx.get(te_id)
        if not te:
            print("WARN: no text entry with id ", te_id)
            return

        concept.texts.add(te_id)
        te.concepts.add(concept_id)
        self._counter += 1

In [57]:
# Stream-parse concepts.xml, then index the parsed concepts by their integer id,
# keeping only those whose domain is listed in include_domains.
concepts_handler = ConceptsHandler()
with open(os.path.join(input_dir, 'concepts.xml'), mode='rb') as inp:
    parse(inp, concepts_handler)
concept_idx = {
    c.id : c
    for c in concepts_handler.concepts if c.domain in include_domains
}
# Drop the handler's reference to the unfiltered list (presumably to free memory).
del concepts_handler
len(concept_idx)

Parsed 26354 concepts


26354

In [58]:
# Stream-parse relations.xml; the handler mutates the Concept objects held in
# concept_idx in place, so nothing needs to be read back from it afterwards.
relations_handler = RelationsHandler(concept_idx)
with open(os.path.join(input_dir, 'relations.xml'), mode='rb') as inp:
    parse(inp, relations_handler)
del relations_handler

Parsed 107949 relations
Missed 39 concepts


In [59]:
# Stream-parse text_entry.xml and index the parsed text entries by their integer id.
te_handler = TextEntriesHandler()
with open(os.path.join(input_dir, 'text_entry.xml'), mode='rb') as inp:
    parse(inp, te_handler)
te_idx = {
    te.id : te
    for te in te_handler.text_entries
}
# Drop the handler's reference to the entry list once the index is built.
del te_handler

Parsed 96700 text entries


In [60]:
# Stream-parse synonyms.xml; the handler cross-links the objects in concept_idx
# and te_idx in place (concept.texts <-> text_entry.concepts).
syn_handler = SynonymsHandler(concept_idx, te_idx)
with open(os.path.join(input_dir, 'synonyms.xml'), mode='rb') as inp:
    parse(inp, syn_handler)
del syn_handler

Parsed 115091 relations
Missed 9 concepts


In [61]:
# Text entries that ended up with no linked concept after the synonyms pass.
orphan_te_ids = set(te.id for te in te_idx.values() if not te.concepts)
print("There are %s orphan text entries" % len(orphan_te_ids))
# Rebuild the index without the orphans (te_idx is rebound to the filtered dict).
te_idx = {
    te_id : te
    for te_id, te in te_idx.items()
    if te_id not in orphan_te_ids
}
len(te_idx)

There are 13 orphan text entries


96687

In [62]:
# Lookup of text entries by their name string, used by get_related for str arguments.
# NOTE(review): if two entries share the same name, the later one silently
# replaces the earlier one here -- confirm names are unique in the data.
text_idx = {
    te.name : te
    for te in te_idx.values()
}

In [71]:
def get_related(arg, rel_name, hint=None):
    """Return a concept followed by every concept reachable from it via ``rel_name``.

    Args:
        arg: either an integer concept id (looked up in ``concept_idx``) or a
            text-entry name (upper-cased, then looked up in ``text_idx``).
        rel_name: key of ``Concept.relations`` to follow transitively.
        hint: only used for a str ``arg`` whose text entry is linked to more
            than one concept: position of the wanted concept in
            ``list(te.concepts)``, which is the order the error message lists
            the candidates in. Ignored otherwise.

    Returns:
        A list starting with the resolved concept. For each concept expanded,
        its not-yet-seen related concepts are appended as a batch, then each
        of them is expanded in turn (depth first). Every concept appears at
        most once.

    Raises:
        ValueError: if ``arg`` cannot be resolved, has an unsupported type, or
            is ambiguous and no ``hint`` was given.
        KeyError: if ``rel_name`` is not a key of ``Concept.relations``.
    """
    if isinstance(arg, int):
        concept = concept_idx.get(arg)
        if not concept:
            raise ValueError("Can't find concept by id: %s" % arg)
    elif isinstance(arg, str):
        arg = arg.upper()
        te = text_idx.get(arg)
        if not te:
            raise ValueError("Can't find text entry by handle: %s" % arg)
        if len(te.concepts) == 1:
            hint = 0
        elif hint is None:
            err_msg = "There are several concepts:\n"
            for c_id in te.concepts:
                err_msg += concept_idx[c_id].name
                err_msg += '\n'
            err_msg += "Please specify which one as a hint."
            raise ValueError(err_msg)
        concept = concept_idx[list(te.concepts)[hint]]
    else:
        raise ValueError("Unknown arg type")

    # Iterative walk with a visited set: a cyclic relation graph terminates
    # instead of recursing forever, and a concept reachable through several
    # paths is reported (and expanded) only once. On a plain tree the order
    # matches the original recursive traversal.
    related = [concept]
    seen_ids = {concept.id}
    pending = [concept]
    while pending:
        current = pending.pop()
        fresh = []
        for rel_id in current.relations[rel_name]:
            if rel_id not in seen_ids:
                seen_ids.add(rel_id)
                fresh.append(concept_idx[rel_id])
        related.extend(fresh)
        # Reversed so that the first fresh concept is the next one expanded.
        pending.extend(reversed(fresh))
    return related
    
def get_hyponyms(arg, hint=None):
    """Shortcut for ``get_related`` following the 'НИЖЕ' relation.

    ``arg`` and ``hint`` are passed through unchanged.
    """
    return get_related(arg, rel_name='НИЖЕ', hint=hint)

def get_hypernyms(arg, hint=None):
    """Shortcut for ``get_related`` following the 'ВЫШЕ' relation.

    ``arg`` and ``hint`` are passed through unchanged.
    """
    return get_related(arg, rel_name='ВЫШЕ', hint=hint)

In [66]:
def print_related(arg, rel_name, hint=None):
    """Print the result of ``get_related`` as a tab-indented listing.

    Each concept goes on its own line as ``<id>TAB<name>``, followed by one
    ``TAB<text entry name>`` line per id in the concept's ``texts``.
    Arguments are forwarded to ``get_related`` unchanged.
    """
    for concept in get_related(arg, rel_name, hint):
        print("%s\t%s" % (concept.id, concept.name))
        for text_id in concept.texts:
            entry = te_idx[text_id]
            print("\t%s" % (entry.name,))

In [72]:
# Demo: list concept 118667 and everything below it via the 'НИЖЕ' relation.
# The hint argument (0) has no effect here because the concept is given by integer id.
print_related(118667, 'НИЖЕ', 0)

118667	УДОВЛЕТВОРИТЬСЯ
	ДОВОЛЬСТВОВАТЬСЯ
	ВДОВОЛЬ
	ВДОСТАЛЬ
	УДОВЛЕТВОРЕНИЕ
	УДОВЛЕТВОРИТЬСЯ
	УДОВЛЕТВОРЯТЬСЯ
	УДОВОЛЬСТВОВАТЬСЯ
	ПОЛУЧИТЬ УДОВЛЕТВОРЕНИЕ
	ПОЛУЧАТЬ УДОВЛЕТВОРЕНИЕ
118668	ОГРАНИЧИТЬСЯ (УДОВЛЕТВОРИТЬСЯ НЕМНОГИМ)
	ОГРАНИЧИВАТЬСЯ
	САМООГРАНИЧИВАТЬСЯ
	ОГРАНИЧИТЬСЯ
	ПРОБАВЛЯТЬСЯ
149907	ЛЬСТИТЬ ЧУВСТВУ
	ЛЬСТИТЬ ЧУВСТВУ
	ЛЕСТНЫЙ
	ЛЬСТИТЬ
	ПОЛЬСТИТЬ
117014	НАЕСТЬСЯ, НАСЫТИТЬСЯ
	НАКУШАТЬСЯ
	НАБИТЬ ЖИВОТ
	НАСЫТИТЬСЯ ЕДОЙ
	НАБИТЬ ЖЕЛУДОК
	НАЕДАТЬСЯ
	ДОСЫТА
	НАЕСТЬСЯ ДОСЫТА
	НАЕСТЬСЯ ДО ОТВАЛА
	НАБИТЬ ПУЗО
	НАЕСТЬСЯ
	НАЖРАТЬСЯ
	НАСЫЩЕНИЕ
	НАСЫТИТЬСЯ
	НАСЫЩАТЬСЯ
	НАБИТЬ БРЮХО
115578	НАПИТЬСЯ (ВЫПИТЬ В БОЛЬШОМ КОЛИЧЕСТВЕ)
	НАПИТЬСЯ
	НАПИВАТЬСЯ
138332	ОБОЙТИСЬ (УДОВЛЕТВОРИТЬСЯ ИМЕЮЩИМСЯ)
	ОБОЙТИСЬ
	ОБХОДИТЬСЯ
