Merge pull request #39 from recski/dev
handling of coordination, subordinating conjunctions (?), support for eksz, some new rules
recski committed Oct 6, 2015
2 parents 475c233 + 8639124 commit cd6c569
Showing 6 changed files with 155 additions and 18 deletions.
15 changes: 15 additions & 0 deletions conf/eksz_test.cfg
@@ -0,0 +1,15 @@
[dict]
input_type = eksz
input_file = test/input/eksz_test.xml
output_file = test/dict/eksz_test.json

[machine]
ext_definitions = test/machines/eksz_test.pickle
graph_dir = test/graphs/eksz_test

[deps]
lang = hu
dep_map = dep_to_4lang_hu.txt

[magyarlanc]
path = magyarlanc/magyarlanc-2.0.jar
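
The new test config wires the eksz pipeline together end to end: [dict] selects the parser and I/O paths, [machine] the pickled definitions and graph output, [deps] the Hungarian dependency map, and [magyarlanc] the parser jar. A minimal sketch (not part of the commit) of reading it; how utils.get_cfg wraps ConfigParser is an assumption, based only on utils.py importing it:

# Sketch only: section/option names come from conf/eksz_test.cfg above.
from ConfigParser import ConfigParser  # Python 2, as in src/fourlang/utils.py

cfg = ConfigParser()
cfg.read('conf/eksz_test.cfg')
print cfg.get('dict', 'input_type')   # 'eksz' -> EkszParser in dict_to_4lang
print cfg.get('deps', 'dep_map')      # 'dep_to_4lang_hu.txt'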
8 changes: 5 additions & 3 deletions dep_to_4lang_hu.txt
@@ -13,7 +13,9 @@ att 0,-
subj 1,0
obj 2,-

obl,.*,N..s.* -,- IN
obl,.*,N...2.* -,- IN
obl,.*,N...p.* -,- IN
obl,.*,N...m.* -,- AT

coord 0,0
conj 0,0

pred 0,0
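
The new obl rules make a mapping conditional on morphology: each line appears to hold a dependency type plus regexes over the governor's and dependant's MSD codes, an edge spec, and the binary machine to build (IN or AT), applied by the match() method shown in the next file. An illustration (not from the repo) with a schematic MSD string; see magyarlanc's MSD scheme for what each position really encodes:

# Illustration only: apply_dep in dep_to_4lang.py fires every rule whose
# patterns match, so this just reports which of the four obl rules fit.
import re

obl_rules = [('N..s.*', 'IN'), ('N...2.*', 'IN'),
             ('N...p.*', 'IN'), ('N...m.*', 'AT')]

def matching_obl_rules(dep_msd):
    return [binary for patt, binary in obl_rules
            if dep_msd is not None and re.match(patt, dep_msd)]

print matching_obl_rules('Nc-pm-----')   # ['AT']: only N...m.* matches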
17 changes: 11 additions & 6 deletions src/fourlang/dep_to_4lang.py
@@ -8,6 +8,7 @@

from pymachine.operators import AppendOperator, AppendToNewBinaryOperator, AppendToBinaryFromLexiconOperator # nopep8

from dependency_processor import DependencyProcessor
from lemmatizer import Lemmatizer
from lexicon import Lexicon
from utils import ensure_dir, get_cfg, print_4lang_graphs
@@ -21,8 +22,10 @@ def __init__(self, cfg):
self.lang = self.cfg.get("deps", "lang")
self.out_fn = self.cfg.get("machine", "ext_definitions")
ensure_dir(os.path.dirname(self.out_fn))
self.dependency_processor = DependencyProcessor(self.cfg)
dep_map_fn = cfg.get("deps", "dep_map")
self.read_dep_map(dep_map_fn)
self.undefined = set()
self.lemmatizer = Lemmatizer(cfg)
self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
@@ -41,9 +44,11 @@ def apply_dep(self, dep, machine1, machine2):
msd1 = dep['gov'].get('msd')
msd2 = dep['dep'].get('msd')
if dep_type not in self.dependencies:
logging.warning(
'skipping dependency not in dep_to_4lang map: {0}'.format(
dep_type))
if dep_type not in self.undefined:
self.undefined.add(dep_type)
logging.warning(
'skipping dependency not in dep_to_4lang map: {0}'.format(
dep_type))
return False # not that anyone cares
for dep in self.dependencies[dep_type]:
dep.apply(msd1, msd2, machine1, machine2)
@@ -65,7 +70,8 @@ def dep_to_4lang(self):
definition = entry['senses'][0]['definition']
if definition is None:
continue
deps = definition['deps']
deps = self.dependency_processor.process_dependencies(
definition['deps'])
if not deps:
# TODO see previous comment
continue
@@ -124,7 +130,6 @@ def convert_old_deps(self, deps):
return new_deps

def get_dep_definition(self, word, deps):
# logging.info('deps: {0}'.format(deps))
if self.lang == 'en':
deps = self.convert_old_deps(deps)

@@ -264,7 +269,7 @@ def get_standard_operators(edge1, edge2, rel, reverse):

def match(self, msd1, msd2):
for patt, msd in ((self.patt1, msd1), (self.patt2, msd2)):
if patt is not None and not patt.match(msd):
            if patt is not None and msd is not None and not patt.match(msd):
return False
return True

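The added msd is not None guard in match() is not cosmetic: compiled patterns raise TypeError when matched against None, so a token with no MSD annotation would crash apply_dep. A standalone check, independent of the codebase:

# Standalone check of why the guard is needed: pattern.match(None)
# raises TypeError in Python 2.
import re

patt = re.compile('N..s.*')
try:
    patt.match(None)
except TypeError:
    print 'msd=None must be filtered out before patt.match is called'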
121 changes: 116 additions & 5 deletions src/fourlang/dependency_processor.py
@@ -68,14 +68,48 @@ def merge(self, word1, word2, exclude=[]):
else:
pass


class NewDependencies():
def __init__(self, deps):
self.deps = deps
self.indexed = False
self.index()

def index(self):
self.tok_index = defaultdict(lambda: [None, [], []])
self.dep_index = defaultdict(list)
for d in self.deps:
self.tok_index[d['gov']['id']][0] = d['gov']
self.tok_index[d['dep']['id']][0] = d['dep']
self.tok_index[d['gov']['id']][1].append(d)
self.tok_index[d['dep']['id']][2].append(d)
self.dep_index[d['type']].append(d)

self.indexed = True

def add(self, d_type, gov, dep):
self.deps.append({"type": d_type, "gov": gov, "dep": dep})
self.indexed = False

def remove_tok(self, i):
self.deps = [
d for d in self.deps
if d['gov']['id'] != i and d['dep']['id'] != i]
self.indexed = False

def remove_type(self, d_type):
self.deps = [d for d in self.deps if d['type'] != d_type]
self.indexed = False

class DependencyProcessor():
copulars = set([
"'s", 'are', 'be', 'been', 'being', 'is', 's', 'was', 'were'])

def __init__(self, cfg):
self.cfg = cfg
self.lang = self.cfg.get("deps", "lang")

def process_coordination(self, deps):
def process_coordination_stanford(self, deps):
for word1, word_deps in deepcopy(deps.index.items()):
for i in (0, 1):
for dep, words in word_deps[i].iteritems():
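
NewDependencies keeps three views of one dependency list in sync: the raw deps, tok_index (token id -> [token, deps it governs, deps it is the dependant of]) and dep_index (type -> deps). Mutators only mark the index stale; callers rebuild it explicitly, which lets the coordination code below batch its edits. A toy run, assuming collections.defaultdict is imported at the top of the module (outside the hunks shown); the token values are invented but use the 'id'/'lemma'/'msd' keys the class and the magyarlanc processors read:

# Toy run of the indexing above (invented tokens).
kutya = {'id': 1, 'lemma': 'kutya', 'msd': 'N...'}
ugat = {'id': 2, 'lemma': 'ugat', 'msd': 'V...'}
deps = NewDependencies([{'type': 'subj', 'gov': ugat, 'dep': kutya}])
print deps.dep_index['subj']   # [the one subj dependency]
print deps.tok_index[1][2]     # deps in which token 1 is the dependant
deps.remove_type('subj')       # mutators only set indexed = False...
deps.index()                   # ...reindexing is the caller's job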
@@ -126,15 +160,92 @@ def remove_copulars(self, deps):

return deps

def process_string_dependencies(self, dep_strings):
return self.process_dependencies(
Dependencies.create_from_strings(dep_strings))
def process_conjunction_magyarlanc(self, deps):
# get 'hogy' dependants of conj relations
conjs = set((
d['dep']['id']
for d in deps.dep_index['conj'] if d['dep']['lemma'] == 'hogy'))
# then for each of these:
for conj in conjs:
govs = [
d['gov']
for d in deps.tok_index[conj][2] if d['type'] == 'conj']
for dep in deps.tok_index[conj][1]:
for gov in govs:
deps.add(dep['type'], gov, dep['dep'])

deps.remove_tok(conj)
deps.index()
return deps

def process_coordination_magyarlanc(self, deps):
# get governors of coord relations
govs = set((d['gov']['id'] for d in deps.dep_index['coord']))
# then for each of these:
for gov in govs:
# get dep-neighbours of each of these
coord = [d['dep']['id'] for d in deps.tok_index[gov][1]]
coord += [d['gov']['id'] for d in deps.tok_index[gov][2]]
# and unify their relations
# logging.info('unifying these:')
# for c in coord:
# logging.info(u"{0}".format(
# deps.tok_index[c][0]['word']))
            gov_tok = deps.tok_index[gov][0]
if gov_tok is None or gov_tok['msd'][0] != 'C':
# if the gov is not a conjunction, then it must take part
# in the unification
coord.append(gov)
else:
# otherwise it should be removed
deps.remove_tok(gov)

deps = self.unify_dependencies(
coord, deps, exclude=set(['att', 'coord', 'punct']))

# we reindex in the end only!
deps.index()
return deps

def unify_dependencies(self, tokens, deps, exclude):
for tok1 in tokens:
for tok2 in tokens:
if tok2 == tok1:
continue
for dep in deps.tok_index[tok1][1]:
if dep['type'] in exclude:
continue
# logging.info('copying: {0}'.format(dep))
deps.add(dep['type'], deps.tok_index[tok2][0], dep['dep'])
for dep in deps.tok_index[tok1][2]:
if dep['type'] in exclude:
continue
# logging.info('copying: {0}'.format(dep))
deps.add(dep['type'], dep['gov'], deps.tok_index[tok2][0])
return deps

def process_dependencies(self, deps):
if self.lang == 'en':
return self.process_stanford_dependencies(deps)
elif self.lang == 'hu':
return self.process_magyarlanc_dependencies(deps)
else:
raise Exception('unsupported language: {0}'.format(self.lang))

def process_magyarlanc_dependencies(self, deps):
deps = NewDependencies(deps)
deps.remove_type('punct')
deps.index()
deps = self.process_conjunction_magyarlanc(deps)
deps = self.process_coordination_magyarlanc(deps)
return deps.deps

def process_stanford_dependencies(self, dep_strings):
deps = Dependencies.create_from_strings(dep_strings)
deps = self.process_copulars(deps)
deps = self.remove_copulars(deps)
deps = self.process_rcmods(deps)
# deps = self.process_coordinated_root(deps)
deps = self.process_coordination(deps)
deps = self.process_coordination_stanford(deps)

return deps.get_dep_list()
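
End to end, process_magyarlanc_dependencies drops punct relations, folds subordinate clauses headed by 'hogy' ('that') onto their conj governors, and unifies relations across coordinated tokens whenever their shared governor is a conjunction. A minimal sketch with invented ids and schematic relation and MSD labels, not real magyarlanc output; cfg is any config whose [deps] lang is hu, e.g. conf/eksz_test.cfg above:

# Sketch: "mond, hogy alszik" ('says that (he) sleeps').
mond = {'id': 1, 'lemma': 'mond', 'msd': 'V...'}
hogy = {'id': 2, 'lemma': 'hogy', 'msd': 'C...'}
alszik = {'id': 3, 'lemma': 'alszik', 'msd': 'V...'}
raw = [{'type': 'conj', 'gov': mond, 'dep': hogy},
       {'type': 'att', 'gov': hogy, 'dep': alszik}]
print DependencyProcessor(cfg).process_magyarlanc_dependencies(raw)
# expected: 'hogy' is gone, its dependant re-attached to the governor:
# [{'type': 'att', 'gov': mond, 'dep': alszik}]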
10 changes: 6 additions & 4 deletions src/fourlang/dict_to_4lang.py
@@ -17,6 +17,7 @@
from stanford_wrapper import StanfordWrapper
from utils import batches, ensure_dir, get_cfg
from collins_parser import CollinsParser
from eksz_parser import EkszParser
from nszt_parser import NSzTParser
from magyarlanc_wrapper import Magyarlanc

@@ -48,6 +49,9 @@ def get_parser_and_lang(self):
elif input_type == 'collins':
self.parser = CollinsParser()
self.lang = 'eng'
elif input_type == 'eksz':
self.parser = EkszParser()
self.lang = 'hun'
elif input_type == 'nszt':
self.parser = NSzTParser()
self.lang = 'hun'
@@ -100,10 +104,8 @@ def process_entries(self, words):
if definition is None:
continue
# print 'printing deps' + str(definition['deps'])
if self.lang == 'eng':
definition['deps'] = dependency_processor.process_string_dependencies( # nopep8
definition['deps'])
# print definition['deps']
definition['deps'] = dependency_processor.process_dependencies(
definition['deps'])

if word in self.dictionary:
logging.warning(
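With the language check moved inside DependencyProcessor, process_entries makes the same call for every dictionary; what changes per language is the shape of definition['deps']. A sketch of the two input shapes, inferred from process_dependencies above, with invented example values:

# English entries carry Stanford-style dependency strings, Hungarian ones
# carry {'type', 'gov', 'dep'} dicts; process_dependencies dispatches on
# the configured [deps] lang.
en_deps = ['nsubj(barks-2, dog-1)']   # invented example string
hu_deps = [{'type': 'subj',
            'gov': {'id': 2, 'lemma': 'ugat', 'msd': 'V...'},
            'dep': {'id': 1, 'lemma': 'kutya', 'msd': 'N...'}}]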
2 changes: 2 additions & 0 deletions src/fourlang/utils.py
@@ -1,4 +1,5 @@
from ConfigParser import ConfigParser
import logging
import os

from pymachine.utils import MachineGraph
@@ -23,6 +24,7 @@ def print_text_graph(words_to_machines, graph_dir, fn='text'):
return fn

def print_4lang_graphs(lexicon, graph_dir):
logging.info('printing graphs to {0}'.format(graph_dir))
for word, machine_set in lexicon.iteritems():
print_4lang_graph(word, next(iter(machine_set)), graph_dir)

