Commit
converting Deps to NewDeps now happens in process_stanford_dependencies
lemmatization of English words should all happen in one place
minor changes
	modified:   scripts/dep_to_dot.py
	modified:   src/fourlang/dep_to_4lang.py
	modified:   src/fourlang/dependency_processor.py
	modified:   src/fourlang/similarity.py
Gabor Recski committed Feb 4, 2016
1 parent 7b8dd32 commit bd0b56e
Showing 4 changed files with 47 additions and 40 deletions.
18 changes: 11 additions & 7 deletions scripts/dep_to_dot.py
@@ -2,6 +2,7 @@
 import sys
 
 from pymachine.machine import Machine
+from fourlang.dependency_processor import Dependencies
 
 HEADER = u"digraph finite_state_machine {\n\tdpi=100;\n\trankdir=LR;\n"
 EXCLUDE = ("punct")
@@ -28,15 +29,18 @@ def dep_to_dot(deps, fn):
 
 def main():
     data = json.load(open(sys.argv[1]))
     if 'deps' in data:
-        i = 0 if len(sys.argv) == 3 else int(sys.argv[3])
-        sen = data['deps'][i]
-        dep_to_dot(sen, sys.argv[2])
+        try:
+            i = int(sys.argv[3])
+        except:
+            w = sys.argv[3]
+            sen = map(
+                Dependencies.parse_dependency,
+                data[w]['senses'][0]['definition']['deps'])
+            fn = u"{0}/{1}.dot".format(sys.argv[2], w).encode('utf-8')
+            dep_to_dot(sen, fn)
+        else:
+            sen = data['deps'][i]
+            dep_to_dot(sen, sys.argv[2])
     else:
         for word, entry in data.iteritems():
             sen = entry['senses'][0]['definition']['deps']
             fn = "{0}/{1}.dot".format(sys.argv[2], word)
             dep_to_dot(sen, fn)
 
 if __name__ == "__main__":
     main()
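The word-mode branch above feeds dependency strings through Dependencies.parse_dependency before drawing. A minimal sketch of what that parsing step yields, assuming Stanford-style dependency strings of the form "rel(gov-id, dep-id)" (the example strings below are hypothetical):

    from fourlang.dependency_processor import Dependencies

    dep_strings = ["nsubj(eats-2, cat-1)", "det(cat-1, the-0)"]
    sen = map(Dependencies.parse_dependency, dep_strings)
    # each element is a (type, (gov_word, gov_id), (dep_word, dep_id))
    # triple, e.g. ('nsubj', ('eats', 2), ('cat', 1)), which dep_to_dot
    # can then draw as a labeled edge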
44 changes: 14 additions & 30 deletions src/fourlang/dep_to_4lang.py
@@ -112,34 +112,21 @@ def parse_dependency(string):
         return dep, (word1, id1), (word2, id2)
 
     def get_root_lemmas(self, deps):
-        return [d['dep']['lemma'] for d in deps if d['type'] == 'root']
-
-    def convert_old_deps(self, deps):
-        new_deps = []
-        for dep, (word1, id1), (word2, id2) in deps:
-            lemma1, lemma2 = map(lambda w: self.lemmatizer.lemmatize(
-                w, defined=self.lexicon.get_words()) or w, (word1, word2))
-            # print 'l1:', lemma1, 'l2:', lemma2
-            new_deps.append({
-                "type": dep,
-                "gov": {
-                    "id": id1, "word": word1, "msd": None, "lemma": lemma1},
-                "dep": {
-                    "id": id2, "word": word2, "msd": None, "lemma": lemma2}
-            })
-        return new_deps
+        return [
+            d['dep'].setdefault(
+                'lemma', self.lemmatizer.lemmatize(d['dep']['word']))
+            for d in deps if d['type'] == 'root']  # TODO
 
     def get_dep_definition(self, word, deps):
-        if self.lang == 'en':
-            deps = self.convert_old_deps(deps)
-
+        deps = self.dependency_processor.process_dependencies(deps)
         root_lemmas = self.get_root_lemmas(deps)
         if not root_lemmas:
             logging.warning(
                 u'no root dependency, skipping word "{0}"'.format(word))
             return None
 
-        word2machine = self.get_machines_from_parsed_deps(deps)
+        word2machine = self.get_machines_from_deps_and_corefs(
+            [deps], [], process_deps=False)
 
         root_machines = filter(None, map(word2machine.get, root_lemmas))
         if not root_machines:
@@ -155,15 +142,11 @@ def get_dep_definition(self, word, deps):
             word_machine.append(root_machine, 0)
         return word_machine
 
-    def get_machines_from_parsed_deps(self, deps):
-        # deprecated, use get_machines_from_deps_and_corefs
-        return self.get_machines_from_deps_and_corefs([deps], [])
-
-    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
-        dep_lists = map(
-            self.dependency_processor.process_dependencies, dep_lists)
-        if self.lang == 'en':
-            dep_lists = map(self.convert_old_deps, dep_lists)
+    def get_machines_from_deps_and_corefs(
+            self, dep_lists, corefs, process_deps=True):
+        if process_deps:
+            dep_lists = map(
+                self.dependency_processor.process_dependencies, dep_lists)
         coref_index = defaultdict(dict)
         for (word, sen_no), mentions in corefs:
             for m_word, m_sen_no in mentions:
@@ -175,7 +158,8 @@ def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
         for deps in dep_lists:
            for dep in deps:
                 for t in (dep['gov'], dep['dep']):
-                    self.word2lemma[t['word']] = t['lemma']
+                    self.word2lemma[t['word']] = t.setdefault(
+                        'lemma', self.lemmatizer.lemmatize(t['word']))
 
         for i, deps in enumerate(dep_lists):
             try:
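Both lemma lookups above rely on dict.setdefault to fill in a missing 'lemma' the first time a token is seen. A self-contained sketch of the idiom (the token dict and toy lemmatizer are hypothetical stand-ins):

    def lemmatize(word):
        # toy stand-in for the real lemmatizer
        return word.rstrip('s')

    token = {"word": "cats", "id": 1}  # new-style token, no 'lemma' key yet
    print(token.setdefault("lemma", lemmatize(token["word"])))  # 'cat', now cached
    print(token.setdefault("lemma", "ignored"))                 # still 'cat'

    # caveat: setdefault evaluates its second argument eagerly, so the
    # lemmatizer runs even when 'lemma' is already present; only the
    # stored value is protected from being overwritten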
21 changes: 18 additions & 3 deletions src/fourlang/dependency_processor.py
@@ -71,6 +71,21 @@ def merge(self, word1, word2, exclude=[]):
 
 
 class NewDependencies():
+
+    @staticmethod
+    def create_from_old_deps(old_deps):
+        deps = []
+        for d_type, gov, dep in old_deps.get_dep_list():
+            deps.append({
+                "type": d_type,
+                "gov": {
+                    "word": gov[0],
+                    "id": gov[1]},
+                "dep": {
+                    "word": dep[0],
+                    "id": dep[1]}})
+        return NewDependencies(deps)
 
     def __init__(self, deps):
         self.deps = deps
         self.indexed = False
@@ -281,12 +296,12 @@ def process_magyarlanc_dependencies(self, deps):
         return deps.deps
 
     def process_stanford_dependencies(self, dep_strings):
-        deps = Dependencies(dep_strings)
-        # deps = Dependencies.create_from_strings(dep_strings)
+        # deps = Dependencies(dep_strings)
+        deps = Dependencies.create_from_strings(dep_strings)
        deps = self.process_copulars(deps)
         deps = self.remove_copulars(deps)
         deps = self.process_rcmods(deps)
         # deps = self.process_coordinated_root(deps)
         deps = self.process_coordination_stanford(deps)
 
-        return deps.get_dep_list()
+        return NewDependencies.create_from_old_deps(deps).deps
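The conversion in create_from_old_deps maps the old (type, governor, dependent) triples onto the new dict format. A quick sketch with a hypothetical input triple, in the shape parse_dependency returns:

    old_dep_list = [('nsubj', ('eats', 2), ('cat', 1))]  # hypothetical
    new_deps = [
        {"type": d_type,
         "gov": {"word": gov[0], "id": gov[1]},
         "dep": {"word": dep[0], "id": dep[1]}}
        for d_type, gov, dep in old_dep_list]
    # [{'type': 'nsubj',
    #   'gov': {'word': 'eats', 'id': 2},
    #   'dep': {'word': 'cat', 'id': 1}}]
    # note there is no 'lemma' key: dep_to_4lang now fills it in lazily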
4 changes: 4 additions & 0 deletions src/fourlang/similarity.py
@@ -49,6 +49,7 @@ def log(self, string):
 
     def uniform_similarities(self, s):
         return dict(((sim_type, s) for sim_type in WordSimilarity.sim_types))
+        # TODO return {sim_type: s for sim_type in WordSimilarity.sim_types}
 
     def zero_similarities(self):
         return self.uniform_similarities(0.0)
@@ -77,6 +78,7 @@ def machine_similarities(self, machine1, machine2):
             sims['nodes_contain'] = 1
 
         pn1, pn2 = machine1.printname(), machine2.printname()
+        # TODO
         if pn1 in links2 or pn2 in links1:
             sims['0-connected'] = 1
 
@@ -123,6 +125,7 @@ def word_similarities(self, word1, word2):
                 logging.debug("OOV: {0}".format(word1))
             if lemma2 is None:
                 logging.debug("OOV: {0}".format(word2))
+            # TODO
             word_sims = self.zero_similarities()
         else:
             word_sims = self.lemma_similarities(lemma1, lemma2)
@@ -146,6 +149,7 @@ def get_links(self, machine, depth):
         self.seen_for_links.add(machine)
         for hypernym in machine.partitions[0]:
             name = hypernym.printname()
+            # TODO
             if name == '=AGT' or not name.isupper():
                 # if depth == 0 and name not in ("lack", "to"): # TMP!!!
                 yield name
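The first TODO above suggests replacing dict() over a generator with a dict comprehension (available since Python 2.7); the two forms build the same mapping:

    # hypothetical subset of WordSimilarity.sim_types
    sim_types = ('0-connected', 'nodes_contain')
    s = 0.0
    assert dict(((sim_type, s) for sim_type in sim_types)) == \
        {sim_type: s for sim_type in sim_types}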
