new flags added to lexicon.get_machine and lemmatizer.lemmatize
Eszti committed Jul 16, 2016
1 parent 779b56f commit 6a25bcc
Showing 4 changed files with 36 additions and 26 deletions.
10 changes: 5 additions & 5 deletions src/fourlang/dep_to_4lang.py
@@ -114,7 +114,7 @@ def parse_dependency(string):
     def get_root_lemmas(self, deps):
         return [
             d['dep'].setdefault(
-                'lemma', self.lemmatizer.lemmatize(d['dep']['word']))
+                'lemma', self.lemmatizer.lemmatize(d['dep']['word'], uppercase=True))
             for d in deps if d['type'] == 'root']  # TODO
 
     def get_dep_definition(self, word, deps):
@@ -138,7 +138,7 @@ def get_dep_definition(self, word, deps):
             logging.info('word2machine: {0}'.format(word2machine))
             sys.exit(-1)
 
-        word_machine = self.lexicon.get_new_machine(word)
+        word_machine = self.lexicon.get_machine(word, new_machine=True)
 
         for root_machine in root_machines:
             word_machine.unify(root_machine)
@@ -162,7 +162,7 @@ def get_machines_from_deps_and_corefs(
         for dep in deps:
             for t in (dep['gov'], dep['dep']):
                 self.word2lemma[t['word']] = t.setdefault(
-                    'lemma', self.lemmatizer.lemmatize(t['word']))
+                    'lemma', self.lemmatizer.lemmatize(t['word'], uppercase=True))
 
         for i, deps in enumerate(dep_lists):
             try:
@@ -197,8 +197,8 @@ def get_machines_from_deps_and_corefs(
 
                 for lemma in (lemma1, lemma2):
                     if lemma not in word2machine:
-                        word2machine[lemma] = self.lexicon.get_new_machine(
-                            lemma)
+                        word2machine[lemma] = self.lexicon.get_machine(
+                            lemma, new_machine=True)
 
                 self.apply_dep(
                     dep, word2machine[lemma1], word2machine[lemma2])
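All four hunks above make the same pair of substitutions: lexicon.get_new_machine(word) becomes lexicon.get_machine(word, new_machine=True), and the lemmatize calls at these sites opt in to the new uppercase flag. A minimal sketch of the new call pattern, assuming already-configured Lexicon and Lemmatizer instances named lexicon and lemmatizer (hypothetical names, not taken from the diff):

    # Before this commit, a dedicated helper returned a detached machine:
    # word_machine = lexicon.get_new_machine(word)

    # After, the same behavior is requested through a flag:
    word_machine = lexicon.get_machine(word, new_machine=True)

    # lemmatize at these call sites now uppercases stopwords
    # ('have' is special-cased to 'HAS'):
    lemma = lemmatizer.lemmatize(word, uppercase=True)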
29 changes: 22 additions & 7 deletions src/fourlang/lemmatizer.py
@@ -2,6 +2,7 @@
 import os
 import sys
 
+from nltk.corpus import stopwords as nltk_stopwords
 from hunmisc.utils.huntool_wrapper import Hundisambig, Ocamorph, OcamorphAnalyzer, MorphAnalyzer  # nopep8
 from stemming.porter2 import stem as porter_stem
 
@@ -13,6 +14,10 @@ def __init__(self, cfg):
         self.cfg = cfg
         self.analyzer, self.morph_analyzer = self.get_analyzer()
 
+        self.stopwords = set(nltk_stopwords.words('english'))
+        self.stopwords.add('as')  # TODO
+        self.stopwords.add('root')  # TODO
+
         self.read_cache()
         self.oov = set()
 
@@ -30,13 +35,23 @@ def _analyze(self, word):
 
         self.cache[word] = (stem, lemma, candidates)
 
-    def lemmatize(self, word, defined=None, stem_first=False,
+    def _lemmatize_with_stopwords(self, word, uppercase):
+        if not uppercase:
+            return word
+        elif word == 'have':
+            return 'HAS'
+        elif word in self.stopwords:
+            return word.upper()
+        else:
+            return word
+
+    def lemmatize(self, word, defined=None, stem_first=False, uppercase=False,
                   debug=False):
         # if 'defined' is provided, will refuse to return lemmas not in it
 
         # if the word is defined, we just return it
         if defined is not None and word in defined:
-            return word
+            return self._lemmatize_with_stopwords(word, uppercase)
 
         # if the word is not in our cache, we run all analyses
         if word not in self.cache:
@@ -51,22 +66,22 @@ def lemmatize(self, word, defined=None, stem_first=False,
             logging.warning("stem_first=True and defined=None, \
                 'lemmatize' is now a blind Porter stemmer")
             stemmed_lemma = self.lemmatize(
-                stem, defined=defined, stem_first=False)
+                stem, defined=defined, stem_first=False, uppercase=uppercase)
             if stemmed_lemma is not None:
-                return stemmed_lemma
+                return self._lemmatize_with_stopwords(stemmed_lemma, uppercase)
 
         # we return the lemma unless it's not in defined
         if defined is None or lemma in defined:
-            return lemma
+            return self._lemmatize_with_stopwords(lemma, uppercase)
 
         # we go over the other candidates as a last resort
         for cand in candidates:
             if cand in defined:
-                return cand
+                return self._lemmatize_with_stopwords(cand, uppercase)
 
         # last resort is the porter stem:
         if stem in defined:
-            return stem
+            return self._lemmatize_with_stopwords(stem, uppercase)
 
         # if that doesn't work either, we return None
         return None
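The new uppercase flag routes every successful return value through _lemmatize_with_stopwords. A rough usage sketch of the resulting behavior (a configured Lemmatizer instance named lem is assumed, and the exact lemmas depend on the underlying morphological analyzers):

    lem.lemmatize('walked')                  # 'walk': uppercase=False keeps old behavior
    lem.lemmatize('have', uppercase=True)    # 'HAS': special-cased before the stopword check
    lem.lemmatize('as', uppercase=True)      # 'AS': 'as' was added to the stopword set above
    lem.lemmatize('walked', uppercase=True)  # 'walk': non-stopwords pass through unchanged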
21 changes: 7 additions & 14 deletions src/fourlang/lexicon.py
@@ -87,7 +87,7 @@ def add_def_graph(self, word, word_machine, dumped_def_graph,
             if not pn:
                 logging.warning(u"empty pn in node: {0}, word: {1}".format(
                     node, word))
-            node2machine[node] = self.get_new_machine(pn)
+            node2machine[node] = self.get_machine(pn, new_machine=True)
 
         for node1, adjacency in graph.adjacency_iter():
             machine1 = node2machine[node1]
@@ -162,13 +162,6 @@ def _add(self, printname, machine, lexicon):
             raise Exception("duplicate word in lexicon: '{0}'".format(lexicon))
         lexicon[printname] = set([machine])
 
-    def get_new_machine(self, printname):
-        """returns a new machine without adding it to any lexicon"""
-        # TODO
-        if printname == 'have':
-            return self.get_new_machine('HAS')
-        return Machine(printname, ConceptControl())
-
     def get_expanded_definition(self, printname):
         machine = self.expanded_lexicon.get(printname)
         if machine is not None:
@@ -179,23 +172,23 @@ def get_expanded_definition(self, printname):
         self.expanded_lexicon[printname] = machine
         return machine
 
-    def get_machine(self, printname, allow_new_base=False,
+    def get_machine(self, printname, new_machine=False, allow_new_base=False,
                     allow_new_ext=False, allow_new_oov=True):
         """returns the lowest level (base < ext < oov) existing machine
         for the printname. If none exist, creates a new machine in the lowest
         level allowed by the allow_* flags. Will always create new machines
         for uppercase printnames"""
 
+        # returns a new machine without adding it to any lexicon
+        if new_machine:
+            return Machine(printname, ConceptControl())
+
         # TODO
         if not printname:
             return self.get_machine("_empty_")
 
         if printname.isupper():
-            return self.get_new_machine(printname)
-
-        # TODO: hack
-        if printname == 'have':
-            return self.get_machine('HAS')
+            return self.get_machine(printname, new_machine=True)
 
         machines = self.lexicon.get(
             printname, self.ext_lexicon.get(
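get_new_machine is deleted outright: its job (returning a fresh Machine that is not registered in any lexicon) now sits behind the new_machine flag, which short-circuits before any lookup, and the 'have'-to-'HAS' hack moves into the lemmatizer. A sketch of the equivalence, assuming a configured Lexicon instance named lexicon (Machine and ConceptControl are the classes already imported in lexicon.py):

    m1 = lexicon.get_machine('dog', new_machine=True)  # fresh machine, never stored
    m2 = lexicon.get_machine('dog', new_machine=True)  # a second, distinct machine
    assert m1 is not m2

    # uppercase printnames still always get a fresh machine, now via the flag:
    dog = lexicon.get_machine('DOG')  # internally: get_machine('DOG', new_machine=True)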
2 changes: 2 additions & 0 deletions src/fourlang/similarity.py
@@ -98,6 +98,7 @@ def lemma_similarities(self, lemma1, lemma2):
     def word_similarities(self, word1, word2):
         if (word1, word2) in self.word_sim_cache:
             return self.word_sim_cache[(word1, word2)]
+        # TODO: uppercase flag = ?
         lemma1, lemma2 = [self.lemmatizer.lemmatize(
             word, defined=self.defined_words, stem_first=True)
             for word in (word1, word2)]
@@ -329,6 +330,7 @@ def get_sims(self):
         logging.warning('lemmatizing words to determine machine-OOVs...')
         self.non_oov = set(
             (word for word in self.non_oov
+             # TODO: uppercase flag = ?
              if self.sim_wrapper.lemmatizer.lemmatize(
                  word, defined=self.sim_wrapper.machine_wrapper.definitions,
                  stem_first=True) is not None))
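Both TODO comments in this file mark call sites that keep the default uppercase=False, so similarity scores are unchanged by this commit; the open question is whether these sites should opt in as well. Sketched with the names used in similarity.py:

    # current call (uppercase defaults to False):
    lemma = self.lemmatizer.lemmatize(
        word, defined=self.defined_words, stem_first=True)
    # the TODO asks whether it should become:
    # lemma = self.lemmatizer.lemmatize(
    #     word, defined=self.defined_words, stem_first=True, uppercase=True)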
