diff --git a/setup.py b/setup.py
index 5ce35ec..f545b8a 100644
--- a/setup.py
+++ b/setup.py
@@ -24,5 +24,5 @@
         "https://github.com/kornai/pymachine/tarball/master#egg=pymachine"],
     install_requires=[
         "nltk", "pymachine", "requests", "stemming", "unidecode", "pyzmq",
-        "graphviz"],
+        "graphviz", "scipy"],
 )
diff --git a/src/fourlang/corenlp_wrapper.py b/src/fourlang/corenlp_wrapper.py
index 16122ad..6b3dc58 100644
--- a/src/fourlang/corenlp_wrapper.py
+++ b/src/fourlang/corenlp_wrapper.py
@@ -7,9 +7,11 @@
 from longman_parser import XMLParser
 
+
 class ParserException(Exception):
     pass
 
+
 class Parser(XMLParser):
     sen_regex = re.compile(
         '(.*?)', re.S)
@@ -69,7 +71,8 @@ def parse_corenlp_output(output):
         parsed_sens = [Parser.parse_sen(sen) for sen
                        in Parser.sen_regex.findall(cl_output)]
-        parse_trees = [match for match in Parser.parse_tree_regex.findall(cl_output)]
+        parse_trees = [match
+                       for match in Parser.parse_tree_regex.findall(cl_output)]
 
         corefs_match = Parser.all_corefs_regex.search(cl_output)
         if corefs_match is None:
@@ -78,6 +81,7 @@ def parse_corenlp_output(output):
             corefs = Parser.parse_corefs(corefs_match.group(1))
         return parsed_sens, corefs, parse_trees
 
+
 class CoreNLPWrapper():
 
     def __init__(self, cfg, is_server=False):
@@ -94,6 +98,18 @@ def parse_text(self, text):
     def parse_sentences(self, sens):
         return self.parse_text("\n".join(sens))
 
+    def parse_entries(self, entries):
+        for entry in entries:
+            for sense in entry['senses']:
+                sentence = sense['definition']
+                deps, corefs, parse_trees = self.parse_text(sentence)
+                sense['definition'] = {
+                    "sen": sentence,
+                    "deps": deps[0],
+                    "parse": parse_trees}
+        return entries
+
+
 def test():
     cfg_file = 'conf/default.cfg' if len(sys.argv) < 2 else sys.argv[1]
     cfg = ConfigParser()
diff --git a/src/fourlang/dep_to_4lang.py b/src/fourlang/dep_to_4lang.py
index dd786e2..a2f9dab 100644
--- a/src/fourlang/dep_to_4lang.py
+++ b/src/fourlang/dep_to_4lang.py
@@ -130,13 +130,15 @@ def get_root_lemmas(self, deps):
                 for d in deps if d['type'] == 'root']  # TODO
 
     def get_dep_definition(self, word, deps):
-        if isinstance(deps[0], unicode):
-            # TODO
-            root_lemmas = self.get_root_lemmas(
-                NewDependencies.create_from_old_deps(
-                    Dependencies.create_from_strings(deps)).deps)
-        else:
-            root_lemmas = self.get_root_lemmas(deps)
+        # get NewDependencies from whatever type "deps" are
+        if isinstance(deps[0], unicode):  # string dependencies
+            deps = NewDependencies.create_from_old_deps(
+                Dependencies.create_from_strings(deps)).deps
+        elif isinstance(deps[0], list):  # old dependencies
+            deps = NewDependencies.create_from_old_deps(
+                Dependencies(deps)).deps
+
+        root_lemmas = self.get_root_lemmas(deps)
         deps = self.dependency_processor.process_dependencies(deps)
         if not root_lemmas:
             logging.warning(
diff --git a/src/fourlang/dict_to_4lang.py b/src/fourlang/dict_to_4lang.py
index 3056139..48ab01d 100644
--- a/src/fourlang/dict_to_4lang.py
+++ b/src/fourlang/dict_to_4lang.py
@@ -8,17 +8,17 @@
 import time
 import traceback
 
+from collins_parser import CollinsParser
+from corenlp_wrapper import CoreNLPWrapper
 from dep_to_4lang import DepTo4lang
+from eksz_parser import EkszParser
 from entry_preprocessor import EntryPreprocessor
 from lexicon import Lexicon
 from longman_parser import LongmanParser
-from wiktionary_parser import WiktParser
-from stanford_wrapper import StanfordWrapper
-from utils import batches, ensure_dir, get_cfg
-from collins_parser import CollinsParser
-from eksz_parser import EkszParser
-from nszt_parser import NSzTParser
 from magyarlanc_wrapper import Magyarlanc
+from nszt_parser import NSzTParser
+from utils import batches, ensure_dir, get_cfg
+from wiktionary_parser import WiktParser
 
 assert Lexicon  # silence pyflakes (Lexicon must be imported for cPickle)
@@ -86,9 +86,8 @@ def process_entries(self, words):
             (self.raw_dict[word] for word in words))
 
         if self.lang == 'eng':
-            stanford_wrapper = StanfordWrapper(self.cfg)
-            entries = stanford_wrapper.parse_sentences(
-                entries, definitions=True)
+            corenlp_wrapper = CoreNLPWrapper(self.cfg)
+            entries = corenlp_wrapper.parse_entries(entries)
         elif self.lang == 'hun':
             magyarlanc_wrapper = Magyarlanc(self.cfg)
             entries = magyarlanc_wrapper.parse_entries(entries)
diff --git a/src/fourlang/stanford_parser.py b/src/fourlang/stanford_parser.py
deleted file mode 100644
index caa0407..0000000
--- a/src/fourlang/stanford_parser.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import json
-import logging
-import math
-import os
-import sys
-from tempfile import NamedTemporaryFile
-
-parser = sys.argv[1]
-sys.path.append(parser)
-sys.path.append(os.path.join(os.path.dirname(parser), 'ejml-0.23.jar'))
-
-from edu.stanford.nlp.process import Morphology, PTBTokenizer, WordTokenFactory
-from edu.stanford.nlp.parser.common import ParserConstraint
-from edu.stanford.nlp.parser.lexparser import Options
-from edu.stanford.nlp.parser.lexparser import LexicalizedParser
-from edu.stanford.nlp.ling import Sentence
-from edu.stanford.nlp.trees import PennTreebankLanguagePack
-
-from java.io import StringReader
-from java.util.regex import Pattern
-
-class StanfordParser:
-
-    @staticmethod
-    def get_constraints(sentence, pos):
-        constraints = []
-        length = len(sentence)
-        if pos == 'n':
-            constraints.append(
-                ParserConstraint(0, length, Pattern.compile("NP.*")))
-        return constraints
-
-    def __init__(self, parser_file,
-                 parser_options=['-maxLength', '80',
-                                 '-retainTmpSubcategories']):
-
-        """@param parser_file: path to the serialised parser model
-        (e.g. englishPCFG.ser.gz)
-        @param parser_options: options
-        """
-
-        assert os.path.exists(parser_file)
-        options = Options()
-        options.setOptions(parser_options)
-        self.lp = LexicalizedParser.getParserFromFile(parser_file, options)
-        tlp = PennTreebankLanguagePack()
-        self.gsf = tlp.grammaticalStructureFactory()
-        self.lemmer = Morphology()
-        self.word_token_factory = WordTokenFactory()
-        self.parser_query = None
-
-    def tokenize(self, text):
-        reader = StringReader(text)
-        tokeniser = PTBTokenizer(reader, self.word_token_factory, None)
-        tokens = tokeniser.tokenize()
-        return tokens
-
-    def get_parse(self, sentence):
-        tokens = [unicode(x) for x in self.tokenize(sentence)]
-        parse = self.lp.apply(Sentence.toWordList(tokens))
-        return parse
-
-    def get_grammatical_structure(self, parse):
-        return self.gsf.newGrammaticalStructure(parse)
-
-    def get_kbest(self, query, k=3):
-        for candidate_tree in query.getKBestPCFGParses(k):
-            parse = candidate_tree.object()
-            prob = math.e ** candidate_tree.score()
-            yield prob, parse
-
-    def parse(self, sentence):
-        return self.parse_with_constraints(sentence, None)
-
-    def parse_with_constraints(self, sentence, constraints):
-        # logging.debug("getting query...")
-        query = self.lp.parserQuery()
-        if constraints is not None:
-            query.setConstraints(constraints)
-        # logging.debug("tokenizing...")
-        toks = self.tokenize(sentence)
-        # logging.debug("running parse...")
-        query.parse(toks)
-        # logging.debug("getting best...")
-        parse = query.getBestParse()
-        # logging.debug("getting gs...")
-        gs = self.get_grammatical_structure(parse)
-        # dependencies = gs.typedDependenciesCollapsed()
-        dependencies = gs.typedDependenciesCCprocessed()
-        return parse, gs, dependencies
-
-    def parse_sens(self, in_file, out_file, log=False):
-        logging.debug("reading input...")
-        with open(in_file) as in_obj:
-            sens = json.load(in_obj)
-        parsed_sens = []
-        if log:
-            log_file = NamedTemporaryFile(dir="/tmp", delete=False)
-        for c, sentence in enumerate(sens):
-            if log and c % 100 == 0:
-                log_file.write("parsed {0} sentences\n".format(c))
-                log_file.flush()
-            parse, _, dependencies = self.parse(sentence)
-
-            dep_strings = map(unicode, dependencies)
-            parsed_sens.append({
-                'sen': sentence,
-                'deps': dep_strings})
-
-        with open(out_file, 'w') as out:
-            json.dump(parsed_sens, out)
-
-    def parse_definitions(self, in_file, out_file):
-        with open(in_file) as in_obj:
-            logging.info("loading input...")
-            entries = json.load(in_obj)
-            logging.info("done!")
-        with NamedTemporaryFile(dir="/tmp", delete=False) as log_file:
-            logging.info('logging to {0}'.format(log_file.name))
-            for c, entry in enumerate(entries):
-                # log_file.write(
-                #     'entry: {0}\n'.format(entry['hw']).encode('utf-8'))
-                # log_file.flush()
-                if c % 100 == 0:
-                    log_file.write("parsed {0} entries\n".format(c))
-                    log_file.flush()
-                for sense in entry['senses']:
-                    sentence = sense['definition']
-                    if sentence is None:
-                        continue
-                    # sentence += '.'  # fixes some parses and ruins others
-                    pos = sense['pos']
-                    constraints = StanfordParser.get_constraints(sentence, pos)
-                    try:
-                        parse, _, dependencies = self.parse_with_constraints(
-                            sentence, constraints)
-                    except:
-                        sys.stderr.write(
-                            u'parse failed on sentence: {0}'.format(
-                                sentence).encode('utf-8'))
-                        dep_strings = []
-                    else:
-                        dep_strings = map(unicode, dependencies)
-
-                    sense['definition'] = {
-                        'sen': sentence,
-                        'deps': dep_strings}
-
-        with open(out_file, 'w') as out:
-            json.dump(entries, out)
-
-def test():
-    logging.warning("running test, not main!")
-    parser = StanfordParser(sys.argv[2])
-
-    # dv_model = parser.lp.reranker.getModel()
-    # print dv_model
-
-    # sentence = 'the size of a radio wave used to broadcast a radio signal'
-    sentence = 'a man whose job is to persuade people to buy his company\'s \
-        products.'
-    pos = 'n'
-    parse, gs, dependencies = parser.parse_with_constraints(
-        sentence, StanfordParser.get_constraints(sentence, pos))
-
-    print type(parse), type(gs)
-    print parse.pennPrint()
-    print "\n".join(map(str, dependencies))
-
-def main():
-    parser_file, in_file, out_file, is_defs, loglevel = sys.argv[2:7]
-    logging.basicConfig(
-        level=int(loglevel),
-        format="%(asctime)s : " +
-        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
-    logging.info("initializing parser...")
-    parser = StanfordParser(parser_file)
-    logging.info("done!")
-    if int(is_defs):
-        parser.parse_definitions(in_file, out_file)
-    else:
-        parser.parse_sens(in_file, out_file)
-
-if __name__ == "__main__":
-    main()
-    # test()
diff --git a/src/fourlang/stanford_wrapper.py b/src/fourlang/stanford_wrapper.py
deleted file mode 100644
index dee7004..0000000
--- a/src/fourlang/stanford_wrapper.py
+++ /dev/null
@@ -1,218 +0,0 @@
-from ConfigParser import ConfigParser
-import json
-import logging
-import os
-import requests
-import subprocess
-from subprocess import Popen, PIPE
-import sys
-from tempfile import NamedTemporaryFile
-
-from utils import ensure_dir
-
-class StanfordWrapper():
-
-    http_request_headers = {
-        'Content-type': 'application/json', 'Accept': 'text/plain'}
-
-    class ParserError(Exception):
-        pass
-
-    def __init__(self, cfg, is_server=False):
-        self.cfg = cfg
-        remote = self.cfg.getboolean('stanford', 'remote')
-        if is_server or not remote:
-            self.get_stanford_paths()
-            if is_server:
-                # used as server
-                self.start_parser()
-                self.parse_sentences = self.parse_sentences_server
-            else:
-                # standalone, using jython
-                self.get_jython_paths()
-                self.parse_sentences = self.parse_sentences_local
-        else:
-            # used as client
-            self.server_url = self.cfg.get('stanford', 'url')
-            self.parse_sentences = self.parse_sentences_remote
-
-    def get_stanford_paths(self):
-        self.stanford_dir = self.cfg.get('stanford', 'dir')
-        parser_fn = self.cfg.get('stanford', 'parser')
-        self.model_fn = self.cfg.get('stanford', 'model')
-        self.parser_path = os.path.join(self.stanford_dir, parser_fn)
-        self.model_path = os.path.join(self.stanford_dir, self.model_fn)
-        if not (os.path.exists(self.parser_path) and
-                os.path.exists(self.model_path)):
-            raise Exception("cannot find parser and model files!")
-
-    def get_jython_paths(self):
-        self.jython_path = self.cfg.get('stanford', 'jython')
-        if not os.path.exists(self.jython_path):
-            raise Exception("cannot find jython executable!")
-
-        self.jython_module = os.path.join(
-            os.path.dirname(__file__), "stanford_parser.py")
-
-        self.tmp_dir = self.cfg.get('data', 'tmp_dir')
-        ensure_dir(self.tmp_dir)
-
-    def start_parser(self):
-        command = [
-            'java', '-mx1500m', '-cp', '{0}/*:'.format(self.stanford_dir),
-            'edu.stanford.nlp.parser.lexparser.LexicalizedParser',
-            '-outputFormat', 'typedDependenciesCollapsed',
-            '-sentences', 'newline',
-            'edu/stanford/nlp/models/lexparser/{0}'.format(self.model_fn),
-            '-']
-
-        logging.info(
-            "starting stanford parser with this command: {0}".format(
-                ' '.join(command)))
-
-        self.parser_process = Popen(command, stdin=PIPE, stdout=PIPE)
-
-    def parse_sentences_server(self, sens, definitions=False):
-        parsed_sens = []
-        for c, sentence in enumerate(sens):
-            parsed_sens.append({'sen': sentence, 'deps': []})
-            # logging.info('writing to stdin...')
-            self.parser_process.stdin.write(sentence+'\n')
-            self.parser_process.stdin.flush()
-            # logging.info('reading from stdout...')
-            line = self.parser_process.stdout.readline().strip()
-            while line:
-                # logging.info('read this: {0}'.format(repr(line)))
-                if line == '':
-                    break
-                parsed_sens[-1]['deps'].append(line.strip())
-                line = self.parser_process.stdout.readline().strip()
-
-        # logging.info('returning parsed sens')
-        return parsed_sens
-
-    def create_input_file(self, sentences, token):
-        sen_file = NamedTemporaryFile(
-            dir=self.tmp_dir, prefix=token, delete=False)
-        for sen in sentences:
-            # need to add a period so the Stanford Parser knows where
-            # sentence boundaries are. There should be a smarter way...
-            sen_file.write(
-                u"{0}\n".format(sen['sen']).encode('utf-8'))
-
-        return sen_file.name
-
-    def run_parser(self, in_file, out_file, definitions):
-        return_code = subprocess.call([
-            self.jython_path, self.jython_module, self.parser_path,
-            self.model_path, in_file, out_file, str(int(definitions)),
-            str(logging.getLogger(__name__).getEffectiveLevel())])
-        return return_code == 0
-
-    def parse_sentences_old(self, sentences):
-        """sentences should be a list of dictionaries, each with a "sen" key
-        whose value will be parsed, a "deps" key whose value is a list for
-        collecting dependencies, and a "pos" key that may map to constraints on
-        the parse"""
-        with NamedTemporaryFile(dir=self.tmp_dir, delete=False) as in_file:
-            json.dump(sentences, in_file)
-            in_file_name = in_file.name
-        with NamedTemporaryFile(dir=self.tmp_dir, delete=False) as out_file:
-            success = self.run_parser(in_file_name, out_file.name)
-            if not success:
-                logging.critical(
-                    "jython returned non-zero exit code, aborting")
-                raise StanfordWrapper.ParserError()
-            parsed_sentences = json.load(out_file)
-        sentences.update(parsed_sentences)
-        return True
-
-    def parse_sentences_remote(self, entries, definitions=False):
-        req = requests.get(
-            self.server_url, data=json.dumps(entries),
-            headers=StanfordWrapper.http_request_headers)
-
-        return json.loads(req.text)
-
-    def parse_sentences_local(self, entries, definitions=False):
-        with NamedTemporaryFile(dir=self.tmp_dir, delete=False) as in_file:
-            json.dump(entries, in_file)
-            in_file_name = in_file.name
-        logging.info("dumped input to {0}".format(in_file_name))
-
-        with NamedTemporaryFile(dir=self.tmp_dir, delete=False) as out_file:
-            out_file_name = out_file.name
-        logging.info("writing parses to {0}".format(out_file_name))
-        success = self.run_parser(in_file_name, out_file_name, definitions)
-
-        if not success:
-            logging.critical(
-                "jython returned non-zero exit code, aborting")
-            raise StanfordWrapper.ParserError()
-
-        logging.debug("reading output...")
-        with open(out_file_name) as out_file:
-            new_entries = json.load(out_file)
-
-        return new_entries
-
-def main_flask(wrapper):
-    from flask import Flask, request, Response
-    app = Flask(__name__)
-
-    @app.route("/")
-    def hello():
-        sens = request.get_json()
-        # logging.info('got this: {0}'.format(sens))
-        parsed_sens = wrapper.parse_sentences(sens)
-        # logging.info('returning response...')
-        # logging.info('returning this: {0}'.format(parsed_sens))
-        return Response(json.dumps(parsed_sens), mimetype='application/json')
-
-    app.run()
-
-TEST_DATA = [
-    ("rawhide", "leather that is in its natural state", "n"),
-    ("playback", "the playback of a tape that you have recorded is when you play it on a machine in order to watch or listen to it", "n"),  # nopep8
-    ("playhouse", "a theatre - used in the name of theatres", "n"),
-    ("extent", "used to say how true something is or how great an effect or change is", "n"),  # nopep8
-    ("indigenous", "indigenous people or things have always been in the place where they are, rather than being brought there from somewhere else", "n"),  # nopep8
-    ("off-street", "places for parking that are not on public streets", "n"),
-    ("half-caste", "a very offensive word for someone whose parents are of different races.", "n"),  # nopep8
-    ("concordant", "being in agreement or having the same regular pattern", "n"),  # nopep8
-    ("groundsman", "a man whose job is to take care of a large garden or sports field", "n")  # nopep8
-]
-def test(wrapper):
-
-    entries = [{"hw": w,
-                "senses": [{
-                    "definition": d, "pos": "a" if n else 'a', "flags": []}]}
-               for w, d, n in TEST_DATA]
-    entries += [{
-        "hw": "wombat",
-        "senses": [{
-            "definition": "an Australian animal like a small bear whose babies\
-            live in a pocket of skin on its body",
-            "pos": "n",
-            "flags": []}]}]
-
-    parsed_entries = wrapper.parse_sentences(
-        entries, definitions=True)
-    print json.dumps(parsed_entries)
-
-def main():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s : " +
-        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")
-
-    cfg_file = 'conf/default.cfg' if len(sys.argv) < 2 else sys.argv[1]
-    cfg = ConfigParser()
-    cfg.read([cfg_file])
-
-    wrapper = StanfordWrapper(cfg)
-    test(wrapper)
-
-
-if __name__ == '__main__':
-    main()