diff --git a/.gitignore b/.gitignore index 4045b59..fddb97b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,9 @@ *.pyc *.swp data +test +input + +build +dist +*egg-info diff --git a/conf/default.cfg b/conf/default.cfg index db492c8..8af311f 100644 --- a/conf/default.cfg +++ b/conf/default.cfg @@ -37,3 +37,7 @@ graph_dir = test/graphs/wikt_test [deps] dep_map = dep_to_4lang.txt + +[qa] +input_file = test/input/clef_qa_sample.xml +output_file = test/qa/clef_qa_sample.answers diff --git a/conf/qa.cfg b/conf/qa.cfg new file mode 100644 index 0000000..238eb89 --- /dev/null +++ b/conf/qa.cfg @@ -0,0 +1,3 @@ +[qa] +input_file = test/input/clef_qa_sample.xml +output_file = test/qa/clef_qa_sample.answers diff --git a/conf/wikt_medium.cfg b/conf/wikt_medium.cfg new file mode 100644 index 0000000..8ebb04d --- /dev/null +++ b/conf/wikt_medium.cfg @@ -0,0 +1,9 @@ +[dict] +input_type = wiktionary +input_file = test/input/wikt_medium.xml +output_file = test/dict/wikt_medium.json + +[machine] +definitions = data/machines/4lang.pickle:0 +ext_definitions = test/machines/wikt_medium.pickle +graph_dir = test/graphs/wikt_medium diff --git a/conf/wikt_small.cfg b/conf/wikt_small.cfg new file mode 100644 index 0000000..87adacc --- /dev/null +++ b/conf/wikt_small.cfg @@ -0,0 +1,9 @@ +[dict] +input_type = wiktionary +input_file = test/input/wikt_small.xml +output_file = test/dict/wikt_small.json + +[machine] +definitions = data/machines/4lang.pickle:0 +ext_definitions = test/machines/wikt_small.pickle +graph_dir = test/graphs/wikt_small diff --git a/default_index b/default_index deleted file mode 100644 index b81da25..0000000 --- a/default_index +++ /dev/null @@ -1,150 +0,0 @@ - 365 in 2758 - 331 at 2744 - 59 place 1026 - 45 can 1246 - 40 for 2782 - 38 from 2742 - 24 sound 993 - 18 light 739 - 17 man 744 - 15 society 2285 - 15 open 1814 - 14 hair 3359 - 13 thin 2598 - 13 pay 812 - 13 catch 828 - 12 space 2327 - 11 right 1191 - 11 area 2366 - 10 state 76 - 10 lead 2617 - 10 get 1223 - 9 like 3382 - 9 cook 825 - 8 to 2743 - 8 state 77 - 7 straight 563 - 7 man 659 - 7 bend 975 - 6 trunk 2759 - 6 close 3381 - 5 with 60 - 5 thick 2752 - 5 succeed 2718 - 5 room 2235 - 5 place 2326 - 5 lot 3394 - 5 fat 3337 - 5 fall 2694 - 5 bend 1112 - 4 thick 2134 - 4 season 548 - 4 ring 402 - 4 right 3122 - 4 mean 1186 - 4 firm 2215 - 4 care 82 - 4 bite 1001 - 3 trick 244 - 3 -th 5 - 3 spring 2318 - 3 spirit 2181 - 3 shut 2668 - 3 second 1569 - 3 right 1199 - 3 pay 237 - 3 order 2739 - 3 letter 278 - 3 let 971 - 3 expression 1332 - 3 develop 758 - 3 close 1413 - 3 circular 1294 - 3 back 2639 - 2 space 2509 - 2 sink 2747 - 2 set 2746 - 2 post 2740 - 2 plane 2807 - 2 order 1942 - 2 may 1560 - 2 march 1563 - 2 light 1381 - 2 letter 1539 - 2 interest 517 - 2 in 10 - 2 horn 2772 - 2 from 1838 - 2 flat 1493 - 2 figure 140 - 2 fast 940 - 2 court 2515 - 2 course 1927 - 2 company 2549 - 2 base 146 - 2 act 2373 - 1 wound 2068 - 1 weary 713 - 1 warm 1655 - 1 use_for 2782 - 1 turn 860 - 1 trunk 1910 - 1 trouble 217 - 1 tire 2528 - 1 thin 1038 - 1 the 62 - 1 temple 982 - 1 take 654 - 1 stroke 2749 - 1 steep 1673 - 1 stage 837 - 1 spoil 1640 - 1 sound 512 - 1 since 1839 - 1 second 1570 - 1 ring 2735 - 1 return 2643 - 1 regard 2312 - 1 play 1175 - 1 pet 966 - 1 mrs 35 - 1 mock 69 - 1 miss 1357 - 1 minister 3402 - 1 meal 543 - 1 match 1134 - 1 manner 1706 - 1 like 1701 - 1 kind 1274 - 1 keep 1646 - 1 in 2757 - 1 hail 1179 - 1 habit 2237 - 1 game 1173 - 1 formal 143 - 1 fit 1135 - 1 fat 1399 - 1 fall 1883 - 1 expression 64 - 1 express 2757 - 1 effect 1014 - 1 draw 2707 - 1 dislike 3382 - 1 develop 759 - 1 desert 1981 - 1 cure 934 - 1 cup 395 - 1 court 3124 - 1 cool 1103 - 1 comfort 1240 - 1 club 355 - 1 circular 1389 - 1 charge 457 - 1 chance 2770 - 1 cage 1307 - 1 bow 2698 - 1 bore 2697 - 1 blow 864 - 1 beam 2722 - 1 bathe 873 - 1 balance 1607 diff --git a/longman_old b/longman_old deleted file mode 100644 index 7f3b62c..0000000 --- a/longman_old +++ /dev/null @@ -1,444 +0,0 @@ --able -accordance -accustom -admiration --al -alike -all right -aloud -altogether -amongst --an --ance -ankle -ant -anybody -anyhow -apparatus -appoint -April --ar -arch -archway -arise -armour -arms -arrow -ash(es) -aside --ate --ation -attentive -August -aunt -axe -bacteria (-ium) -banana -bare -barrel -basin -bathe -beard -bee -beg -berry -Bible -bind -bleed -bless -blood -bold -bowel -brass -breadth -British -Buddhist -Buddhism -bunch -cage -camel -candle -captain -cardboard -cart -cave -cease -cement -cent -certainty -chairman -chalk -charm -cheer -cheerful -chief -child, children -chimney -Christian -Christianity -Christmas -civilize -clerk -coarse -coconut -combine -command -companion -compass -compound -conquer -conscience -consonant -content -contents -convenient -copper -cord -coward -creep -cricket -crown -cultivate -curse -cycle -dare -daring -dear -December -decimal -declare -deed -deer -delight -descend -descriptive -desert -determine -devil -diamond -dip -dis- -discourage -ditch -division --dom -donkey -doorway -dot -double -drag -drown --ed -elastic -elder -eldest -electrical -electrician -elephant -empire -en- --en --ence -English -enquire --er --ese -eyelid -fade -faint -fairy -fame -fancy -fate -favour -feast -February -fellow -fever -fierce -fisherman -fond -foolish -forbid -fore- -forehead -formerly -fort -fox -framework -Friday -fro --ful -fulfil -funny -furnish -gaiety -garment -gay -generosity -gentleman -glorious -glory -god -God -goddess -golden -grace -grand -grave -grief -grieve -guilt -gunpowder -habitual -handkerchief -happening -happiness -harbour -haste -hasten -hastily -hasty -heap -hen -Hindu -Hinduism -hire -honesty -honourable --hood -horizon -host -humble -hunger -hut -I --ible --ic --ical -icy -im- -in- -inch -indeed -influential -inform --ing -inn -inquire -inquiry -instruct --ion -ir- --ish --ist --ity --ive --ization --ize -January -jealous -jealousy -jelly -Jew -Jewish -joy -July -June -kilo -kingdom -labour -ladder -lady -lately -latter -lawful -lead --less -lid -lightning --like -limb -lime -loaf -lodge -lodgings -log -lump --ly -mad -magician -mail -manners -March -master -mat -May -mend --ment -mercy -merry -messenger -metric -microscope -mid- -mill -mis- -modest -Monday -mosque -mosquito -Mr -Mrs -Muslim -native -naval -neighbour --ness -nest -noble -nobleman -nobody -non- -none -no one -November -nylon -October -odd --or -ornament -otherwise --ous -overflow -owing to -ox(en) -packet -palace -parcel -paste -pastry -peculiar -penny -per -permit -photography -pillar -policeman -popularity -postage stamp -precious -presence -pride -prick -prickle -prickly -probable -probability -produce -provision(s) -pupil -quite -rapid -re- -receipt -recognition -rejoice -remains -republic -respectful -ripe -rod --ry -Saturday -scarce -scarcely -scenery -scold -secrecy -seize -self -sensation -senseless -September -shield -shilling --ship -signature -sir -slave -slip -so-colled -solemn -somebody -sorrow -spade -spear -spirit -splendid -stocking(s) -stranger -stroke -stupidity -subtract -Sunday -supper -swift -tears -telegram -telegraph -temple -tender -terror --th -thirst -Thursday -thus -tiger -till -timetable -tin -tire -tour -tremble -tropic -tropics -troublesome -trunk -trustworthy -Tuesday -un- -uncle -underneath -unity -upon --ure -urge -vary -vessel -vice- -voyage -wander --ward(s) -watchman -wax -Wednesday -weep -well- -wherever -whichever -whisper -wicked -wind v -witness -wonder --work -worm -worship -worthy (of) -wreck --y -yard -yesterday -yield -youth diff --git a/src/fourlang/clef_qa_parser.py b/src/fourlang/clef_qa_parser.py index 378c038..0a84664 100644 --- a/src/fourlang/clef_qa_parser.py +++ b/src/fourlang/clef_qa_parser.py @@ -25,7 +25,7 @@ def get_questions(r_body): answers = [ {"id": int(a_id), "answer": a_str.strip()} for a_id, a_str in QAParser.answer_regex.findall(q_body)] - questions.append({"id": int(q_id), "question": q_str.strip(), + questions.append({"id": int(q_id), "q_text": q_str.strip(), "answers": answers}) return questions @@ -35,7 +35,7 @@ def get_tests(xml): for r_id, r_body in QAParser.test_regex.findall(t_body): docs = [ {"id": int(d_id), - "doc": QAParser.html_parser.unescape(d_body).strip()} + "text": QAParser.html_parser.unescape(d_body).strip()} for d_id, d_body in QAParser.doc_regex.findall(r_body)] questions = QAParser.get_questions(r_body) yield {'t_id': int(t_id), 't_name': t_name, 'r_id': int(r_id), diff --git a/src/fourlang/dep_to_4lang.py b/src/fourlang/dep_to_4lang.py index ae6f0d7..c2b6db1 100644 --- a/src/fourlang/dep_to_4lang.py +++ b/src/fourlang/dep_to_4lang.py @@ -142,6 +142,8 @@ def get_machines_from_deps_and_corefs(self, dep_lists, corefs): # logging.info('w1: {0}, w2: {1}'.format(word1, word2)) c_word1 = coref_index[word1].get(i, word1) c_word2 = coref_index[word2].get(i, word2) + + """ if c_word1 != word1: logging.warning( "unifying '{0}' with canonical '{1}'".format( @@ -150,6 +152,7 @@ def get_machines_from_deps_and_corefs(self, dep_lists, corefs): logging.warning( "unifying '{0}' with canonical '{1}'".format( word2, c_word2)) + """ # logging.info( # 'cw1: {0}, cw2: {1}'.format(c_word1, c_word2)) diff --git a/src/fourlang/qa.py b/src/fourlang/qa.py new file mode 100644 index 0000000..6e4c030 --- /dev/null +++ b/src/fourlang/qa.py @@ -0,0 +1,55 @@ +import logging +import sys + +import nltk.data + +from clef_qa_parser import QAParser +from text_to_4lang import TextTo4lang +from utils import get_cfg + +__LOGLEVEL__ = 'INFO' + +class QuestionAnswerer: + def __init__(self, cfg): + self.cfg = cfg + + nltk.download('punkt', quiet=True) + self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') + self.text_to_4lang = TextTo4lang(cfg) + + def answer_question(self, question, model): + return 'No idea yet' + + def run(self): + logging.info('running QA...') + input_file = self.cfg.get('qa', 'input_file') + for entry in QAParser.parse_file(input_file): + logging.info('processing text...') + sens = [] + for doc in entry['docs']: + sens += self.sent_detector.tokenize(doc['text']) + + model = self.text_to_4lang.process(sens) + + logging.info('processing questions...') + for question in entry['questions']: + answer = self.answer_question(question, model) + print answer + + def answer_questions(self): + for question in self.questions: + + self.answer_question(question) + +def main(): + logging.basicConfig( + level=__LOGLEVEL__, + format="%(asctime)s : " + + "%(module)s (%(lineno)s) - %(levelname)s - %(message)s") + cfg_file = sys.argv[1] if len(sys.argv) > 1 else None + cfg = get_cfg(cfg_file) + qa = QuestionAnswerer(cfg) + qa.run() + +if __name__ == "__main__": + main() diff --git a/src/fourlang/text_to_4lang.py b/src/fourlang/text_to_4lang.py index e9df4f1..8cc2f8a 100644 --- a/src/fourlang/text_to_4lang.py +++ b/src/fourlang/text_to_4lang.py @@ -59,7 +59,8 @@ def main(): sens = sens[:max_sens] words_to_machines = text_to_4lang.process(sens, print_deps=True) - print_text_graph(words_to_machines, cfg.get('machine', 'graph_dir')) + fn = print_text_graph(words_to_machines, cfg.get('machine', 'graph_dir')) + logging.info('wrote graph to {0}'.format(fn)) if __name__ == "__main__": main() diff --git a/src/fourlang/utils.py b/src/fourlang/utils.py index 6c1b694..5126371 100644 --- a/src/fourlang/utils.py +++ b/src/fourlang/utils.py @@ -20,6 +20,7 @@ def print_text_graph(words_to_machines, graph_dir): fn = os.path.join(graph_dir, 'text.dot') with open(fn, 'w') as f: f.write(graph.to_dot().encode('utf-8')) + return fn def print_4lang_graphs(words_to_machines, graph_dir): for word, machine in words_to_machines.iteritems(): diff --git a/input/clef_qa_sample.xml b/test/input/clef_qa_sample.xml similarity index 100% rename from input/clef_qa_sample.xml rename to test/input/clef_qa_sample.xml