In [1]:
from __future__ import  print_function, division

import nltk
from codecs import open
import os
from os.path import join

DATA_DIR = join(os.environ['HOME'], 'data', 'allen-ai-challenge')
TRAINING_SET = join(DATA_DIR, 'training_set.tsv')
TRAINING_SET_MERGED = join(DATA_DIR, 'training_set_merged.tsv')

VALIDATION_SET = join(DATA_DIR, 'validation_set.tsv')
VALIDATION_SET_MERGED = join(DATA_DIR, 'validation_set_merged.tsv')

In [2]:
def is_statement_finished(*statements):
    return all([a[-1] in '.?' for a in statements])

def extract_np(result):
    for s in result.subtrees(filter=lambda x: x.label()=='NP'):
        return [L[0] for L in s.leaves()]
    
def normalize_sentences(sents):
    ws = []
    for s in sents:
        ws.extend(nltk.word_tokenize(s.lower()))
    return ' '.join(ws)


def fix_question(txt_q):
    if txt_q.find('.Which'):
        txt_q = txt_q.replace('.Which', '. Which')
    return txt_q

In [5]:
which_grammar = r"""
NP:
{<WDT><.*>*}          # Chunk everything
}<V.?[^G]?|MD|\.>+{      # Chink sequences of V.*"""

what_grammar = r"""
NP:
{<WP|WDT><.*>*}          # Chunk everything
}<V.?[^G]?|MD|\.>+{      # Chink sequences of V.*"""

def merge_qa(question, answers):
        sents = nltk.sent_tokenize(fix_question(question))
        
        new_q = ''
        if not is_statement_finished(sents[-1]) and is_statement_finished(*answers):
            return [normalize_sentences(sents[:-1] + [sents[-1]+ ' ' + a]) for a in answers]
            
        if '________' in question:
#             assert not is_statement_finished(*answers)
            rc = []
            for a in answers:
                ws = []
                for s in sents:
                    for w in nltk.word_tokenize(s.lower()):
                        if '________' in w:
                            ws.extend(nltk.word_tokenize(a.lower()))
                        else:
                            ws.append(w)
                rc.append(' '.join(ws))
            return rc
        
        if 'which' in sents[-1].lower() and not is_statement_finished(*answers):
            words = nltk.word_tokenize(sents[-1].lower())
            tagged_words = nltk.pos_tag(words)
#             print(tagged_words)
            
            cp = nltk.RegexpParser(which_grammar)
            result = cp.parse(tagged_words)
#             print(result)
            
            np = ' '.join(extract_np(result))
            rc = []
            for a in answers:
                new_last_qs = sents[-1].lower().replace(np, a).replace('?', '.')
                rc.append(normalize_sentences(sents[:-1] + [new_last_qs]))
            return rc
        
        if 'which' in sents[-1].lower() and is_statement_finished(*answers):
            words = nltk.word_tokenize(sents[-1].lower())
            new_last_qs = sents[-1].lower().replace('which', 'that').replace('?', '.')
            return [normalize_sentences(sents[:-1] + [a.lower(), new_last_qs]) for a in answers]
        
        if 'what' in sents[-1].lower() and not is_statement_finished(*answers):
            
            words = nltk.word_tokenize(sents[-1].lower())
            tagged_words = nltk.pos_tag(words)
#             print(tagged_words)
            
            cp = nltk.RegexpParser(what_grammar)
            result = cp.parse(tagged_words)
#             print(result)
            
            np = ' '.join(extract_np(result))
            rc = []
            for a in answers:
                new_last_qs = sents[-1].lower().replace(np, a).replace('?', '.')
                rc.append(normalize_sentences(sents[:-1] + [new_last_qs]))
            return rc
        if 'what' in sents[-1].lower() and is_statement_finished(*answers):
            words = nltk.word_tokenize(sents[-1].lower())
            new_last_qs = sents[-1].lower().replace('what', 'that').replace('?', '.')
            return [normalize_sentences(sents[:-1] + [a.lower(), new_last_qs]) for a in answers]
        
        return [normalize_sentences(sents + [a]) for a in answers]
    
qids = ['100645']
with open(TRAINING_SET, encoding='utf8') as f:
    f.readline()  # skip header
    i = 0
    for line in f:
        qid, q, c, aa, ab, ac, ad = line.strip().split('\t')
        if qid not in qids:
            continue
        print(q)
        print([aa, ab, ac, ad])
        print(merge_qa(q, [aa, ab, ac, ad])[0])

Paper chromatography is a process used to separate mixtures of substances into their components. The components are carried by a mobile phase through a stationary phase made of absorbent paper. An investigation analyzed a sample of black ink to determine its components.Which property allows the components to separate?
[u'the solubility of the components in the mobile phase', u'the evaporation rate of the components at a certain temperature', u'the magnetic property of the components', u'the thickness of the paper used as the stationary phase']
paper chromatography is a process used to separate mixtures of substances into their components . the components are carried by a mobile phase through a stationary phase made of absorbent paper . an investigation analyzed a sample of black ink to determine its components . the solubility of the components in the mobile phase allows the components to separate .


In [6]:
tests = {}
# when question is finishing directly with answers
tests['100064'] = 'trees most likely change the environment in which they are located by releasing nitrogen in the soil .'
# substitue _______ with answers
tests['100192'] = 'an inherited trait is determined by a single gene .'
tests['100306'] = 'the golgi apparatus is the structure responsible for modifying proteins , packaging proteins into vesicles , and transporting them to the plasma membrane for secretion .'
# simple which statements
tests['100002'] = 'smelling the air for odors describes a learned behavior in a dog .'
tests['100004'] = 'the symptoms of the disease is a distinction between an epidemic and a pandemic .'
tests['100007'] = 'water should a student apply to the skin if he or she gets splashed with an acid .'
tests['100009'] = 'tension has the greatest effect on aiding the movement of blood through the human body .'
tests['100016'] = 'helium is likely to be found in an organic compound .'
tests['100022'] = 'mitochondrion allows nutrients to pass into cells .'
tests['100030'] = 'some ancient greeks tried to discover the laws of the universe through thought and logic . analysis were these scientists missing .'
tests['100066'] = 'blood pressure is often used as an indicator of cardiovascular health . blood pressure is most often measured in cmhg .'
tests['100068'] = 'frost wedging occurs when rocks are broken into smaller pieces by water freezing and expanding in the cracks of the rock . frost wedging is considered a part of weathering .'
# "which" in question, but answers are complete
tests['100017'] = 'solid materials absorb seismic waves . that statement describes a principle scientists have used to learn more about the structure of earth \'s interior .'
tests['100051'] = 'a family owns a vacation cabin located on a hillside below a gas station with a leaking gasoline storage tank . water is pumped to the cabin from a distant reservoir . in that situation is the drinking water for the cabin most likely to be contaminated .'
# "what" in question, answer is not complete
tests['100001'] = 'when athletes begin to exercise , their heart rates and respiration rates increase . at at the tissue level does the human body coordinate these functions .'
tests['100167'] = ''
# "what" in question, answer is complete
tests['100019'] = 'robots can perform tasks that are dangerous for humans . the assembly pieces must be very small . that is the major limitation to the use of robots .'
tests['100071'] = 'a plant that grows red flowers was crossed with the same kind of plant that grows white flowers . their offspring grew pink flowers . the offspring experienced a genetic mutation . that best explains why the offspring grew pink flowers .'
# "why" and others are handled in same way. Answers are appended to the question.
tests['100096'] = 'why is competition among males during mating season important in some animal species ? it ensures that genes from the fittest animals are passed on .'
# tests['100051'] = ''

In [7]:
# run tests
qids = tests.keys()
# qids = '100051'

with open(TRAINING_SET, encoding='utf8') as f:
    f.readline()  # skip header
    i = 0
    for line in f:
        qid, q, c, aa, ab, ac, ad = line.strip().split('\t')
        if qid not in qids:
            continue        
        merged = merge_qa(q, [aa, ab, ac, ad])
        if tests.get(qid, '') != merged[0]:
            print(line)
            print(merged[0])
            print('-'*50)
        else:
            print('%s is ok' % qid)
        line = nltk.pos_tag(line.split())

100001 is ok
100002 is ok
100004 is ok
100007 is ok
100009 is ok
100016 is ok
100017 is ok
100019 is ok
100022 is ok
100030 is ok
100051 is ok
100064 is ok
100066 is ok
100068 is ok
100071 is ok
100096 is ok
100167	An air mass in a valley travels up a mountainside. What causes the movement of this air?	C	tidal pull of the moon's gravity	evaporation of water from soil in the valley	warming by solar energy re-radiated from the ground	cooling effect of ice crystals in the air over the mountain

an air mass in a valley travels up a mountainside . tidal pull of the moon 's gravity causes the movement of this air .
--------------------------------------------------
100192 is ok
100306 is ok


In [8]:
grammar = r"""
  NP:
    {<WDT><.*>*}          # Chunk everything
    }<V.?[^G]?|MD|\.>+{      # Chink sequences of VBD and IN
  """
sentence = [(u'blood', 'NN'), (u'pressure', 'NN'), (u'is', 'VBZ'), (u'most', 'RBS'), (u'often', 'RB'), (u'measured', 'VBN'), (u'in', 'IN'), (u'which', 'WDT'), (u'of', 'IN'), (u'the', 'DT'), (u'following', 'VBG'), (u'units', 'NNS'), (u'?', '.')]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))

(S
  blood/NN
  pressure/NN
  is/VBZ
  most/RBS
  often/RB
  measured/VBN
  in/IN
  (NP which/WDT of/IN the/DT following/VBG units/NNS)
  ?/.)


In [11]:
%%time
with open(TRAINING_SET, encoding='utf8') as f:
    f.readline()  # skip header
    with open(TRAINING_SET_MERGED, mode='w', encoding='utf8') as fo:
        for line in f:
            qid, q, c, aa, ab, ac, ad = line.strip().split('\t')
#             if int(qid) < 101767:
#                 continue
            try:
                merged = merge_qa(q, [aa, ab, ac, ad])
                print('\t'.join([qid, c, q] + merged), file=fo)
            except Exception as ex:
                print(qid, ex)
#             print(qid, q, '???', aa)
#             print(merged[0])
#             print('-'*50)

CPU times: user 23min 56s, sys: 15.5 s, total: 24min 12s
Wall time: 24min 12s


In [12]:
%%time
with open(VALIDATION_SET, encoding='utf8') as f:
    f.readline()  # skip header
    with open(VALIDATION_SET_MERGED, mode='w', encoding='utf8') as fo:
        for line in f:
            qid, q, aa, ab, ac, ad = line.strip().split('\t')
            try:
                merged = merge_qa(q, [aa, ab, ac, ad])
                print('\t'.join([qid, c, q] + merged), file=fo)
            except Exception as ex:
                print(qid, ex)

CPU times: user 42min 40s, sys: 23.7 s, total: 43min 4s
Wall time: 43min 5s
