In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from nltk import RegexpTokenizer, pos_tag, sent_tokenize, word_tokenize
from pandas.io.json import json_normalize

import json
import pandas as pd
import re

In [3]:
data = []
with open('dataset/SQuAD/train-v2.0.json') as f:
    json_data = json.load(f)['data']

    for i in range(len(json_data)):
        json_data_i = json_data[i]['paragraphs']
        
        for j in range(1):
            paragraph = json_data_i[j]['context']
            
            data.append([sentence for sentence in sent_tokenize(paragraph)])

# Sentence Selection

### Sentence importance sorting

In [4]:
idx = 131

for i in range(len(data[idx])):
    print(i, data[idx][i])

0 Beginning in 1689, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian War (1754–63) when Britain was victorious over all the French colonies in North America.
1 This final war was to give thousands of colonists, including Virginia colonel George Washington, military experience which they put to use during the American Revolutionary War.


In [5]:
from sentence_selection import get_ranked_sentences

get_ranked_sentences(' '.join(data[idx]))

[1, 0]

# Sentence Parsing

In [8]:
from stanford_parser import StanfordParser

parser = StanfordParser()

sentence = data[idx][0]
sentence_tree = list(parser.parse(sentence))
print(list(map(lambda x: print(x), sentence_tree)))

(ROOT
  (S
    (PP (VBG Beginning) (PP (IN in) (NP (CD 1689))))
    (, ,)
    (NP (DT the) (NNS colonies))
    (VP
      (VBD became)
      (NP
        (NP
          (ADJP
            (VBN involved)
            (PP
              (IN in)
              (NP
                (NP (DT a) (NN series))
                (PP
                  (IN of)
                  (NP
                    (NP (NNS wars))
                    (PP
                      (IN between)
                      (NP
                        (NNP Great)
                        (NNP Britain)
                        (CC and)
                        (NNP France)))))
                (PP
                  (IN for)
                  (NP
                    (NP (NN control))
                    (PP
                      (IN of)
                      (NP
                        (NP (NNP North) (NNP America))
                        (, ,)
                        (SBAR
                          (WHNP
                            (WHNP 

# NER

In [9]:
from ner.NER import NER

ner = NER('models/ner_model.pkl')

Load Model Success


In [10]:
ner_classes = ner.predict_class_text(sentence)

sentence_words = RegexpTokenizer(r'\w+').tokenize(sentence)
sentence_ner_classes = []
for word, ner_class in zip(sentence_words, ner_classes[0]):
    sentence_ner_classes.append((word, ner_class))

sentence_ner_classes

[('Beginning', 'O'),
 ('in', 'O'),
 ('1689', 'O'),
 ('the', 'O'),
 ('colonies', 'O'),
 ('became', 'O'),
 ('involved', 'O'),
 ('in', 'O'),
 ('a', 'O'),
 ('series', 'O'),
 ('of', 'O'),
 ('wars', 'O'),
 ('between', 'O'),
 ('Great', 'O'),
 ('Britain', 'B-LOC'),
 ('and', 'O'),
 ('France', 'B-LOC'),
 ('for', 'O'),
 ('control', 'O'),
 ('of', 'O'),
 ('North', 'B-LOC'),
 ('America', 'I-LOC'),
 ('the', 'O'),
 ('most', 'O'),
 ('important', 'O'),
 ('of', 'O'),
 ('which', 'O'),
 ('were', 'O'),
 ('Queen', 'O'),
 ('Anne', 'B-PER'),
 ('s', 'I-PER'),
 ('War', 'O'),
 ('in', 'O'),
 ('which', 'O'),
 ('the', 'B-MISC'),
 ('British', 'O'),
 ('conquered', 'B-MISC'),
 ('French', 'O'),
 ('colony', 'O'),
 ('Acadia', 'O'),
 ('and', 'O'),
 ('the', 'O'),
 ('final', 'B-MISC'),
 ('French', 'O'),
 ('and', 'B-MISC'),
 ('Indian', 'I-MISC'),
 ('War', 'O'),
 ('1754', 'O'),
 ('63', 'B-LOC'),
 ('when', 'O'),
 ('Britain', 'O'),
 ('was', 'O'),
 ('victorious', 'O'),
 ('over', 'O'),
 ('all', 'B-MISC'),
 ('the', 'O'),
 ('French'

## Extract named entities

In [11]:
named_entities = []
named_entities_dict = {}
named_entity = []
prev_ner_class = ['', '']

for (word, ner_class) in sentence_ner_classes:
    if ner_class == 'O':
        if named_entity:
            named_entities.append((named_entity, prev_ner_class[1]))
            named_entity = []
        continue
        
    n = ner_class.split('-')
    if n[0] == 'B':
        named_entity.append(word)
    elif n[0] == 'I' and prev_ner_class[1] == n[1]:
        named_entity.append(word)
        
    if n[1] in named_entities_dict:
        named_entities_dict[n[1]].append(word)
    else:
        named_entities_dict[n[1]] = [word]
    
    prev_ner_class = n
    
if named_entity:
    named_entities.append((named_entity, prev_ner_class[1]))
    
print(named_entities)
print(named_entities_dict)

[(['Britain'], 'LOC'), (['France'], 'LOC'), (['North', 'America'], 'LOC'), (['Anne', 's'], 'PER'), (['the'], 'MISC'), (['conquered'], 'MISC'), (['final'], 'MISC'), (['and', 'Indian'], 'MISC'), (['63'], 'LOC'), (['all'], 'MISC'), (['colonies', 'in'], 'LOC')]
{'LOC': ['Britain', 'France', 'North', 'America', '63', 'colonies', 'in'], 'PER': ['Anne', 's'], 'MISC': ['the', 'conquered', 'final', 'and', 'Indian', 'all']}


# Gap Formulation

In [12]:
entities = next(iter(map(lambda x: list(x.subtrees(filter=lambda x: x.label() in ['NP', 'ADJP'])), sentence_tree)))
list(map(lambda x: print(x), entities))

(NP (CD 1689))
(NP (DT the) (NNS colonies))
(NP
  (NP
    (ADJP
      (VBN involved)
      (PP
        (IN in)
        (NP
          (NP (DT a) (NN series))
          (PP
            (IN of)
            (NP
              (NP (NNS wars))
              (PP
                (IN between)
                (NP (NNP Great) (NNP Britain) (CC and) (NNP France)))))
          (PP
            (IN for)
            (NP
              (NP (NN control))
              (PP
                (IN of)
                (NP
                  (NP (NNP North) (NNP America))
                  (, ,)
                  (SBAR
                    (WHNP
                      (WHNP (DT the) (RBS most) (JJ important))
                      (WHPP (IN of) (WHNP (WDT which))))
                    (S
                      (VP
                        (VBD were)
                        (NP
                          (NP
                            (NP (NNP Queen) (NNP Anne) (POS 's))
                            (NN War))
          

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

# Gap Creation

In [13]:
from util.boyer_moore import find

In [15]:
candidates = []
for entity in entities:
    has_ner = False
    has_broken_ner = False
    leaves = entity.leaves()
    if len(leaves) > 10:
        continue
    
#     i = find(sentence_words, leaves)
    
#     for leaf in leaves:
#         if sentence_ner_classes[i][1] == 'O':
#             i += 1
#             continue
        
#         is_subset = False
#         for ne, _ in named_entities:
#             if find(leaves, ne) != -1:
#                 is_subset = True
#                 break
                
#         if is_subset:
#             has_ner = True
#             has_broken_ner = False
#         else:
#             has_broken_ner = True
#         break
        
#     if has_broken_ner:
#         continue
    
    # Accept only candidates with NER
#     if not has_ner:
#         continue
    
    candidate_gap = str(' '.join(entity.leaves()))
    gapped_sentence = sentence.replace(candidate_gap, '_____')
    
    if candidate_gap.strip() != sentence.strip():
        candidates.append({
            'Sentence': sentence,
            'Question': gapped_sentence,
            'Answer': candidate_gap
        })

print("Sentence: {}\n".format(sentence))
for candidate in candidates:
    print("Q: {}\nA: {}\n".format(candidate['Question'], candidate['Answer']))

Sentence: Beginning in 1689, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian War (1754–63) when Britain was victorious over all the French colonies in North America.

Q: Beginning in _____, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian War (1754–63) when Britain was victorious over all the French colonies in North America.
A: 1689

Q: Beginning in 1689, _____ became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian W

# Question Formation

In [17]:
import nltk

def tree_to_dict(tree):
    tree_dict = dict()
    chunk_count = 0
    for st in (tree):
        input_chunked = ""
        if isinstance(st, nltk.Tree):
            input_chunked = ""
            for d in range(len(st)):
                if (d + 1) == len(st):
                    input_chunked = input_chunked + st[d][0]
                else:
                    input_chunked = input_chunked + st[d][0] + " "
            chunk_count +=1
            tree_dict["Chunk" + str(chunk_count)] = input_chunked
    return tree_dict

def pattern_verb_noun(tagged_sentence):
    chunkGram = 'Chunk: {<VB.?>+<NN.?>+}'
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged_sentence)
    chunk = tree_to_dict(chunked)
    
    pattern_strings = [chunk["Chunk" + str(i + 1)] for i, c in enumerate(chunk)]
    return pattern_strings

['were Queen Anne s War']

In [18]:
def catch_pos(catch_list, tagged):
    return [word for word, pos in tagged if pos in catch_list]

def pattern_verb_dt_adj_noun(tagged_sentence, named_entities_dict):
    verbs = catch_pos(['VB','VBD','VBG','VBN','VBP','VBZ'], tagged_sentence)
    nouns = catch_pos(['NN','NNP','NNS','NNPS'], tagged_sentence)

    chunkGram = 'Chunk: {<VB.?>+<DT>?<JJ.?>?<NN.?>+}'
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged_sentence)
    chunk = tree_to_dict(chunked)
    
    pattern_strings = [chunk["Chunk" + str(i + 1)] for i, c in enumerate(chunk)]
    
    return pattern_strings, verbs, nouns

(['were Queen Anne s War', 'conquered French colony Acadia'],
 ['Beginning', 'became', 'involved', 'were', 'conquered', 'was'],
 ['colonies',
  'series',
  'wars',
  'Great',
  'Britain',
  'France',
  'control',
  'North',
  'America',
  'Queen',
  'Anne',
  's',
  'War',
  'colony',
  'Acadia',
  'War',
  'Britain',
  'colonies',
  'North',
  'America'])

In [21]:
from stanford_postagger import StanfordPOSTagger

tagger = StanfordPOSTagger()
tagged = [(word, tagger.tag(word)[0][1]) for word in sentence_words]

questions = []

for candidate in candidates:
    full_ques = candidate['Question']
    sentence = candidate['Sentence']
    answer = candidate['Answer']

    flag = 0
    ans = 0

    pattern_strings = pattern_verb_noun(tagged)

    for word, pos in tagged:
        if ((answer.find(word)) >= 0):
            # Check if blank is in beginning
            if (flag == 0) and ((sentence.find(word)) == 0):
                if (pos in ['NN', 'NNP', 'NNPS']) and ('PER' in named_entities_dict) and (word in named_entities_dict['PER']):
                    full_ques = full_ques.replace("_____" , 'Who')[:-1] + "?"
                    flag = 1
                elif ('LOC' in named_entities_dict) and (word in named_entities_dict['LOC']):
                    full_ques = full_ques.replace("_____" , 'Where')[:-1] + "?"
                    flag = 1
                elif (pos in ['NN', 'NNP', 'NNPS']):
                    full_ques = full_ques.replace("_____" , 'What')[:-1] + "?"
                    flag = 1

            if (flag == 0) and pattern_strings:
                for i, v in enumerate(pattern_strings):
                    if (pos in ['NN', 'NNP', 'NNPS']) and ('PER' in named_entities_dict) and (word in named_entities_dict['PER']):
                        words = v.split()
                        verb = [word for word in words if word not in named_entities_dict['PER']]
                        full_ques = sentence.replace(v, '')
                        full_ques = "What {} {}?".format(verb[0], sentence.replace(v, '').lower())
                        flag = 1

            if flag:
                continue

            pattern_strings, verbs, nouns = pattern_verb_dt_adj_noun(tagged, named_entities_dict)
            if pattern_strings and ('LOC' in named_entities_dict) and (word in named_entities_dict['LOC']):
                for i, v in enumerate(pattern_strings):
                    if v.find(answer) < 0:
                        continue

                    words = v.split()
                    verb = [word for word in words if word in verbs]
                    full_ques = "Where {} {}?".format(verb[0], sentence.replace(v, '').lower())

                    noun = [word for word in words if word in nouns]
                    ps = PorterStemmer()
                    ans = ps.stem(word)

                    flag = 1
    
    questions.append({
        'Sentence': sentence,
        'Question': full_ques,
        'Answer': ans if ans else answer
    })
    
print("Sentence: {}\n".format(sentence))
for candidate in candidates:
    print("Q: {}\nA: {}\n".format(candidate['Question'], candidate['Answer']))

Sentence: Beginning in 1689, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian War (1754–63) when Britain was victorious over all the French colonies in North America.

Q: Beginning in _____, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian War (1754–63) when Britain was victorious over all the French colonies in North America.
A: 1689

Q: Beginning in 1689, _____ became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the British conquered French colony Acadia, and the final French and Indian W