Reference: https://github.com/adityasarvaiya/Automatic_Question_Generation/

In [48]:
import json
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import nltk
import math
import string
import operator
import sys
import collections
from itertools import islice
from nltk.parse import CoreNLPParser
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [31]:
# Parser client pointed at http://localhost:9000; presumably a CoreNLP server
# has to be listening there before any parse call below is made -- TODO confirm.
parser = CoreNLPParser(url='http://localhost:9000')
#list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))

In [5]:
# Read the SQuAD dev set. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('squad-dev-v1.1.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
# `data` is rebound from the parsed JSON to a frame built from it; `df` is a
# frame built from its 'data' column, which the following cell indexes by article.
data = pd.DataFrame.from_dict(data)
df = pd.DataFrame.from_dict(data['data'])

In [6]:
# Read the annotation file. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('squad.json', encoding='utf-8') as json_file:
    annotated = json.load(json_file)
# `annotated` is rebound from the parsed JSON to a frame built from it.
annotated = pd.DataFrame.from_dict(annotated)

In [7]:
# Attach the SQuAD topic, paragraph, question and first answer to every
# annotated row by matching its `original_id` against the SQuAD question ids.
ids = []
# These five lists are never filled in this cell; the names are kept only so
# that any other cell referring to them still finds them.
topics = []
paragraphs = []
questions = []
answers = []
answer_starts = []

# Index every question once by id. This covers all articles in `df` (no
# hard-coded article count) and avoids rescanning the dataset for each row.
qa_by_id = {}
for entry in df['data']:
    for para in entry['paragraphs']:
        for qa in para['qas']:
            qa_by_id[qa['id']] = (entry['title'], para['context'], qa)

for a in annotated.index:
    match = qa_by_id.get(annotated.at[a, 'original_id'])
    if match is None:
        # No SQuAD question carries this id; the row keeps its missing values.
        continue
    title, context, qa = match
    first_answer = qa['answers'][0]
    annotated.at[a, 'topic'] = title
    annotated.at[a, 'paragraph'] = context
    annotated.at[a, 'question'] = qa['question']
    annotated.at[a, 'answer'] = first_answer['text']
    annotated.at[a, 'answer_start'] = first_answer['answer_start']
    ids.append(qa['id'])

# Flatten the first annotation of every row into top-level columns.
first_annotations = [row[0] for row in annotated['annotations']]
skills = [annotation['skills'] for annotation in first_annotations]
sent_ind = [annotation['sents_indices'] for annotation in first_annotations]
skill_count = [annotation['skill_count'] for annotation in first_annotations]
nonsense = [annotation['nonsense'] for annotation in first_annotations]
annotated['skills'] = skills
annotated['sent_indices'] = sent_ind
annotated['skill_count'] = skill_count
annotated['nonsense'] = nonsense

### Sample Paragraph from SQuAD

In [9]:
# Paragraph stored under index label 21; later cells reuse it as the running example.
test_paragraph = annotated['paragraph'][21]
test_paragraph

'Consultant pharmacy practice focuses more on medication regimen review (i.e. "cognitive services") than on actual dispensing of drugs. Consultant pharmacists most typically work in nursing homes, but are increasingly branching into other institutions and non-institutional settings. Traditionally consultant pharmacists were usually independent business owners, though in the United States many now work for several large pharmacy management companies (primarily Omnicare, Kindred Healthcare and PharMerica). This trend may be gradually reversing as consultant pharmacists begin to work directly with patients, primarily because many elderly people are now taking numerous medications but continue to live outside of institutional settings. Some community pharmacies employ consultant pharmacists and/or provide consulting services.'

### Pre-process

In [10]:
def clean_sentences(paragraph):
    """Normalise a paragraph and return its sentences as lists of stemmed tokens.

    Digits and every punctuation character except the period are removed,
    the text is lower-cased and runs of whitespace are collapsed. The result
    is split on '.', and each piece is tokenised, stripped of English stop
    words and Porter-stemmed.

    Note: splitting on '.' also splits abbreviations such as "i.e.", so one
    written sentence can yield several entries.

    Args:
        paragraph (str): raw paragraph text.
    Return:
        dict: running sentence id (0, 1, ...) -> list of stemmed tokens.
        Pieces with no tokens left after filtering are skipped and do not
        consume an id.
    """
    # Everything in string.punctuation except the period is dropped.
    dropped_punctuation = set(string.punctuation) - {'.'}
    paragraph = ''.join(
        ch for ch in paragraph
        if not ch.isdigit() and ch not in dropped_punctuation)
    # Lower-case character by character, as before.
    paragraph = ''.join(ch.lower() for ch in paragraph)
    # Collapse every run of whitespace into a single space.
    paragraph = ' '.join(paragraph.split())

    # A set gives constant-time membership tests for the per-token filter.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    tokenize = nltk.word_tokenize

    sentence_processed = {}
    next_id = 0
    for piece in paragraph.split('.'):
        tokens = [stemmer.stem(word) for word in tokenize(piece.strip())
                  if word not in stop_words]
        if tokens:
            sentence_processed[next_id] = tokens
            next_id += 1

    return sentence_processed
cleaned_data = clean_sentences(test_paragraph)
# Preview only the first two cleaned sentences.
first2pairs = dict(islice(cleaned_data.items(), 2))
print(first2pairs)

{0: ['consult', 'pharmaci', 'practic', 'focus', 'medic', 'regimen', 'review'], 1: ['e']}


### Sentence Selection

In [11]:
def word_distribution(sentence_processed):
    """Return each word's relative frequency over all processed sentences.

    Args:
        sentence_processed: dict of sentence id -> list of words.
    Return:
        collections.defaultdict(int) mapping each word to its count divided
        by the total number of words across all sentences.
    """
    frequencies = collections.defaultdict(int)
    # Tally every occurrence of every word.
    for words in sentence_processed.values():
        for token in words:
            frequencies[token] += 1

    # Total number of words seen, as a float so the division below is real.
    total_words = float(sum(frequencies.values()))
    for token in frequencies:
        frequencies[token] = frequencies[token] / total_words

    return frequencies
word_distr = word_distribution(cleaned_data)
# Preview only the first two word probabilities.
first2pairs = dict(islice(word_distr.items(), 2))
print(first2pairs)

{'consult': 0.07792207792207792, 'pharmaci': 0.03896103896103896}


In [12]:
def sentence_weight(word_distribution, sentence_processed):
    """Score each sentence by the mean probability of its words.

    Args:
        word_distribution: mapping of word -> probability in the document.
        sentence_processed: dict of sentence id -> list of words, as
            produced by clean_sentences.
    Return:
        list of (sentence_id, weight) tuples sorted by weight, highest
        first. A sentence with no words gets weight 0.0.
    """
    weights = {}
    for sentence_id, words in sentence_processed.items():
        if not words:
            # Nothing to average; avoid dividing by zero.
            weights[sentence_id] = 0.0
            continue
        # .get() treats an unknown word as probability 0 without inserting it
        # into the mapping, and a zero term no longer resets the running sum.
        total = sum(word_distribution.get(word, 0) for word in words)
        weights[sentence_id] = total / float(len(words))

    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)
# NOTE(review): this rebinds the name `sentence_weight` from the function to
# its result list, so the function cannot be called again until its
# definition is re-run.
sentence_weight = sentence_weight(word_distr,cleaned_data)
# Sentences with higher weight are sentences with more frequently seen words
sentence_weight

[(6, 0.03607503607503607),
 (0, 0.02782931354359926),
 (3, 0.027154663518299885),
 (5, 0.0218417945690673),
 (4, 0.021251475796930347),
 (2, 0.015584415584415586),
 (1, 0.012987012987012988)]

In [13]:
def topically_important_sentence(sentence_weight, paragraph, ratio=0.05):
    """Select the topically important sentences of a paragraph.

    Args:
        sentence_weight: list of (sentence_id, weight) tuples sorted by
            weight, highest first, as returned by sentence_weight.
        paragraph (str): raw paragraph text the weights were computed from.
        ratio (float): fraction of the weighted sentences to keep, rounded
            up. Defaults to 0.05.
    Return:
        collections.OrderedDict of sentence_id -> raw sentence text, in
        descending weight order.
    Raises:
        KeyError: if a selected id has no matching raw sentence.
    """
    # Number of sentences to retain.
    num_sentences_selected = int(math.ceil(float(ratio) * len(sentence_weight)))
    selected_keys = [key for key, _ in sentence_weight[:num_sentences_selected]]

    # Number the raw sentences. Whitespace-only pieces are skipped because
    # clean_sentences assigns them no id either; counting them here would
    # shift every later id.
    raw_sentences = [piece for piece in paragraph.split('.') if piece.strip()]
    sentences_dict = dict(enumerate(raw_sentences))

    sentences_selected = collections.OrderedDict()
    for key in selected_keys:
        sentences_selected[key] = sentences_dict[key]

    return sentences_selected
# `sentence_weight` here is the ranked list produced by the previous cell.
important_sentences = topically_important_sentence(sentence_weight, test_paragraph)
# Raw text of the highest-weighted sentences, keyed by sentence id
important_sentences

OrderedDict([(6,
              ' Some community pharmacies employ consultant pharmacists and/or provide consulting services')])

### Gap Selection

In [34]:
# Take the first (highest-weighted) selected sentence instead of a hard-coded
# sentence id, so the cell keeps working when the paragraph changes.
selected_sent = next(iter(important_sentences.values()))
print(selected_sent)
select_sent_tree = parser.raw_parse(selected_sent)

 Some community pharmacies employ consultant pharmacists and/or provide consulting services


In [35]:
# Materialise the parser's result to display the parse tree(s) of the selected sentence.
list(parser.raw_parse(selected_sent))

[Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['Some']), Tree('NN', ['community']), Tree('NNS', ['pharmacies'])]), Tree('VP', [Tree('VP', [Tree('VBP', ['employ']), Tree('NP', [Tree('NN', ['consultant']), Tree('NNS', ['pharmacists'])])]), Tree('CC', ['and/or']), Tree('VP', [Tree('VBP', ['provide']), Tree('S', [Tree('VP', [Tree('VBG', ['consulting']), Tree('NP', [Tree('NNS', ['services'])])])])])])])])]

In [29]:
def extract_gaps(sentence, tree, max_entities=7):
    """Build fill-in-the-blank candidates from the NP/ADJP phrases of a parse.

    Only the first parse yielded by `tree` is used.

    - Args:
        sentence(str): current sentence
        tree(iterable): Tree objects for the sentence, e.g. the result of
            parser.raw_parse
        max_entities(int): sentences with more NP/ADJP phrases than this are
            rejected. Defaults to 7.
    - Returns:
        candidates(list of dict): candidate questions for this sentence, e.g.
        [{'Sentence': ..., 'Question': 'the capital city of NL is _____',
          'Answer': 'Amsterdam'}]; an empty list when there is no parse;
        False when the sentence has more than max_entities phrases.
    """
    gap_labels = ('NP', 'ADJP')
    first_parse = next(iter(tree), None)
    if first_parse is None:
        # No parse available: nothing to extract.
        return []

    entities = list(first_parse.subtrees(
        filter=lambda node: node.label() in gap_labels))
    print('entities: ' + str(entities))
    if len(entities) > max_entities:
        return False

    candidates = []
    for entity in entities:
        candidate_gap = str(' '.join(entity.leaves()))
        print('candidate_gap: ' + str(candidate_gap))
        # The parser's tokens joined by spaces do not always occur verbatim
        # in the sentence; without a match there would be no blank to fill.
        if not candidate_gap or candidate_gap not in sentence:
            continue
        # Blanking out the whole sentence leaves no question.
        if candidate_gap.strip() == sentence.strip():
            continue
        candidates.append({
            'Sentence': sentence,
            'Question': sentence.replace(candidate_gap, '_____'),
            'Answer': candidate_gap,
        })
    return candidates

def get_candidates(sentences):
    """Parse every sentence and collect its gap-fill candidates.

    - Args:
        sentences(dict): topically important sentences, id -> text
    - Returns:
        candidates(list of dict): e.g.
        [{'Sentence': ..., 'Question': ..., 'Answer': ...}, ...]
    """
    collected = []
    for text in sentences.values():
        # extract_gaps returns False for sentences it rejects.
        gaps = extract_gaps(text, parser.raw_parse(text))
        if gaps is False:
            continue
        collected.extend(gaps)
        print("building candidate question/answer pairs %d" % len(collected))
    return collected

### Sample Fill In The Blank Questions

In [30]:
# Build fill-in-the-blank candidates for every selected sentence.
get_candidates(important_sentences)

entities: [Tree('NP', [Tree('DT', ['Some']), Tree('NN', ['community']), Tree('NNS', ['pharmacies'])]), Tree('NP', [Tree('NN', ['consultant']), Tree('NNS', ['pharmacists'])]), Tree('NP', [Tree('NNS', ['services'])])]
candidate_gap: Some community pharmacies
candidate_gap: consultant pharmacists
candidate_gap: services
building candidate question/answer pairs 3


[{'Sentence': ' Some community pharmacies employ consultant pharmacists and/or provide consulting services',
  'Question': ' _____ employ consultant pharmacists and/or provide consulting services',
  'Answer': 'Some community pharmacies'},
 {'Sentence': ' Some community pharmacies employ consultant pharmacists and/or provide consulting services',
  'Question': ' Some community pharmacies employ _____ and/or provide consulting services',
  'Answer': 'consultant pharmacists'},
 {'Sentence': ' Some community pharmacies employ consultant pharmacists and/or provide consulting services',
  'Question': ' Some community pharmacies employ consultant pharmacists and/or provide consulting _____',
  'Answer': 'services'}]

### Actual Question Generation

In [53]:
# Display the paragraph stored under index label 30.
annotated['paragraph'][30]

'As a member of the Scottish Parliamentary Corporate Body, the Presiding Officer is responsible for ensuring that the Parliament functions effectively and has the staff, property and resources it requires to operate. Convening the Parliamentary Bureau, which allocates time and sets the work agenda in the chamber, is another of the roles of the Presiding Officer. Under the Standing Orders of the Parliament the Bureau consists of the Presiding Officer and one representative from each political parties with five or more seats in the Parliament. Amongst the duties of the Bureau are to agree the timetable of business in the chamber, establish the number, remit and membership of parliamentary committees and regulate the passage of legislation (bills) through the Parliament. The Presiding Officer also represents the Scottish Parliament at home and abroad in an official capacity.'

In [94]:
# Hard-coded sample sentence used by the entity-recognition cell that follows.
sample_sent = 'As a member of the Scottish Parliamentary Corporate Body, the Presiding Officer is responsible for ensuring that the Parliament functions effectively and has the staff, property and resources it requires to operate.'

In [None]:
#ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
# NLTK: tokenise, POS-tag and named-entity-chunk the sample sentence.
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sample_sent)))
print(ne_tree)
# spaCy: run the loaded en_core_web_sm pipeline on the same sentence.
nlp = en_core_web_sm.load()
sent_entities = nlp(sample_sent)
# NOTE(review): the message says "paragraph" but the input is a single sentence.
print('There are ' + str(len(sent_entities.ents)) + ' entities in this paragraph.')
# Count entities per label, then list the three most common entity texts.
labels = [x.label_ for x in sent_entities.ents]
print(Counter(labels))
items = [x.text for x in sent_entities.ents]
print('These are the most frequent terms: ' + str(Counter(items).most_common(3)))

In [83]:
# Extract chunks from tagged sentence
def tree_to_dict(tree):
    """Collect the chunk subtrees of a chunked tree into a dictionary.

    input : tree whose direct children are either plain items or nltk.Tree
            chunks
    output : dict mapping "Chunk1", "Chunk2", ... (in order of appearance)
             to the first element of each child of the chunk, joined by
             single spaces
    """
    chunks = {}
    for node in tree:
        # Items that are not subtrees are not part of any chunk.
        if not isinstance(node, nltk.Tree):
            continue
        chunk_name = "Chunk" + str(len(chunks) + 1)
        chunks[chunk_name] = " ".join(child[0] for child in node)
    return chunks

In [90]:
def pattern_verb_noun(sent):
    """Find the verb+noun chunks of a sentence.

    input : sentence
    output : list of chunk strings matching one or more verb tags followed
             by one or more noun tags; each one is also printed
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(sent))

    # One or more VB* tags followed by one or more NN* tags.
    chunk_parser = nltk.RegexpParser('Chunk: {<VB.?>+<NN.?>+}')
    chunks = tree_to_dict(chunk_parser.parse(tagged))

    pattern_strings = []
    # tree_to_dict names its entries Chunk1..ChunkN; read them back in order.
    for position in range(1, len(chunks) + 1):
        pattern_string = chunks["Chunk" + str(position)]
        pattern_strings.append(pattern_string)
        print("pattern_string  :  ", str(pattern_string))

    return pattern_strings
#pattern_verb_noun(test_paragraph.split('.')[0])

In [99]:
# For every '.'-separated piece of the test paragraph, turn each verb+noun
# chunk into a "What <verb> ...?" question, once per noun found in the piece.
flag = 0
# The stop-word set is the same for every sentence, so build it once.
stop_words = set(nltk.corpus.stopwords.words('english'))
for sentence in test_paragraph.split('.'):
    print('\nSentence: ' + str(sentence))
    pattern_strings = pattern_verb_noun(sentence)
    tokenized = nltk.word_tokenize(sentence)
    # Remove stopwords before tagging
    filtered = [w for w in tokenized if w not in stop_words]
    tagged = nltk.pos_tag(filtered)
    nouns = []
    questions = []
    for word, pos in tagged:
        # Only NN, NNP and NNPS tokens trigger question generation.
        if pos not in ('NN', 'NNP', 'NNPS'):
            continue
        for pattern_string in pattern_strings:
            nouns.append(word)
            # Words of the chunk not collected as nouns so far; the first is
            # used as the verb of the question.
            verb = [w for w in pattern_string.split() if w not in nouns]
            if not verb:
                # Every word of the chunk is already a noun: no verb to ask
                # with, so skip rather than fail on verb[0].
                continue
            full_ques = sentence.replace(str(pattern_string), '')
            full_ques = "What " + str(verb[0]) + " " + str(full_ques).lower() + "?"
            questions.append(full_ques)
            flag = 1
    print(questions)
               
        



Sentence: Consultant pharmacy practice focuses more on medication regimen review (i
[]

Sentence: e
[]

Sentence:  "cognitive services") than on actual dispensing of drugs
[]

Sentence:  Consultant pharmacists most typically work in nursing homes, but are increasingly branching into other institutions and non-institutional settings
pattern_string  :   nursing homes
[]

Sentence:  Traditionally consultant pharmacists were usually independent business owners, though in the United States many now work for several large pharmacy management companies (primarily Omnicare, Kindred Healthcare and PharMerica)
[]

Sentence:  This trend may be gradually reversing as consultant pharmacists begin to work directly with patients, primarily because many elderly people are now taking numerous medications but continue to live outside of institutional settings
[]

Sentence:  Some community pharmacies employ consultant pharmacists and/or provide consulting services
pattern_string  :   employ consultant p

### nltk.ne_chunk() 
With the function nltk.ne_chunk(), we can recognize named entities using a classifier. The classifier adds category labels such as PERSON, ORGANIZATION, and GPE.

In [117]:
# Text before the first '.' of the paragraph stored under index label 20.
test_sent = annotated['paragraph'][20].split('.')[0]
# Tokenise, POS-tag and named-entity-chunk it, then print the resulting tree.
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(test_sent)))
print(ne_tree)

(S
  In/IN
  (ORGANIZATION Ancient/NNP Greece/NNP)
  ,/,
  (PERSON Diocles/NNP)
  of/IN
  (GPE Carystus/NNP)
  (/(
  4th/JJ
  century/NN
  BC/NNP
  )/)
  was/VBD
  one/CD
  of/IN
  several/JJ
  men/NNS
  studying/VBG
  the/DT
  medicinal/JJ
  properties/NNS
  of/IN
  plants/NNS)


### SpaCy

In [123]:
# NOTE(review): these four imports repeat ones already in the notebook's first cell.
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline used by the cells below.
nlp = en_core_web_sm.load()

In [127]:
# Extract named entities from the paragraph stored under index label 20
article = nlp(annotated['paragraph'][20])
print('There are ' + str(len(article.ents)) + ' entities in this paragraph.')
# Count entities per label, then list the three most common entity texts.
labels = [x.label_ for x in article.ents]
print(Counter(labels))
items = [x.text for x in article.ents]
print('These are the most frequent terms: ' + str(Counter(items).most_common(3)))
print('\n')
# Print the paragraph itself for comparison with the entities above.
print(annotated['paragraph'][20])

There are 13 entities in this paragraph.
Counter({'ORG': 4, 'CARDINAL': 2, 'NORP': 2, 'GPE': 1, 'ORDINAL': 1, 'PERSON': 1, 'LOC': 1, 'DATE': 1})
These are the most frequent terms: [('Greece', 1), ('Diocles of Carystus', 1), ('4th', 1)]


In Ancient Greece, Diocles of Carystus (4th century BC) was one of several men studying the medicinal properties of plants. He wrote several treatises on the topic. The Greek physician Pedanius Dioscorides is famous for writing a five volume book in his native Greek Περί ύλης ιατρικής in the 1st century AD. The Latin translation De Materia Medica (Concerning medical substances) was used a basis for many medieval texts, and was built upon by many middle eastern scientists during the Islamic Golden Age. The title coined the term materia medica.


In [130]:
# Render the entity highlighting of each sentence of the paragraph in turn.
sentences = list(article.sents)
for sent in sentences:
    displacy.render(nlp(str(sent)), jupyter=True, style='ent')

  "__main__", mod_spec)


  "__main__", mod_spec)


In [129]:
# Render the first sentence with displacy's 'dep' style.
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})