In [12]:
import pandas as pd
from IPython.display import Markdown, display, clear_output

In [13]:
import _pickle as cPickle
import scipy
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

In [14]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')


def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers


def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False



def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1
        
def addWordsForParagrapgh(newWords, text):
    doc = nlp(text)

    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    
    i = 0
    
    while (i < len(doc)):
        
        if (i in neStarts):
            word = neStarts[i]
            
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            0,
                            0,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape])
            i = neStarts[i].end - 1
        
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                0,
                                0,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_])
        i += 1

def oneHotEncodeColumns(df):
    columnsToEncode = ['NER', 'POS', "TAG", 'DEP']

    for column in columnsToEncode:
        one_hot = pd.get_dummies(df[column])
        one_hot = one_hot.add_prefix(column + '_')

        df = df.drop(column, axis = 1)
        df = df.join(one_hot)
    
    return df

In [15]:
def generateDf(text):
    words = []
    addWordsForParagrapgh(words, text)

    wordColums = ['text', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape']
    df = pd.DataFrame(words, columns=wordColums)
    
    return df

In [16]:
def prepareDf(df):
    wordsDf = oneHotEncodeColumns(df)


    
    predictorFeaturesName = 'data/pickles/nb-predictor-features.pkl'
    featureNames = loadPickle(predictorFeaturesName)

    for feature in featureNames:
        if feature not in wordsDf.columns:
            wordsDf[feature] = 0    
                

    columnsToDrop = ['text', 'titleId', 'paragrapghId', 'sentenceId', 'shape', 'isAnswer']
    wordsDf = wordsDf.drop(columnsToDrop, axis = 1)


    return wordsDf

In [17]:
def predictWords(wordsDf, df):
    
    predictorPickleName = 'data/pickles/nb-predictor.pkl'
    predictor = loadPickle(predictorPickleName)
    
    y_pred = predictor.predict_proba(wordsDf)

    labeledAnswers = []
    for i in range(len(y_pred)):
        labeledAnswers.append({'word': df.iloc[i]['text'], 'prob': y_pred[i][0]})
    
    return labeledAnswers

In [18]:
def blankAnswer(firstTokenIndex, lastTokenIndex, sentStart, sentEnd, doc):
    leftPartStart = doc[sentStart].idx
    leftPartEnd = doc[firstTokenIndex].idx
    rightPartStart = doc[lastTokenIndex].idx + len(doc[lastTokenIndex])
    rightPartEnd = doc[sentEnd - 1].idx + len(doc[sentEnd - 1])
    
    question = doc.text[leftPartStart:leftPartEnd] + '_____' + doc.text[rightPartStart:rightPartEnd]
    
    return question


In [19]:
def addQuestions(answers, text):
    doc = nlp(text)
    currAnswerIndex = 0
    qaPair = []

    
    for sent in doc.sents:
        for token in sent:
            
            
            if currAnswerIndex >= len(answers):
                break
            
            answerDoc = nlp(answers[currAnswerIndex]['word'])
            answerIsFound = True
            
            for j in range(len(answerDoc)):
                if token.i + j >= len(doc) or doc[token.i + j].text != answerDoc[j].text:
                    answerIsFound = False
           
            if answerIsFound:
                question = blankAnswer(token.i, token.i + len(answerDoc) - 1, sent.start, sent.end, doc)
                
                qaPair.append({'question' : question, 'answer': answers[currAnswerIndex]['word'], 'prob': answers[currAnswerIndex]['prob']})
                
                currAnswerIndex += 1
                
    return qaPair

In [20]:
def sortAnswers(qaPairs):
    orderedQaPairs = sorted(qaPairs, key=lambda qaPair: qaPair['prob'])
    
    return orderedQaPairs    

In [21]:
import os
import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

glove_file = 'data/embeddings/glove.6B.300d.txt'
tmp_file = 'data/embeddings/word2vec-glove.6B.300d.txt'
model = None

if os.path.isfile(glove_file):
    from gensim.scripts.glove2word2vec import glove2word2vec
    glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
else:
    print("Glove embeddings not found. Please download and place them in the following path: " + glove_file)

  glove2word2vec(glove_file, tmp_file)


In [22]:
def generate_distractors(answer, count):
    answer = str.lower(answer)
     
    try:
        closestWords = model.most_similar(positive=[answer], topn=count)
    except:
        return []

    distractors = list(map(lambda x: x[0], closestWords))[0:count]
    
    return distractors

In [23]:
def addDistractors(qaPairs, count):
    if not model:
        print("Glove embeddings not found." + glove_file)
    
    for qaPair in qaPairs:
        distractors = generate_distractors(qaPair['answer'], count)
        qaPair['distractors'] = distractors
    
    return qaPairs

# Main function

In [24]:
def generateQuestions(text, count):
    
    df = generateDf(text)
    wordsDf = prepareDf(df)
    
    labeledAnswers = predictWords(wordsDf, df)
    
    qaPairs = addQuestions(labeledAnswers, text)
    
    orderedQaPairs = sortAnswers(qaPairs)
    
    questions = addDistractors(orderedQaPairs[:count], 4)
    
    for i in range(count):
        display(Markdown('### Question ' + str(i + 1) + ':'))
        print(questions[i]['question'])

        display(Markdown('#### Answer:'))
        print(questions[i]['answer'])
        
        display(Markdown('#### Incorrect answers:'))
        for distractor in questions[i]['distractors']:
            print(distractor)
        
        print()

In [27]:
# text = "Oxygen is a chemical element with symbol O and atomic number 8. It is a member of the chalcogen group on the periodic table, a highly reactive nonmetal, and an oxidizing agent that readily forms oxides with most elements as well as with other compounds. By mass, oxygen is the third-most abundant element in the universe, after hydrogen and helium. At standard temperature and pressure, two atoms of the element bind to form dioxygen, a colorless and odorless diatomic gas with the formula O2. Diatomic oxygen gas constitutes 20.8% of the Earth's atmosphere. As compounds including oxides, the element makes up almost half of the Earth's crust."
# text = "Hydrogen, denoted by the symbol H and atomic number 1, occupies a prominent place on the periodic table as the simplest and lightest element. Belonging to the alkali metal group, hydrogen is incredibly reactive, readily forming compounds with various elements. Despite its abundance in the universe, hydrogen is seldom found in its pure form on Earth due to its tendency to combine with other elements. When it does occur as a diatomic molecule, represented by H2, it exists as a colorless, odorless gas. Hydrogen holds significance in various industries, from energy production to aerospace, and plays a crucial role in sustaining life through its involvement in the formation of water molecules. Additionally, hydrogen's isotopes, such as deuterium and tritium, have applications in nuclear fusion research. As compounds, hydrogen contributes to a myriad of substances, ranging from hydrocarbons to acids, underscoring its versatility and importance in both natural and synthetic processes"
text = "The Nile River, symbolized by its historical significance and geographical prominence, stands as a vital lifeline in the heart of northeastern Africa. Measuring as the longest river globally, its journey spans over 6,600 kilometers, traversing multiple countries and diverse landscapes. As a perennial source of sustenance and civilization, the Nile has shaped the development of ancient and modern societies alike, fostering agriculture, trade, and cultural exchange along its fertile banks. From the ancient civilizations of Egypt and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day. The Nile's annual flooding, once the lifeblood of agriculture, has been harnessed through extensive irrigation systems, powering economies and shaping regional dynamics. Moreover, the river holds spiritual and cultural significance for the diverse peoples inhabiting its basin, inspiring myths, rituals, and artistic expressions throughout history. Despite facing modern challenges such as pollution, overuse, and climate change, the Nile remains a symbol of resilience and adaptation, reflecting the enduring bond between humanity and the natural world"
generateQuestions(text, 20)

Feature names must be in the same order as they were in fit.



### Question 1:

From the ancient civilizations of _____ and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

Egypt


#### Incorrect answers:

egyptian
syria
arabia
cairo



### Question 2:

From the ancient civilizations of Egypt and _____ to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

Nubia


#### Incorrect answers:

nubian
pharaohs
illyria
alexandrina



### Question 3:

From the ancient civilizations of Egypt and Nubia to the contemporary nations of _____ and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

Sudan


#### Incorrect answers:

sudanese
darfur
khartoum
uganda



### Question 4:

From the ancient civilizations of Egypt and Nubia to the contemporary nations of Sudan and _____, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

Ethiopia


#### Incorrect answers:

eritrea
ethiopian
uganda
sudan



### Question 5:

From the ancient civilizations of Egypt and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain _____ of people to this day.


#### Answer:

millions


#### Incorrect answers:

tens
billions
thousands
hundreds



### Question 6:

From the ancient civilizations of Egypt and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to _____.


#### Answer:

this day


#### Incorrect answers:




### Question 7:

_____, symbolized by its historical significance and geographical prominence, stands as a vital lifeline in the heart of northeastern Africa.


#### Answer:

The Nile River


#### Incorrect answers:




### Question 8:

Measuring as the longest river globally, its journey spans over _____, traversing multiple countries and diverse landscapes.


#### Answer:

6,600 kilometers


#### Incorrect answers:




### Question 9:

The Nile River, symbolized by its historical significance and geographical prominence, stands as a vital lifeline in the heart of northeastern _____.


#### Answer:

Africa


#### Incorrect answers:

african
south
continent
africans



### Question 10:

As a perennial source of sustenance and civilization, the _____ has shaped the development of ancient and modern societies alike, fostering agriculture, trade, and cultural exchange along its fertile banks.


#### Answer:

Nile


#### Incorrect answers:

river
ebola
yangtze
mekong



### Question 11:

The _____'s annual flooding, once the lifeblood of agriculture, has been harnessed through extensive irrigation systems, powering economies and shaping regional dynamics.


#### Answer:

Nile


#### Incorrect answers:

river
ebola
yangtze
mekong



### Question 12:

Despite facing modern challenges such as pollution, overuse, and climate change, the _____ remains a symbol of resilience and adaptation, reflecting the enduring bond between humanity and the natural world


#### Answer:

Nile


#### Incorrect answers:

river
ebola
yangtze
mekong



### Question 13:

As a perennial source of sustenance and civilization, the Nile has _____ the development of ancient and modern societies alike, fostering agriculture, trade, and cultural exchange along its fertile banks.


#### Answer:

shaped


#### Incorrect answers:

shape
shapes
resembling
resemble



### Question 14:

The Nile's annual flooding, once the lifeblood of agriculture, has been _____ through extensive irrigation systems, powering economies and shaping regional dynamics.


#### Answer:

harnessed


#### Incorrect answers:

harnessing
harness
utilise
harnesses



### Question 15:

Measuring as the longest river globally, its journey _____ over 6,600 kilometers, traversing multiple countries and diverse landscapes.


#### Answer:

spans


#### Incorrect answers:

spanning
span
spanned
stretches



### Question 16:

As a perennial source of sustenance and civilization, the Nile has shaped the development of ancient and modern _____ alike, fostering agriculture, trade, and cultural exchange along its fertile banks.


#### Answer:

societies


#### Incorrect answers:

society
associations
cultures
organizations



### Question 17:

As a perennial source of sustenance and civilization, the Nile has shaped the development of ancient and modern societies alike, fostering agriculture, trade, and cultural exchange along its fertile _____.


#### Answer:

banks


#### Incorrect answers:

bank
banking
lenders
loans



### Question 18:

From the ancient _____ of Egypt and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

civilizations


#### Incorrect answers:

civilisations
civilization
cultures
religions



### Question 19:

From the ancient civilizations of Egypt and Nubia to the contemporary _____ of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of people to this day.


#### Answer:

nations


#### Incorrect answers:

countries
u.n.
un
united



### Question 20:

From the ancient civilizations of Egypt and Nubia to the contemporary nations of Sudan and Ethiopia, the river has been a cradle of human civilization, offering fertile soils and freshwater resources that sustain millions of _____ to this day.


#### Answer:

people


#### Incorrect answers:

others
those
least
many

