In [8]:
from nltk import tokenize
import pandas as pd

In [9]:
# Load the SQuAD v1.1 train and dev splits and stack them into a single frame.
# Each row holds one article dict in `data` plus the dataset `version`.
# 'columns' is the documented orient value for column-keyed JSON; the previous
# 'column' spelling is not a documented option.
train = pd.read_json('../data/squad-v1/archive/train-v1.1.json', orient='columns')
dev = pd.read_json('../data/squad-v1/archive/dev-v1.1.json', orient='columns')
# ignore_index=True renumbers rows 0..n-1 so later cells can index articles by position.
df = pd.concat([train, dev], ignore_index=True)
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


In [10]:
def showQuestion(titleId, paragraphId, questionId):
    """Print one question together with its article title, paragraph and first answer.

    Args:
        titleId: row label of the article inside ``df['data']``.
        paragraphId: index of the paragraph within that article.
        questionId: index of the question within that paragraph.

    Only the first entry of the question's ``answers`` list is shown; its
    ``answer_start`` value is printed on the line before the answer text.
    """
    article = df['data'][titleId]
    passage = article['paragraphs'][paragraphId]
    qa = passage['qas'][questionId]
    firstAnswer = qa['answers'][0]

    # (heading, values...) pairs printed in the same order as before.
    report = [
        'Title',
        article['title'],
        '\nParagraph',
        passage['context'],
        '\nQuestion',
        qa['question'],
        '\nAnswer',
        firstAnswer['answer_start'],
        firstAnswer['text'],
    ]
    for line in report:
        print(line)

In [11]:
# Show the very first question of the first paragraph of the first article.
titleId, paragraphId, questionId = 0, 0, 0

showQuestion(titleId, paragraphId, questionId)

Title
University_of_Notre_Dame

Paragraph
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Question
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Answer
515
Saint Bernadette Soubirous


Dataset Size

In [12]:
# Count articles, paragraphs and questions across the combined train + dev frame.
articles = df['data']
titlesCount = len(articles)
totalParagraphsCount = 0
totalQuestionsCount = 0

for article in articles:
    passages = article['paragraphs']
    totalParagraphsCount += len(passages)
    # Every paragraph carries its own list of question/answer entries under 'qas'.
    totalQuestionsCount += sum(len(passage['qas']) for passage in passages)

print('Titles', titlesCount)
print('Paragraphs', totalParagraphsCount)
print('Questions', totalQuestionsCount)

Titles 490
Paragraphs 20963
Questions 98169


In [13]:
def extractSentence(paragraph, answerStart):
    """Return the sentence of ``paragraph`` that contains character offset ``answerStart``.

    Args:
        paragraph: full paragraph text.
        answerStart: character offset of the answer inside ``paragraph``.

    Returns:
        The sentence (as produced by ``tokenize.sent_tokenize``) whose span covers
        ``answerStart``. If the offset lies in the whitespace between two
        sentences, the following sentence is returned. If the offset lies past
        the last sentence, the last sentence is returned. ``None`` is returned
        only when the paragraph yields no sentences at all.
    """
    sentences = tokenize.sent_tokenize(paragraph)
    searchFrom = 0
    lastSentence = None

    for sentence in sentences:
        # Locate the sentence in the original text instead of assuming that
        # sentences are separated by exactly one character: paragraphs may
        # contain double spaces or newlines, which made the running offset drift.
        sentenceStart = paragraph.find(sentence, searchFrom)
        if sentenceStart < 0:
            # Should not happen for substrings of the paragraph; fall back to
            # the end of the previous sentence.
            sentenceStart = searchFrom
        sentenceEnd = sentenceStart + len(sentence)

        # Strict comparison: an offset equal to sentenceEnd is already past
        # the last character of this sentence.
        if answerStart < sentenceEnd:
            return sentence

        searchFrom = sentenceEnd
        lastSentence = sentence

    return lastSentence

In [14]:
# Try the sentence extraction on the first question of the first paragraph.
firstParagraph = df['data'][0]['paragraphs'][0]
paragraph = firstParagraph['context']
answerStart = firstParagraph['qas'][0]['answers'][0]['answer_start']

sentence = extractSentence(paragraph, answerStart)
print(sentence)

It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.


In [15]:
def containedInText(text, question):
    """Return the fraction of question tokens that also occur in ``text``.

    Both strings are lower-cased and split with ``tokenize.word_tokenize``.
    Every question token counts once per occurrence in the question, so
    repeated question words are counted repeatedly. Punctuation tokens produced
    by the tokenizer are included in the count.

    Args:
        text: sentence or paragraph to search in.
        question: question whose words are looked up.

    Returns:
        A float in [0, 1]; ``0.0`` when the question yields no tokens.
    """
    questionWords = tokenize.word_tokenize(question.lower())
    if not questionWords:
        # Avoid dividing by zero for empty / whitespace-only questions.
        return 0.0

    # A set gives constant-time membership tests instead of rescanning the
    # token list for every question word.
    textWords = set(tokenize.word_tokenize(text.lower()))
    wordsContained = sum(1 for questionWord in questionWords if questionWord in textWords)

    return wordsContained / len(questionWords)

In [16]:
# Containment score of the first question against the sentence extracted above.
question = df['data'][0]['paragraphs'][0]['qas'][0]['question']
contained = containedInText(sentence, question)

# One print call; sep='\n' puts every item on its own line.
print('Question', question, '\nSentence', sentence, "\nContained", contained, sep='\n')


Question
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Sentence
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.

Contained
0.6428571428571429


In [17]:
import pandas as pd
from pathlib import Path
from IPython.display import clear_output

def printPercentage(currentStep, maxStep):
    """Show loop progress as a whole percentage, replacing the previous output.

    The cell output is cleared and reprinted only when the integer percentage
    for ``currentStep`` is higher than the one for ``currentStep - 1``, so the
    display updates about once per percent rather than on every step.

    Args:
        currentStep: zero-based index of the current iteration.
        maxStep: total number of iterations; nothing is printed when it is
            zero or negative.
    """
    if maxStep <= 0:
        # Nothing to report, and stepSize would be 0 (division by zero below).
        return

    stepSize = maxStep / 100
    currentPercent = int(currentStep / stepSize)
    previousPercent = int((currentStep - 1) / stepSize)

    if currentPercent > previousPercent:
        clear_output()
        print('{}%'.format(currentPercent))

# File used to cache the question-containment scores between notebook runs
# (relative to the notebook's working directory).
questionContainmentCsvName = 'questionContainmentDf.csv'

def csvExists(fileName):
    """Return True when ``fileName`` points to an existing regular file."""
    return Path(fileName).is_file()

# Load the cached containment scores if present; otherwise score every
# question against (a) the sentence holding its first answer and (b) the whole
# paragraph, then cache the result as CSV.
if csvExists(questionContainmentCsvName):
    print("CSV found. Loading...")
    questionContainmentDf = pd.read_csv(questionContainmentCsvName)
else:
    sentenceScore = []
    paragraphScore = []

    titlesCount = len(df['data'])
    for titleId, article in enumerate(df['data']):
        printPercentage(titleId, titlesCount)

        for passage in article['paragraphs']:
            paragraph = passage['context']

            for qa in passage['qas']:
                question = qa['question']
                # Only the first listed answer is used to locate the sentence.
                answerStart = qa['answers'][0]['answer_start']
                sentence = extractSentence(paragraph, answerStart)

                sentenceScore.append(containedInText(sentence, question))
                paragraphScore.append(containedInText(paragraph, question))

    # Build the two-column frame in one step (same column order as before).
    questionContainmentDf = pd.DataFrame({
        'sentence': sentenceScore,
        'paragraph': paragraphScore,
    })

    questionContainmentDf.to_csv(questionContainmentCsvName, index=False)
    print("Result not found. Generating and saved as CSV...")
    
   

CSV found. Loading...


In [18]:
 #Extracting Answers Dataframe
# Collect, for every question, the text of its first answer and the sentence
# of the paragraph in which that answer starts.
answers = []
sentences = []

titlesCount = len(df['data'])
for titleId, article in enumerate(df['data']):
    printPercentage(titleId, titlesCount)

    for passage in article['paragraphs']:
        paragraph = passage['context']

        for qa in passage['qas']:
            firstAnswer = qa['answers'][0]
            answer = firstAnswer['text']
            answerStart = firstAnswer['answer_start']
            sentence = extractSentence(paragraph, answerStart)

            answers.append(answer)
            sentences.append(sentence)

# Two single-column frames joined side by side: 'answer' first, then 'sentence'.
answerTextsDf = pd.DataFrame(answers, columns=['answer'])
sentenceDf = pd.DataFrame(sentences, columns=['sentence'])
answersDf = pd.concat([answerTextsDf, sentenceDf], axis=1)

99%


In [19]:
# Summary statistics of the sentence-level and paragraph-level containment scores.
questionContainmentDf.describe()

Unnamed: 0,sentence,paragraph
count,98169.0,98169.0
mean,0.463967,0.582192
std,0.190377,0.159048
min,0.0,0.0
25%,0.333333,0.5
50%,0.461538,0.6
75%,0.6,0.7
max,1.0,1.0


In [20]:
# First ten rows of the containment scores (one row per question).
questionContainmentDf.head(10)

Unnamed: 0,sentence,paragraph
0,0.642857,0.571429
1,0.636364,0.636364
2,0.533333,0.6
3,0.375,0.5
4,0.333333,0.416667
5,0.272727,0.636364
6,0.3,0.8
7,0.363636,0.727273
8,0.0,0.545455
9,0.266667,0.733333


In [21]:
# Preview the extracted answers next to the sentence each one was located in.
answersDf.head()

Unnamed: 0,answer,sentence
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran..."
1,a copper statue of Christ,Immediately in front of the Main Building and ...
2,the Main Building,Next to the Main Building is the Basilica of t...
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,..."
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...


In [22]:
# Number of tokens (per NLTK's word_tokenize) in every answer, in row order.
# Iterating the column directly avoids building a row Series per answer,
# which the previous `answersDf.iloc[i]['answer']` lookup did.
wordCount = [len(tokenize.word_tokenize(answerText)) for answerText in answersDf['answer']]

In [23]:
# Attach the token counts as a 'wordCount' column. `assign` replaces an existing
# column of that name, so re-running this cell no longer appends a duplicate
# 'wordCount' column the way concatenating along axis=1 did.
answersDf = answersDf.assign(wordCount=wordCount)

In [24]:
# Summary statistics of the answer length in tokens.
answersDf['wordCount'].describe()

count    98169.000000
mean         3.355031
std          3.731700
min          1.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         46.000000
Name: wordCount, dtype: float64

In [25]:
# Preview the frame now that the wordCount column has been added.
answersDf.head()

Unnamed: 0,answer,sentence,wordCount
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran...",3
1,a copper statue of Christ,Immediately in front of the Main Building and ...,5
2,the Main Building,Next to the Main Building is the Basilica of t...,3
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,...",7
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...,7


In [26]:
# How many answers there are for each token count, most frequent first.
answersDf['wordCount'].value_counts()

wordCount
1     32156
2     25228
3     14348
4      7562
5      4659
6      3051
7      2222
8      1676
9      1206
10      975
11      755
12      652
13      566
14      461
15      407
16      313
18      274
17      269
19      244
20      191
21      182
23      138
22      131
25      120
24      101
26       77
28       59
27       58
29       28
30       19
31       12
32       11
33        6
38        2
34        2
35        2
36        2
37        2
46        1
42        1
Name: count, dtype: int64

Word Types


In [27]:
import spacy
from spacy import displacy
from collections import Counter
# Load the spaCy pipeline named 'en_core_web_sm'; the cells below call `nlp(text)`
# to obtain entities, tokens and noun chunks.
nlp = spacy.load('en_core_web_sm')

In [28]:
# Quick look at the named entities spaCy finds in a sample sentence.
sampleText = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
doc = nlp(sampleText)
print([(entity.text, entity.label_) for entity in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [29]:
def NerForWord(text):
    """Return the label of the first named entity spaCy finds in ``text``.

    Returns an empty string when no entity is detected.
    """
    entities = nlp(text).ents
    if not entities:
        return ''
    return entities[0].label_

In [30]:
# Example: entity label assigned to a single country name.
NerForWord('Portugal')

'GPE'

In [31]:
def isSingleToken(text):
    """Tell whether ``text`` should be treated as one unit.

    True when either
    * spaCy finds exactly one named entity and that entity's text equals
      ``text`` exactly (so a multi-word name can qualify), or
    * the text is tokenized into a single token.
    """
    parsed = nlp(text)
    entities = parsed.ents

    # Case 1: the entire text is one named entity.
    wholeTextIsEntity = len(entities) == 1 and entities[0].text == text

    # Case 2: not (exactly) a named entity, but only one token long.
    return wholeTextIsEntity or len(parsed) == 1

In [32]:
# Example with a multi-word name: the result is True when spaCy recognises the
# whole string as exactly one named entity.
isSingleToken('George R. R. Martin')

True

In [33]:
# Count the answers that isSingleToken() accepts, reporting progress as we go.
singleTokenCount = 0
answerTotal = len(answersDf)

for i, answerText in enumerate(answersDf['answer']):
    printPercentage(i, answerTotal)

    if isSingleToken(answerText):
        singleTokenCount += 1

99%


In [34]:
# Share of answers that count as a single token / single entity.
sampleSize = len(answersDf)
singleTokenCount / sampleSize

0.508521019873891

In [35]:
# Add the linguistic feature columns with neutral defaults; they are filled in
# for qualifying answers by a later cell. Dict order fixes the column order.
featureDefaults = {
    'isSingleToken': False,
    'NER': '',
    'POS': '',
    'TAG': '',
    'DEP': '',
    'shape': '',
    'isAlpha': False,
    'isStop': False,
}
for columnName, defaultValue in featureDefaults.items():
    answersDf[columnName] = defaultValue

answersDf.head()

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran...",3,False,,,,,,False,False
1,a copper statue of Christ,Immediately in front of the Main Building and ...,5,False,,,,,,False,False
2,the Main Building,Next to the Main Building is the Basilica of t...,3,False,,,,,,False,False
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,...",7,False,,,,,,False,False
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...,7,False,,,,,,False,False


In [36]:
import pandas as pd
import os

def csv_exists(file_path):
    """Return True when ``file_path`` refers to an existing regular file."""
    found = os.path.isfile(file_path)
    return found

# Cache file for the answers frame with its linguistic feature columns.
csv_file_path = "df.csv"

if csv_exists(csv_file_path):
    # keep_default_na=False keeps empty strings (the default for NER/POS/TAG/
    # DEP/shape) and answers such as "NA" or "null" as text instead of turning
    # them into NaN, so the cached frame matches the freshly computed one.
    answersDf = pd.read_csv(csv_file_path, keep_default_na=False)
else:
    # Iterate by index label so the `.at[...]` writes below address the same
    # row the answer was read from.
    for rowLabel, answer in answersDf['answer'].items():
        if (isSingleToken(answer)):
            answersDf.at[rowLabel, 'isSingleToken'] = True
            answersDf.at[rowLabel, 'NER'] = NerForWord(answer)

            doc = nlp(answer)
            # Token-level attributes are taken from the first token only, even
            # when the answer is a multi-token entity.
            firstToken = doc[0]

            answersDf.at[rowLabel, 'POS'] = firstToken.pos_
            answersDf.at[rowLabel, 'TAG'] = firstToken.tag_
            answersDf.at[rowLabel, 'DEP'] = firstToken.dep_
            answersDf.at[rowLabel, 'isAlpha'] = firstToken.is_alpha
            answersDf.at[rowLabel, 'isStop'] = firstToken.is_stop

            # Shape covers every token, separated by single spaces.
            answersDf.at[rowLabel, 'shape'] = ' '.join(token.shape_ for token in doc)

    answersDf.to_csv(csv_file_path, index=False)


In [37]:
# Inspect the first rows after the feature columns were computed or loaded from the cache.
answersDf.head()

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
0,Saint Bernadette Soubirous,"It is a replica of the grotto at Lourdes, Fran...",3,False,,,,,,False,False
1,a copper statue of Christ,Immediately in front of the Main Building and ...,5,False,,,,,,False,False
2,the Main Building,Next to the Main Building is the Basilica of t...,3,True,ORG,DET,DT,det,xxx Xxxx Xxxxx,True,True
3,a Marian place of prayer and reflection,"Immediately behind the basilica is the Grotto,...",7,False,,,,,,False,False
4,a golden statue of the Virgin Mary,Atop the Main Building's gold dome is a golden...,7,False,,,,,,False,False


In [38]:
# Number of answers with and without the isSingleToken flag.
answersDf['isSingleToken'].value_counts()

isSingleToken
True     49921
False    48248
Name: count, dtype: int64

In [39]:
# Distribution of the isStop flag across all answers.
answersDf['isStop'].value_counts()

isStop
False    93877
True      4292
Name: count, dtype: int64

In [40]:
# Frequency of each entity label; the empty string is the default for rows without a label.
answersDf['NER'].value_counts()

NER
               61309
DATE            9421
PERSON          7056
CARDINAL        6031
ORG             5578
GPE             3225
NORP            1333
PERCENT         1252
LOC              640
MONEY            536
ORDINAL          408
QUANTITY         384
EVENT            295
FAC              239
TIME             157
PRODUCT           83
LANGUAGE          79
LAW               74
WORK_OF_ART       69
Name: count, dtype: int64

In [41]:
# Frequency of each part-of-speech tag; the empty string is the default for rows that were not tagged.
answersDf['POS'].value_counts()

POS
         48248
PROPN    18094
NUM      14158
NOUN      9098
ADJ       3640
VERB      1749
DET       1582
ADV        663
SYM        384
ADP        236
X          122
PRON        58
PUNCT       50
INTJ        40
AUX         28
PART        15
CCONJ        4
Name: count, dtype: int64

Nouns: among the single-token answers, proper nouns (PROPN) and common nouns (NOUN) together are the most frequent part of speech.

In [42]:
# Five random answers tagged PROPN; random_state fixes the sample so it is repeatable.
answersDf[answersDf['POS'] == 'PROPN'].sample(n=5, random_state=16)

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
2882,Zelda,"Bringing a dying Midna to Zelda, Link learns h...",1,True,,PROPN,NNP,ROOT,Xxxxx,True,False
40277,Manchester United,"In addition, Arsenal and Manchester United dev...",2,True,ORG,PROPN,NNP,compound,Xxxxx Xxxxx,True,False
35374,Ricoh,The NES uses a custom-made Picture Processing ...,1,True,,PROPN,NNP,ROOT,Xxxxx,True,False
79901,Vitruvius Britannicus,Major architects to promote the change in dire...,2,True,PERSON,PROPN,NNP,compound,Xxxxx Xxxxx,True,False
38762,Egypt,"After the defeat of Anthony and his lover, the...",1,True,GPE,PROPN,NNP,ROOT,Xxxxx,True,False


In [43]:
# Five random answers tagged NOUN; random_state fixes the sample so it is repeatable.
answersDf[answersDf['POS'] == 'NOUN'].sample(n=5, random_state=16)

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
2471,Apple,The iPod is a line of portable media players a...,1,True,ORG,NOUN,NN,ROOT,Xxxxx,True,False
26904,handicrafts,The state is well known for its handicrafts.,1,True,,NOUN,NNS,ROOT,xxxx,True,False
40513,women,This change is interesting from a sociolinguis...,1,True,,NOUN,NNS,ROOT,xxxx,True,False
38140,intermarriage,"McNutt says, ""It is probably safe to assume th...",1,True,,NOUN,NN,ROOT,xxxx,True,False
25550,LAME,"However, some encoders such as LAME can attach...",1,True,PERSON,NOUN,NN,ROOT,XXXX,True,False


Numerals

In [44]:
# Ten random answers tagged NUM; random_state fixes the sample so it is repeatable.
answersDf[answersDf['POS'] == 'NUM'].sample(n=10, random_state=16)

Unnamed: 0,answer,sentence,wordCount,isSingleToken,NER,POS,TAG,DEP,shape,isAlpha,isStop
60329,1877,"In 1877, the Protestant James Cameron from the...",1,True,DATE,NUM,CD,ROOT,dddd,False,False
17947,32,"About 2,100 students attend the Edwards Campus...",1,True,CARDINAL,NUM,CD,ROOT,dd,False,False
35419,11.7,The unit itself weighs approximately 11.7 poun...,1,True,CARDINAL,NUM,CD,ROOT,dd.d,False,False
65451,1790,The Seine département had been governing Paris...,1,True,DATE,NUM,CD,ROOT,dddd,False,False
13308,47,"In 1974, there were 475 institutes of higher e...",1,True,CARDINAL,NUM,CD,ROOT,dd,False,False
90384,12,Amongst these include 5 University of Californ...,1,True,CARDINAL,NUM,CD,ROOT,dd,False,False
64351,three,"He proposed three worlds: World One, being the...",1,True,CARDINAL,NUM,CD,ROOT,xxxx,True,True
73146,800 tonnes,The last major attack on London was on 10/11 M...,2,True,QUANTITY,NUM,CD,nummod,ddd xxxx,False,False
7534,131,"In 2012, an analysis of the 131 contestants wh...",1,True,CARDINAL,NUM,CD,ROOT,ddd,False,False
31894,1214,"John's first wife, Isabel, Countess of Glouces...",1,True,DATE,NUM,CD,ROOT,dddd,False,False


Noun Chunks

In [45]:
# List the noun chunks spaCy finds in the first paragraph of the first article.
text = df['data'][0]['paragraphs'][0]['context']
doc = nlp(text)

for chunk in doc.noun_chunks:
    print(chunk)

the school
a Catholic character
the Main Building's gold dome
a golden statue
the Virgin Mary
front
the Main Building
it
a copper statue
Christ
arms
the legend
"Venite Ad Me Omnes
the Main Building
the Basilica
the Sacred Heart
the basilica
the Grotto
a Marian place
prayer
reflection
It
a replica
the grotto
Lourdes
France
the Virgin Mary
the end
the main drive
a direct line
that
3 statues
the Gold Dome
a simple, modern stone statue
Mary
