# Feature Engineering

## Imports

In [1]:
!pip3 install -r Requirements.txt



In [2]:

import pandas as pd

### Spacy
For the features of the words we're going to use spacy. Once we tokenize the text, we can access features like Part of speech and others.

In [3]:
!python3 -m spacy download en_core_web_sm 


Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 2.1 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')

#### Displaying progress
tqdm is a progress bar. It will come usefeful when we extract the answers for all the articles.

In [5]:
from tqdm import tqdm

In [6]:
import time

for i in tqdm(range(10)):
    time.sleep(0.5)

100%|██████████| 10/10 [00:05<00:00,  1.99it/s]


#### Pickling
Once we extract all the words from the texts, we'll save them using pickle. Then we can easily use them in the other modules and have to wait for them to generat again.

In [7]:
import _pickle as cPickle
from pathlib import Path

def dumpPickle(fileName, content):
    pickleFile = open(fileName, 'wb')
    cPickle.dump(content, pickleFile, -1)
    pickleFile.close()

def loadPickle(fileName):    
    file = open(fileName, 'rb')
    content = cPickle.load(file)
    file.close()
    
    return content
    
def pickleExists(fileName):
    file = Path(fileName)
    
    if file.is_file():
        return True
    
    return False

## Reading the dataset

In [8]:
train = pd.read_json('../data/squad-v1/train-v1.1.json', orient='column')
dev = pd.read_json('../data/squad-v1/dev-v1.1.json', orient='column')

df = pd.concat([train, dev], ignore_index=True)

In [9]:
df.head()

Unnamed: 0,data,version
0,"{'title': 'University_of_Notre_Dame', 'paragra...",1.1
1,"{'title': 'Beyoncé', 'paragraphs': [{'context'...",1.1
2,"{'title': 'Montana', 'paragraphs': [{'context'...",1.1
3,"{'title': 'Genocide', 'paragraphs': [{'context...",1.1
4,"{'title': 'Antibiotics', 'paragraphs': [{'cont...",1.1


## Extracting words from a paragrapgh

In [10]:
currText = df['data'][0]['paragraphs'][0]['context']
currQas = df['data'][0]['paragraphs'][0]['qas']

In [11]:
currDoc = nlp(currText)

In [12]:
#Extract answers and the sentence they are in
def extractAnswers(qas, doc):
    answers = []

    senStart = 0
    senId = 0

    for sentence in doc.sents:
        senLen = len(sentence.text)

        for answer in qas:
            answerStart = answer['answers'][0]['answer_start']

            if (answerStart >= senStart and answerStart < (senStart + senLen)):
                answers.append({'sentenceId': senId, 'text': answer['answers'][0]['text']})

        senStart += senLen
        senId += 1
    
    return answers

In [13]:
currAnswers = extractAnswers(currQas, currDoc)
currAnswers

[{'sentenceId': 1, 'text': 'a golden statue of the Virgin Mary'},
 {'sentenceId': 2, 'text': 'a copper statue of Christ'},
 {'sentenceId': 3, 'text': 'the Main Building'},
 {'sentenceId': 4, 'text': 'a Marian place of prayer and reflection'},
 {'sentenceId': 5, 'text': 'Saint Bernadette Soubirous'}]

In [14]:
#TODO - Clean answers from stopwords?
def tokenIsAnswer(token, sentenceId, answers):
    for i in range(len(answers)):
        if (answers[i]['sentenceId'] == sentenceId):
            if (answers[i]['text'] == token):
                return True
    return False

In [15]:
tokenIsAnswer('the Main Building', 4, currAnswers)

False

In [16]:
#Save named entities start points

def getNEStartIndexs(doc):
    neStarts = {}
    for ne in doc.ents:
        neStarts[ne.start] = ne
        
    return neStarts 

In [17]:
currNeStarts = getNEStartIndexs(currDoc)

if 6 in currNeStarts:
    print(currNeStarts[6].label_)

NORP


In [18]:
def getSentenceStartIndexes(doc):
    senStarts = []
    
    for sentence in doc.sents:
        senStarts.append(sentence[0].i)
    
    return senStarts
    
def getSentenceForWordPosition(wordPos, senStarts):
    for i in range(1, len(senStarts)):
        if (wordPos < senStarts[i]):
            return i - 1

In [19]:
senStarts = getSentenceStartIndexes(currDoc)
senStarts

[0, 9, 25, 55, 68, 84, 108]

In [20]:
getSentenceForWordPosition(108, senStarts)

In [21]:
#Creating the dataframe
wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape', 'Rake']
wordDf = pd.DataFrame(columns=wordColums)

#Save to pickle

#load df

#Add new words to array
newWord = ['koala', True, 0, 0, 4, 1, None, None, None, None, 'xxxxx']
newWords = []
#newWords.append(newWord)

#Make array to dataframe
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf

#Merge dataframes

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape,Rake


In [22]:
!pip3 install rake_nltk



In [23]:
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
# from keybert import KeyBERT

def addWordsForParagrapgh(newWords, titleId, paragraphId):
    text = df['data'][titleId]['paragraphs'][paragraphId]['context']
    qas = df['data'][titleId]['paragraphs'][paragraphId]['qas']

    doc = nlp(text)
    
    r = Rake()
    r.extract_keywords_from_text(text)
    b = r.get_ranked_phrases()
    
    # kb_model = KeyBERT()
    # keywords = kb_model.extract_keywords(text)
    
    answers = extractAnswers(qas, doc)
    neStarts = getNEStartIndexs(doc)
    senStarts = getSentenceStartIndexes(doc)
    
    #index of word in spacy doc text
    i = 0
    
    while (i < len(doc)):
        #If the token is a start of a Named Entity, add it and push to index to end of the NE
        if (i in neStarts):
            word = neStarts[i]
            #add word
            currentSentence = getSentenceForWordPosition(word.start, senStarts)
            wordLen = word.end - word.start
            shape = ''
            for wordIndex in range(word.start, word.end):
                shape += (' ' + doc[wordIndex].shape_)

            newWords.append([word.text,
                            tokenIsAnswer(word.text, currentSentence, answers),
                            titleId,
                            paragraphId,
                            currentSentence,
                            wordLen,
                            word.label_,
                            None,
                            None,
                            None,
                            shape,
                            b.count(word.text)>0])
                            # keywords.count(word.text)>0])
            i = neStarts[i].end - 1
        #If not a NE, add the word if it's not a stopword or a non-alpha (not regular letters)
        else:
            if (doc[i].is_stop == False and doc[i].is_alpha == True):
                word = doc[i]

                currentSentence = getSentenceForWordPosition(i, senStarts)
                wordLen = 1

                newWords.append([word.text,
                                tokenIsAnswer(word.text, currentSentence, answers),
                                titleId,
                                paragraphId,
                                currentSentence,
                                wordLen,
                                None,
                                word.pos_,
                                word.tag_,
                                word.dep_,
                                word.shape_,
                                b.count(word.text)>0])
                                # keywords.count(word.text)>0])
        i += 1


In [25]:
#TODO For each token add, for each NE add... 

In [26]:
newWords = []

In [27]:
addWordsForParagrapgh(newWords, 0, 1)

In [28]:
newWords[0]

['universities', False, 0, 1, 0, 1, None, 'NOUN', 'NNS', 'pobj', 'xxxx', True]

In [29]:
newWordsDf = pd.DataFrame(newWords, columns=wordColums)
newWordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape,Rake
0,universities,False,0,1,0.0,1,,NOUN,NNS,pobj,xxxx,True
1,Notre Dame's,False,0,1,0.0,3,ORG,,,,Xxxxx Xxxx 'x,False
2,students,False,0,1,0.0,1,,NOUN,NNS,nsubj,xxxx,True
3,run,False,0,1,0.0,1,,VERB,VBP,ROOT,xxx,False
4,number,False,0,1,0.0,1,,NOUN,NN,dobj,xxxx,True


In [30]:
newWordsDf[newWordsDf['isAnswer'] == True].head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape,Rake
13,three,True,0,1,1.0,1,CARDINAL,,,,xxxx,True
24,September 1876,True,0,1,2.0,2,DATE,,,,Xxxxx dddd,False
75,1987,True,0,1,7.0,1,DATE,,,,dddd,True


Generating a words for 2 titles

In [31]:
words = []

#titlesCount = len(df['data'])
titlesCount = 2

for titleId in tqdm(range(titlesCount)):
    paragraphsCount = len(df['data'][titleId]['paragraphs'])
        
    for paragraphId in range(paragraphsCount):
        addWordsForParagrapgh(words, titleId, paragraphId)
        

100%|██████████| 2/2 [00:05<00:00,  2.70s/it]


In [32]:
wordsDf = pd.DataFrame(words, columns=wordColums)
wordsDf.head()

Unnamed: 0,text,isAnswer,titleId,paragrapghId,sentenceId,wordCount,NER,POS,TAG,DEP,shape,Rake
0,Architecturally,False,0,0,0.0,1,ORG,,,,Xxxxx,False
1,school,False,0,0,0.0,1,,NOUN,NN,nsubj,xxxx,True
2,Catholic,False,0,0,0.0,1,NORP,,,,Xxxxx,False
3,character,False,0,0,0.0,1,,NOUN,NN,dobj,xxxx,False
4,Atop,False,0,0,1.0,1,,ADP,IN,prep,Xxxx,False


In [33]:
print("Total words for 2 articles:", len(wordsDf))

Total words for 2 articles: 8664


## Generating the entire word dataset

In [None]:
wordPickleName = '../data/pickles/wordsDf.pkl'
# wordPickleName = '/content/wordsDf.pkl'
#If the dataframe is already generated, load it.
if (pickleExists(wordPickleName)):
    print("Pickle found. Saved some time.")
    wordsDf = loadPickle(wordPickleName)
else:
    #Extracting words
    words = []

#     titlesCount = len(df['data'])   
    titlesCount = 2   

    for titleId in tqdm(range(titlesCount)):
        paragraphsCount = len(df['data'][titleId]['paragraphs'])

#         printProgress(titleId, titlesCount - 1)

        for paragraphId in range(paragraphsCount):
            addWordsForParagrapgh(words, titleId, paragraphId)
    
    #Create the dataframe
    wordColums = ['text', 'isAnswer', 'titleId', 'paragrapghId', 'sentenceId','wordCount', 'NER', 'POS', 'TAG', 'DEP','shape', 'Rake']
    wordsDf = pd.DataFrame(words, columns=wordColums)
    
    #Pickle the result
    dumpPickle(wordPickleName, wordsDf)
    print("Result was not pickled. You had to wait.")

 50%|█████     | 1/2 [00:02<00:02,  2.76s/it]

Total extracted words:

In [None]:
print("Total words for all articles:", len(wordsDf))

Check what percentage of the extracted words are answers in the dataframe. They should be pretty low

In [None]:
totalAnswers = len(wordsDf[wordsDf['isAnswer'] == True])
print(totalAnswers, 'total answers', '{:.2f}%'.format((totalAnswers / len(wordsDf)) * 100), 'of all words are answers.')