In [1]:
import pandas as pd

### Question Answer Dataset from The WikiQA Corpus (2015)
#### Available at https://www.microsoft.com/en-us/download/confirmation.aspx?id=52419

In [3]:
# Create dataframes
df = pd.read_csv('WikiQA.tsv',sep='\t')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29208 entries, 0 to 29207
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   QuestionID     29208 non-null  object
 1   Question       29208 non-null  object
 2   DocumentID     29208 non-null  object
 3   DocumentTitle  29208 non-null  object
 4   SentenceID     29208 non-null  object
 5   Sentence       29208 non-null  object
 6   Label          29208 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.6+ MB


In [17]:
df.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label
0,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0,African immigration to the United States,D0-0,African immigration to the United States refer...,0
1,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0,African immigration to the United States,D0-1,The term African in the scope of this article ...,0
2,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0,African immigration to the United States,D0-2,From the Immigration and Nationality Act of 19...,0
3,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0,African immigration to the United States,D0-3,African immigrants in the United States come f...,0
4,Q0,HOW AFRICAN AMERICANS WERE IMMIGRATED TO THE US,D0,African immigration to the United States,D0-4,"They include people from different national, l...",0


In [22]:
# Question topics 
pd.set_option("display.max_rows", None)
df["DocumentTitle"].value_counts().to_frame()

Unnamed: 0,DocumentTitle
American Civil War,203
World War I,162
John F. Kennedy,100
New Deal,87
Spider,84
Education in the United States,72
Body water,66
George Washington,60
Sic,60
John Adams,57


### NLP pre-processing pipeline

In [30]:
import spacy
import string
from spacy.lang.en import English
parser = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
punctuations = string.punctuation

# Create tokeniser function
def mytokeniser(text):
    
    mytokens = parser(text)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words and punctuation
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [38]:
all_words = []
tags = []
xy = []

for index, row in df.iterrows():
    tag = row["DocumentTitle"]
    tags.append(tag) if tag not in tags else tags
    for word in mytokeniser(row["Question"]):
        all_words.append(word) if word not in all_words else all_words
        xy.append((word, tag)) if (word, tag) not in xy else xy

In [39]:
xy

[('african', 'African immigration to the United States'),
 ('americans', 'African immigration to the United States'),
 ('immigrated', 'African immigration to the United States'),
 ('glaci', 'Glacier cave'),
 ('cave', 'Glacier cave'),
 ('form', 'Glacier cave'),
 ('direction', 'Circular motion'),
 ('velocity', 'Circular motion'),
 ('force', 'Circular motion'),
 ('vector', 'Circular motion'),
 ('relate', 'Circular motion'),
 ('circular', 'Circular motion'),
 ('motion', 'Circular motion'),
 ('large', 'Prison'),
 ('early', 'Prison'),
 ('jail', 'Prison'),
 ('water', 'Pump'),
 ('pump', 'Pump'),
 ('work', 'Pump'),
 ('apollo', 'Apollo Creed'),
 ('creed', 'Apollo Creed'),
 ('die', 'Apollo Creed'),
 ('long', 'United States federal judge'),
 ('term', 'United States federal judge'),
 ('federal', 'United States federal judge'),
 ('judge', 'United States federal judge'),
 ('beretta', 'Beretta 21A Bobcat'),
 ('model', 'Beretta 21A Bobcat'),
 ('21', 'Beretta 21A Bobcat'),
 ('pistol', 'Beretta 21A Bobca