In [1]:
%%time

# Importing necessary libraries

import json
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

CPU times: user 538 ms, sys: 125 ms, total: 663 ms
Wall time: 1.32 s


In [2]:
%%capture

# Utility cell for downloading SQuAD2.0 data in the IPYNB folder

!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [3]:
%%time


'''
Reading from input file and doing json normalizing upon the train data to achieve the required 
dataframe format
'''
def convert_from_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    file = json.loads(open(file_path).read())
    js = pd.json_normalize(file, record_path)
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])
    index = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = index
    data = m[['id', 'question', 'context', 'answers']].set_index('id').reset_index()
    data['context_id'] = data['context'].factorize()[0]
    return data
    

train_file_path = 'squad/train-v2.0.json'
train_data = convert_from_json_to_dataframe(train_file_path)
train_data

CPU times: user 8.42 s, sys: 322 ms, total: 8.75 s
Wall time: 8.76 s


Unnamed: 0,id,question,context,answers,context_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'in the late 1990s', 'answer_start': 269}]",0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was growing up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'singing and dancing', 'answer_start': 207}]",0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and become a solo singer?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': '2003', 'answer_start': 526}]",0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'Houston, Texas', 'answer_start': 166}]",0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'late 1990s', 'answer_start': 276}]",0
...,...,...,...,...,...
130314,5a7e070b70df9f001a875439,Physics has broadly agreed on the definition of what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130315,5a7e070b70df9f001a87543a,Who coined the term partonic matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130316,5a7e070b70df9f001a87543b,What is another name for anti-matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130317,5a7e070b70df9f001a87543c,Matter usually does not need to be used in conjunction with what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028


In [4]:
%%time

# Exploring the train dataset a bit more!

print ('Number of unique contexts in the train dataset :', len(train_data['context_id'].unique()))

Number of unique contexts in the train dataset : 19029
CPU times: user 3.82 ms, sys: 987 µs, total: 4.8 ms
Wall time: 4.79 ms


In [5]:
%%time

# Getting unique documents from the train dataset

train_unique_documents = train_data[['context', 'context_id']].drop_duplicates().reset_index(drop=True)
train_unique_documents

CPU times: user 193 ms, sys: 2.89 ms, total: 196 ms
Wall time: 197 ms


Unnamed: 0,context,context_id
0,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",0
1,"Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits ""Déjà Vu"", ""Irreplaceable"", and ""Beautiful Liar"". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for ""Single Ladies (Put a Ring on It)"". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.",1
2,"A self-described ""modern-day feminist"", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny's Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award's history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011. Time listed her among the 100 most influential people in the world in 2013 and 2014. Forbes magazine also listed her as the most powerful female musician of 2015.",2
3,"Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann ""Tina"" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé's name is a tribute to her mother's maiden name. Beyoncé's younger sister Solange is also a singer and a former member of Destiny's Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.",3
4,"Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's ""Imagine"" to beat 15/16-year-olds. In fall of 1990, Beyoncé enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyoncé was also a member of the choir at St. John's United Methodist Church as a soloist for two years.",4
...,...,...
19024,"Isaac Newton (1643–1727) inherited Descartes' mechanical conception of matter. In the third of his ""Rules of Reasoning in Philosophy"", Newton lists the universal qualities of matter as ""extension, hardness, impenetrability, mobility, and inertia"". Similarly in Optics he conjectures that God created matter as ""solid, massy, hard, impenetrable, movable particles"", which were ""...even so very hard as never to wear or break in pieces"". The ""primary"" properties of matter were amenable to mathematical description, unlike ""secondary"" qualities such as color or taste. Like Descartes, Newton rejected the essential nature of secondary qualities.",19024
19025,"There is an entire literature concerning the ""structure of matter"", ranging from the ""electrical structure"" in the early 20th century, to the more recent ""quark structure of matter"", introduced today with the remark: Understanding the quark structure of matter has been one of the most important advances in contemporary physics.[further explanation needed] In this connection, physicists speak of matter fields, and speak of particles as ""quantum excitations of a mode of the matter field"". And here is a quote from de Sabbata and Gasperini: ""With the word ""matter"" we denote, in this context, the sources of the interactions, that is spinor fields (like quarks and leptons), which are believed to be the fundamental components of matter, or scalar fields, like the Higgs particles, which are used to introduced mass in a gauge theory (and that, however, could be composed of more fundamental fermion fields).""[further explanation needed]",19025
19026,"In the late 19th century with the discovery of the electron, and in the early 20th century, with the discovery of the atomic nucleus, and the birth of particle physics, matter was seen as made up of electrons, protons and neutrons interacting to form atoms. Today, we know that even protons and neutrons are not indivisible, they can be divided into quarks, while electrons are part of a particle family called leptons. Both quarks and leptons are elementary particles, and are currently seen as being the fundamental constituents of matter.",19026
19027,"These quarks and leptons interact through four fundamental forces: gravity, electromagnetism, weak interactions, and strong interactions. The Standard Model of particle physics is currently the best explanation for all of physics, but despite decades of efforts, gravity cannot yet be accounted for at the quantum level; it is only described by classical physics (see quantum gravity and graviton). Interactions between quarks and leptons are the result of an exchange of force-carrying particles (such as photons) between quarks and leptons. The force-carrying particles are not themselves building blocks. As one consequence, mass and energy (which cannot be created or destroyed) cannot always be related to matter (which can be created out of non-matter particles such as photons, or even out of pure energy, such as kinetic energy). Force carriers are usually not considered matter: the carriers of the electric force (photons) possess energy (see Planck relation) and the carriers of the weak force (W and Z bosons) are massive, but neither are considered matter either. However, while these particles are not considered matter, they do contribute to the total mass of atoms, subatomic particles, and all systems that contain them.",19027


In [6]:
%%time

'''
Brief overview of the algorithm to be followed :- 
(1) Creation of the document vectorizer. 
(2) Use of the above document vectorizer to encode the documents and the questions into vectors. 
(3) Search for a question comparing with the document vectors. 
(4) Return the 'k' most similar document vectors to a question vector.

For vectorization, TF-IDF can be used.
Wikipedia says this about TF-IDF -> It is a numerical statistic that is intended to reflect how important 
a word is to a document in a collection or corpus and is often used as a weighting factor in searches of 
information retrieval, text mining, and user modeling.  
'''

# Defining the TF-IDF vectorizer and the number of documents to retrieve 
retriever_configs = {'n_neighbors' : 1, 'metric' : 'cosine'}
tfidf_configs = {'lowercase' : True, 'analyzer' : 'word', 'stop_words' : 'english', 'binary' : True, 
                 'max_features' : 300}

# Fixing the pipeline
embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)

# Training the model to retrieve the document id 'context_id'
X_train = embedding.fit_transform(train_unique_documents['context'])
retriever.fit(X_train, train_unique_documents['context_id'])

CPU times: user 2.12 s, sys: 22.5 ms, total: 2.14 s
Wall time: 2.15 s


NearestNeighbors(metric='cosine', n_neighbors=1)

In [7]:
%%time

# Vectorizing a sample question

def transform_text(vectorizer, text):
    print ('Text :', text)
    vector = vectorizer.transform([text])
    vector = vectorizer.inverse_transform(vector)
    print ('Vector :', vector)
    

question = 'What are the tourist hostspots in Spain?'
transform_text(embedding, question)

Text : What are the tourist hostspots in Spain?
Vector : [array([], dtype='<U13')]
CPU times: user 2.58 ms, sys: 0 ns, total: 2.58 ms
Wall time: 2.48 ms


In [8]:
%%time

# Retrieving the most similar document to the above question and vectorizing the retrieved document

X_question = embedding.transform([question])
context_id = retriever.kneighbors(X_question, return_distance=False)[0][0]
selected_document = train_unique_documents.iloc[context_id]['context']

transform_text(embedding, selected_document)

Text : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Vector : [array(['world', 'various', 'time', 'september', 'number', 'late',
       'groups', 'group', 'established', 'best', 'american', '100'],
      dtype='<U13')]
CPU times: user 14.5 ms, sys: 37 µs, total: 14.6 ms
Wall time: 12.6 ms


In [9]:
%%time

# Predicting the top document for each question

X_train = embedding.transform(train_data['question'])
y_test = train_data['context_id']
y_pred = retriever.kneighbors(X_train, return_distance=False)

print (y_pred)

[[4758]
 [3469]
 [ 867]
 ...
 [   0]
 [5699]
 [5239]]
CPU times: user 29.7 s, sys: 25 s, total: 54.7 s
Wall time: 54.7 s


In [10]:
%%time

# Testing the model upon the training set
def compute_accuracy(y_test, y_pred):
    num_correct, num_total = 0, len(y_test)
    for i in range(num_total):
        y_pred_list = y_pred[i]
        if y_test[i] in y_pred_list:
            num_correct += 1
    return num_correct / num_total


acc = compute_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')

Accuracy: 0.0123
CPU times: user 1.05 s, sys: 698 µs, total: 1.05 s
Wall time: 1.05 s


# **Diving into the DEV dataset....**

In [11]:
%%time

# Loading the validation data

dev_file_path = 'squad/train-v2.0.json'
dev_data = convert_from_json_to_dataframe(dev_file_path)
print (dev_data.shape)
dev_data

(130319, 5)
CPU times: user 8.59 s, sys: 169 ms, total: 8.76 s
Wall time: 8.76 s


Unnamed: 0,id,question,context,answers,context_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'in the late 1990s', 'answer_start': 269}]",0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was growing up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'singing and dancing', 'answer_start': 207}]",0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and become a solo singer?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': '2003', 'answer_start': 526}]",0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'Houston, Texas', 'answer_start': 166}]",0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'late 1990s', 'answer_start': 276}]",0
...,...,...,...,...,...
130314,5a7e070b70df9f001a875439,Physics has broadly agreed on the definition of what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130315,5a7e070b70df9f001a87543a,Who coined the term partonic matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130316,5a7e070b70df9f001a87543b,What is another name for anti-matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130317,5a7e070b70df9f001a87543c,Matter usually does not need to be used in conjunction with what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028


In [12]:
%%time

# Exploring the dev dataset a bit more!

print ('Number of unique contexts :', len(dev_data['context_id'].unique()))

Number of unique contexts : 19029
CPU times: user 3.35 ms, sys: 162 µs, total: 3.51 ms
Wall time: 3.58 ms


In [13]:
%%time

# Getting unique documents from the dev dataset

dev_unique_documents = dev_data[['context', 'context_id']].drop_duplicates().reset_index(drop=True)
dev_unique_documents

CPU times: user 191 ms, sys: 990 µs, total: 192 ms
Wall time: 191 ms


Unnamed: 0,context,context_id
0,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",0
1,"Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits ""Déjà Vu"", ""Irreplaceable"", and ""Beautiful Liar"". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for ""Single Ladies (Put a Ring on It)"". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.",1
2,"A self-described ""modern-day feminist"", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny's Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award's history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011. Time listed her among the 100 most influential people in the world in 2013 and 2014. Forbes magazine also listed her as the most powerful female musician of 2015.",2
3,"Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann ""Tina"" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé's name is a tribute to her mother's maiden name. Beyoncé's younger sister Solange is also a singer and a former member of Destiny's Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.",3
4,"Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's ""Imagine"" to beat 15/16-year-olds. In fall of 1990, Beyoncé enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyoncé was also a member of the choir at St. John's United Methodist Church as a soloist for two years.",4
...,...,...
19024,"Isaac Newton (1643–1727) inherited Descartes' mechanical conception of matter. In the third of his ""Rules of Reasoning in Philosophy"", Newton lists the universal qualities of matter as ""extension, hardness, impenetrability, mobility, and inertia"". Similarly in Optics he conjectures that God created matter as ""solid, massy, hard, impenetrable, movable particles"", which were ""...even so very hard as never to wear or break in pieces"". The ""primary"" properties of matter were amenable to mathematical description, unlike ""secondary"" qualities such as color or taste. Like Descartes, Newton rejected the essential nature of secondary qualities.",19024
19025,"There is an entire literature concerning the ""structure of matter"", ranging from the ""electrical structure"" in the early 20th century, to the more recent ""quark structure of matter"", introduced today with the remark: Understanding the quark structure of matter has been one of the most important advances in contemporary physics.[further explanation needed] In this connection, physicists speak of matter fields, and speak of particles as ""quantum excitations of a mode of the matter field"". And here is a quote from de Sabbata and Gasperini: ""With the word ""matter"" we denote, in this context, the sources of the interactions, that is spinor fields (like quarks and leptons), which are believed to be the fundamental components of matter, or scalar fields, like the Higgs particles, which are used to introduced mass in a gauge theory (and that, however, could be composed of more fundamental fermion fields).""[further explanation needed]",19025
19026,"In the late 19th century with the discovery of the electron, and in the early 20th century, with the discovery of the atomic nucleus, and the birth of particle physics, matter was seen as made up of electrons, protons and neutrons interacting to form atoms. Today, we know that even protons and neutrons are not indivisible, they can be divided into quarks, while electrons are part of a particle family called leptons. Both quarks and leptons are elementary particles, and are currently seen as being the fundamental constituents of matter.",19026
19027,"These quarks and leptons interact through four fundamental forces: gravity, electromagnetism, weak interactions, and strong interactions. The Standard Model of particle physics is currently the best explanation for all of physics, but despite decades of efforts, gravity cannot yet be accounted for at the quantum level; it is only described by classical physics (see quantum gravity and graviton). Interactions between quarks and leptons are the result of an exchange of force-carrying particles (such as photons) between quarks and leptons. The force-carrying particles are not themselves building blocks. As one consequence, mass and energy (which cannot be created or destroyed) cannot always be related to matter (which can be created out of non-matter particles such as photons, or even out of pure energy, such as kinetic energy). Force carriers are usually not considered matter: the carriers of the electric force (photons) possess energy (see Planck relation) and the carriers of the weak force (W and Z bosons) are massive, but neither are considered matter either. However, while these particles are not considered matter, they do contribute to the total mass of atoms, subatomic particles, and all systems that contain them.",19027


In [14]:
%%time

# Training the dev model to retrieve the document id 'context_id'
X_dev = embedding.fit_transform(dev_unique_documents['context'])
retriever.fit(X_dev, dev_unique_documents['context_id'])

CPU times: user 2.12 s, sys: 7.41 ms, total: 2.13 s
Wall time: 2.13 s


NearestNeighbors(metric='cosine', n_neighbors=1)

In [15]:
%%time

# Visualizing how a sample question is transformed into a vector by the above defined model

question = 'What are some of the tourist hotspots in Spain?'
transform_text(embedding, question)

Text : What are some of the tourist hotspots in Spain?
Vector : [array([], dtype='<U13')]
CPU times: user 2.32 ms, sys: 140 µs, total: 2.46 ms
Wall time: 2.33 ms


In [16]:
%%time

# Retrieving the most similar document to the above question and vectorizing the retrieved document

X_question = embedding.transform([question])
context_id = retriever.kneighbors(X_question, return_distance=False)[0][0]
selected_document = dev_unique_documents.iloc[context_id]['context']

transform_text(embedding, selected_document)

Text : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Vector : [array(['world', 'various', 'time', 'september', 'number', 'late',
       'groups', 'group', 'established', 'best', 'american', '100'],
      dtype='<U13')]
CPU times: user 12.3 ms, sys: 2.13 ms, total: 14.4 ms
Wall time: 12.8 ms


In [17]:
%%time

# Predicting the top document for each question from the dev data

X_dev = embedding.transform(dev_data['question'])
y_test = dev_data['context_id']
y_pred = retriever.kneighbors(X_dev, return_distance=False)

print (y_pred)

[[4758]
 [3469]
 [ 867]
 ...
 [   0]
 [5699]
 [5239]]
CPU times: user 29.8 s, sys: 24.4 s, total: 54.2 s
Wall time: 54.2 s


In [18]:
%%time

# Testing the model upon the dev set

acc = compute_accuracy(y_test, y_pred)
print('Accuracy:', f'{acc:.4f}')

Accuracy: 0.0123
CPU times: user 1.03 s, sys: 1.55 ms, total: 1.03 s
Wall time: 1.03 s
