In [1]:
%%capture

# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install gensim==4.0.1

In [2]:
%%time

# Importing necessary libraries

import gensim.downloader
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
import json
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

CPU times: user 646 ms, sys: 141 ms, total: 787 ms
Wall time: 1.16 s


In [3]:
%%capture

# Utility cell: download the SQuAD 2.0 train/dev JSON files into ./squad
# (relative to the notebook's working directory).

# -p keeps the cell re-runnable: no error if the directory already exists.
!mkdir -p squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [4]:
%%time


def convert_from_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    '''
    Read a SQuAD-style JSON file and flatten it into one row per question.

    Parameters
    ----------
    file_path : path of the JSON file to read.
    record_path : nesting path down to the answers level. Only the two
        prefixes are used: record_path[:-1] locates the question records and
        record_path[:-2] locates the paragraph records (which carry
        'context' and the list of questions under 'qas').

    Returns
    -------
    DataFrame with columns ['id', 'question', 'context', 'answers',
    'context_id'], where 'context_id' is an integer code shared by all
    questions that have the same context text (numbered in order of first
    appearance).
    '''
    path = list(record_path)

    # Context manager guarantees the handle is closed; the encoding is pinned
    # so the result does not depend on the platform's default locale.
    with open(file_path, encoding='utf-8') as fp:
        file = json.load(fp)

    m = pd.json_normalize(file, path[:-1])   # one row per question
    r = pd.json_normalize(file, path[:-2])   # one row per paragraph

    # Repeat each paragraph's context once per question it owns, so the
    # repeated array lines up row-for-row with the question frame.
    m['context'] = np.repeat(r['context'].values, r.qas.str.len())

    # .copy() so the column added below is written to an independent frame.
    data = m[['id', 'question', 'context', 'answers']].copy()
    data['context_id'] = data['context'].factorize()[0]
    return data
    

# Flatten the SQuAD 2.0 train split into a one-row-per-question frame.
train_file_path = 'squad/train-v2.0.json'
train_data = convert_from_json_to_dataframe(file_path=train_file_path)
train_data

CPU times: user 8.36 s, sys: 304 ms, total: 8.66 s
Wall time: 8.68 s


Unnamed: 0,id,question,context,answers,context_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'in the late 1990s', 'answer_start': 269}]",0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was growing up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'singing and dancing', 'answer_start': 207}]",0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and become a solo singer?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': '2003', 'answer_start': 526}]",0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'Houston, Texas', 'answer_start': 166}]",0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[{'text': 'late 1990s', 'answer_start': 276}]",0
...,...,...,...,...,...
130314,5a7e070b70df9f001a875439,Physics has broadly agreed on the definition of what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130315,5a7e070b70df9f001a87543a,Who coined the term partonic matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130316,5a7e070b70df9f001a87543b,What is another name for anti-matter?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028
130317,5a7e070b70df9f001a87543c,Matter usually does not need to be used in conjunction with what?,"The term ""matter"" is used throughout physics in a bewildering variety of contexts: for example, one refers to ""condensed matter physics"", ""elementary matter"", ""partonic"" matter, ""dark"" matter, ""anti""-matter, ""strange"" matter, and ""nuclear"" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term ""matter"" usually is used in conjunction with a specifying modifier.",[],19028


In [5]:
%%time

# How many distinct context paragraphs does the train split contain?

print('Number of unique contexts in the train dataset :', train_data['context_id'].nunique())

Number of unique contexts in the train dataset : 19029
CPU times: user 3.62 ms, sys: 1.06 ms, total: 4.68 ms
Wall time: 5.23 ms


In [6]:
%%time

# One row per distinct (context, context_id) pair, re-indexed from 0

train_unique_documents = train_data[['context', 'context_id']].drop_duplicates(ignore_index=True)
train_unique_documents

CPU times: user 197 ms, sys: 834 µs, total: 197 ms
Wall time: 198 ms


Unnamed: 0,context,context_id
0,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",0
1,"Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits ""Déjà Vu"", ""Irreplaceable"", and ""Beautiful Liar"". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for ""Single Ladies (Put a Ring on It)"". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.",1
2,"A self-described ""modern-day feminist"", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny's Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award's history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011. Time listed her among the 100 most influential people in the world in 2013 and 2014. Forbes magazine also listed her as the most powerful female musician of 2015.",2
3,"Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann ""Tina"" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé's name is a tribute to her mother's maiden name. Beyoncé's younger sister Solange is also a singer and a former member of Destiny's Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.",3
4,"Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's ""Imagine"" to beat 15/16-year-olds. In fall of 1990, Beyoncé enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyoncé was also a member of the choir at St. John's United Methodist Church as a soloist for two years.",4
...,...,...
19024,"Isaac Newton (1643–1727) inherited Descartes' mechanical conception of matter. In the third of his ""Rules of Reasoning in Philosophy"", Newton lists the universal qualities of matter as ""extension, hardness, impenetrability, mobility, and inertia"". Similarly in Optics he conjectures that God created matter as ""solid, massy, hard, impenetrable, movable particles"", which were ""...even so very hard as never to wear or break in pieces"". The ""primary"" properties of matter were amenable to mathematical description, unlike ""secondary"" qualities such as color or taste. Like Descartes, Newton rejected the essential nature of secondary qualities.",19024
19025,"There is an entire literature concerning the ""structure of matter"", ranging from the ""electrical structure"" in the early 20th century, to the more recent ""quark structure of matter"", introduced today with the remark: Understanding the quark structure of matter has been one of the most important advances in contemporary physics.[further explanation needed] In this connection, physicists speak of matter fields, and speak of particles as ""quantum excitations of a mode of the matter field"". And here is a quote from de Sabbata and Gasperini: ""With the word ""matter"" we denote, in this context, the sources of the interactions, that is spinor fields (like quarks and leptons), which are believed to be the fundamental components of matter, or scalar fields, like the Higgs particles, which are used to introduced mass in a gauge theory (and that, however, could be composed of more fundamental fermion fields).""[further explanation needed]",19025
19026,"In the late 19th century with the discovery of the electron, and in the early 20th century, with the discovery of the atomic nucleus, and the birth of particle physics, matter was seen as made up of electrons, protons and neutrons interacting to form atoms. Today, we know that even protons and neutrons are not indivisible, they can be divided into quarks, while electrons are part of a particle family called leptons. Both quarks and leptons are elementary particles, and are currently seen as being the fundamental constituents of matter.",19026
19027,"These quarks and leptons interact through four fundamental forces: gravity, electromagnetism, weak interactions, and strong interactions. The Standard Model of particle physics is currently the best explanation for all of physics, but despite decades of efforts, gravity cannot yet be accounted for at the quantum level; it is only described by classical physics (see quantum gravity and graviton). Interactions between quarks and leptons are the result of an exchange of force-carrying particles (such as photons) between quarks and leptons. The force-carrying particles are not themselves building blocks. As one consequence, mass and energy (which cannot be created or destroyed) cannot always be related to matter (which can be created out of non-matter particles such as photons, or even out of pure energy, such as kinetic energy). Force carriers are usually not considered matter: the carriers of the electric force (photons) possess energy (see Planck relation) and the carriers of the weak force (W and Z bosons) are massive, but neither are considered matter either. However, while these particles are not considered matter, they do contribute to the total mass of atoms, subatomic particles, and all systems that contain them.",19027


In [7]:
%%time

# Token corpus: one preprocess_string() token list per unique context

train_corpus = [preprocess_string(document) for document in train_unique_documents['context']]

CPU times: user 15 s, sys: 38 ms, total: 15 s
Wall time: 15 s


In [8]:
%%time

# Train Word2Vec on the tokenised contexts; only the word vectors (.wv) are kept

vectorizer = Word2Vec(
    sentences=train_corpus,
    vector_size=300,
    window=5,
    min_count=10,
    workers=4,
    epochs=20,
).wv

CPU times: user 1min 47s, sys: 390 ms, total: 1min 47s
Wall time: 30 s


In [9]:
%%time

# Sanity check: the 5 words most similar to a sample word

vectorizer.most_similar(positive='tourist', topn=5)

CPU times: user 16.7 ms, sys: 16 ms, total: 32.8 ms
Wall time: 18.2 ms


[('visitor', 0.732012152671814),
 ('destin', 0.7211476564407349),
 ('tourism', 0.6160069108009338),
 ('porto', 0.5113480687141418),
 ('windhoek', 0.5041019320487976)]

In [10]:
%%time

# Text Transformation 

def text_transformation(vectorizer, text, verbose=True):
    '''
    Embed a piece of text as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    vectorizer : word-vector lookup supporting `token in vectorizer` and
        `vectorizer[token]` (here, the trained Word2Vec `.wv` object).
    text : raw string; it is tokenised with `preprocess_string`.
    verbose : when True, also print the text and the tokens that were found
        in the vocabulary.

    Returns
    -------
    1-D array: the element-wise mean of the vectors of all in-vocabulary
    tokens, or a float32 zero vector when no token is in the vocabulary.
    The vector is returned whether or not `verbose` is set (previously
    `verbose=True` printed and returned None).
    '''
    tokens = preprocess_string(text)

    # Single vocabulary scan, reused for both the debug print and the lookup.
    known_tokens = [w for w in tokens if w in vectorizer]

    if verbose:
        print ('Text :', text)
        # Shows the tokens that contribute to the embedding.
        print ('Vector :', known_tokens)

    if not known_tokens:
        # Size the fallback from the model so it stays correct if the
        # embedding width changes; 300 is used only when the vectorizer
        # does not expose `vector_size`.
        return np.zeros(getattr(vectorizer, 'vector_size', 300), dtype=np.float32)

    return np.mean([vectorizer[w] for w in known_tokens], axis=0)
    

question = 'What are the tourist spots in Spain?'
text_transformation(vectorizer, text=question, verbose=True)

Text : What are the tourist spots in Spain?
Vector : ['tourist', 'spot', 'spain']
CPU times: user 2.62 ms, sys: 2.97 ms, total: 5.59 ms
Wall time: 2.15 ms


In [11]:
%%time

# Fit the nearest-neighbour retriever on one embedding per unique context

retriever_configs = dict(n_neighbors=1, metric='cosine')
retriever = NearestNeighbors(**retriever_configs)
X_train = [
    text_transformation(vectorizer, context, False)
    for context in train_unique_documents['context']
]
retriever.fit(X_train, train_unique_documents['context_id'])

CPU times: user 22.5 s, sys: 28.9 ms, total: 22.6 s
Wall time: 22.6 s


NearestNeighbors(metric='cosine', n_neighbors=1)

In [12]:
%%time

# Evaluate the retriever on the training questions

# NOTE: X_train is rebound here -- it now holds the question embeddings
X_train = [
    text_transformation(vectorizer, asked, False)
    for asked in train_data['question']
]

# expected context id per question, and the retrieved neighbour indices
y_test = train_data['context_id']
y_pred = retriever.kneighbors(X_train, return_distance=False)

CPU times: user 1min 19s, sys: 15.1 s, total: 1min 34s
Wall time: 50.5 s


In [13]:
%%time

# Testing the model upon the training set
def compute_accuracy(y_test, y_pred):
    '''
    Fraction of samples whose true label appears among its predictions.

    Parameters
    ----------
    y_test : sequence (list, array or pandas Series) of true labels.
    y_pred : sequence where y_pred[i] is the collection of labels predicted
        for sample i.

    Returns
    -------
    float in [0, 1]; 0.0 when y_test is empty.
    '''
    # np.asarray drops any pandas index, so truth[i] is always the i-th
    # element by position (y_test[i] on a Series is a label lookup and
    # breaks on filtered or shuffled data).
    truth = np.asarray(y_test)
    num_total = len(truth)
    if num_total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    num_correct = sum(1 for i in range(num_total) if truth[i] in y_pred[i])
    return num_correct / num_total


acc = compute_accuracy(y_test=y_test, y_pred=y_pred)
print('Accuracy:', format(acc, '.4f'))

Accuracy: 0.2015
CPU times: user 1.02 s, sys: 3 ms, total: 1.02 s
Wall time: 1.02 s


In [14]:
%%time

# Flatten the SQuAD 2.0 dev split into a one-row-per-question frame.
validation_file_path = 'squad/dev-v2.0.json'
validation_data = convert_from_json_to_dataframe(file_path=validation_file_path)
validation_data

CPU times: user 938 ms, sys: 8.92 ms, total: 947 ms
Wall time: 945 ms


Unnamed: 0,id,question,context,answers,context_id
0,56ddde6b9a695914005b9628,In what country is Normandy located?,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.","[{'text': 'France', 'answer_start': 159}, {'text': 'France', 'answer_start': 159}, {'text': 'France', 'answer_start': 159}, {'text': 'France', 'answer_start': 159}]",0
1,56ddde6b9a695914005b9629,When were the Normans in Normandy?,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.","[{'text': '10th and 11th centuries', 'answer_start': 94}, {'text': 'in the 10th and 11th centuries', 'answer_start': 87}, {'text': '10th and 11th centuries', 'answer_start': 94}, {'text': '10th and 11th centuries', 'answer_start': 94}]",0
2,56ddde6b9a695914005b962a,From which countries did the Norse originate?,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.","[{'text': 'Denmark, Iceland and Norway', 'answer_start': 256}, {'text': 'Denmark, Iceland and Norway', 'answer_start': 256}, {'text': 'Denmark, Iceland and Norway', 'answer_start': 256}, {'text': 'Denmark, Iceland and Norway', 'answer_start': 256}]",0
3,56ddde6b9a695914005b962b,Who was the Norse leader?,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.","[{'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}, {'text': 'Rollo', 'answer_start': 308}]",0
4,56ddde6b9a695914005b962c,What century did the Normans first gain their separate identity?,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.","[{'text': '10th century', 'answer_start': 671}, {'text': 'the first half of the 10th century', 'answer_start': 649}, {'text': '10th', 'answer_start': 671}, {'text': '10th', 'answer_start': 671}]",0
...,...,...,...,...,...
11868,5737aafd1c456719005744ff,What is the seldom used force unit equal to one thousand newtons?,"The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.","[{'text': 'sthène', 'answer_start': 665}, {'text': 'sthène', 'answer_start': 665}, {'text': 'sthène', 'answer_start': 665}, {'text': 'sthène', 'answer_start': 665}, {'text': 'sthène', 'answer_start': 665}]",1203
11869,5ad28ad0d7d075001a4299cc,What does not have a metric counterpart?,"The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.",[],1203
11870,5ad28ad0d7d075001a4299cd,What is the force exerted by standard gravity on one ton of mass?,"The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.",[],1203
11871,5ad28ad0d7d075001a4299ce,What force leads to a commonly used unit of mass?,"The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.",[],1203


In [15]:
%%time

# How many distinct context paragraphs does the validation split contain?

print('Number of unique contexts in the validation dataset :', validation_data['context_id'].nunique())

Number of unique contexts in the validation dataset : 1204
CPU times: user 1.3 ms, sys: 0 ns, total: 1.3 ms
Wall time: 1.15 ms


In [16]:
%%time

# One row per distinct (context, context_id) pair, re-indexed from 0

validation_unique_documents = validation_data[['context', 'context_id']].drop_duplicates(ignore_index=True)
validation_unique_documents

CPU times: user 22.8 ms, sys: 61 µs, total: 22.9 ms
Wall time: 22.7 ms


Unnamed: 0,context,context_id
0,"The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (""Norman"" comes from ""Norseman"") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.",0
1,"The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands.",1
2,"The English name ""Normans"" comes from the French words Normans/Normanz, plural of Normant, modern French normand, which is itself borrowed from Old Low Franconian Nortmann ""Northman"" or directly from Old Norse Norðmaðr, Latinized variously as Nortmannus, Normannus, or Nordmannus (recorded in Medieval Latin, 9th century) to mean ""Norseman, Viking"".",2
3,"In the course of the 10th century, the initially destructive incursions of Norse war bands into the rivers of France evolved into more permanent encampments that included local women and personal property. The Duchy of Normandy, which began in 911 as a fiefdom, was established by the treaty of Saint-Clair-sur-Epte between King Charles III of West Francia and the famed Viking ruler Rollo, and was situated in the former Frankish kingdom of Neustria. The treaty offered Rollo and his men the French lands between the river Epte and the Atlantic coast in exchange for their protection against further Viking incursions. The area corresponded to the northern part of present-day Upper Normandy down to the river Seine, but the Duchy would eventually extend west beyond the Seine. The territory was roughly equivalent to the old province of Rouen, and reproduced the Roman administrative structure of Gallia Lugdunensis II (part of the former Gallia Lugdunensis).",3
4,"Before Rollo's arrival, its populations did not differ from Picardy or the Île-de-France, which were considered ""Frankish"". Earlier Viking settlers had begun arriving in the 880s, but were divided between colonies in the east (Roumois and Pays de Caux) around the low Seine valley and in the west in the Cotentin Peninsula, and were separated by traditional pagii, where the population remained about the same with almost no foreign settlers. Rollo's contingents who raided and ultimately settled Normandy and parts of the Atlantic coast included Danes, Norwegians, Norse–Gaels, Orkney Vikings, possibly Swedes, and Anglo-Danes from the English Danelaw under Norse control.",4
...,...,...
1199,"where is the mass of the object, is the velocity of the object and is the distance to the center of the circular path and is the unit vector pointing in the radial direction outwards from the center. This means that the unbalanced centripetal force felt by any object is always directed toward the center of the curving path. Such forces act perpendicular to the velocity vector associated with the motion of an object, and therefore do not change the speed of the object (magnitude of the velocity), but only the direction of the velocity vector. The unbalanced force that accelerates an object can be resolved into a component that is perpendicular to the path, and one that is tangential to the path. This yields both the tangential force, which accelerates the object by either slowing it down or speeding it up, and the radial (centripetal) force, which changes its direction.",1199
1200,"A conservative force that acts on a closed system has an associated mechanical work that allows energy to convert only between kinetic or potential forms. This means that for a closed system, the net mechanical energy is conserved whenever a conservative force acts on the system. The force, therefore, is related directly to the difference in potential energy between two different locations in space, and can be considered to be an artifact of the potential field in the same way that the direction and amount of a flow of water can be considered to be an artifact of the contour map of the elevation of an area.",1200
1201,"For certain physical scenarios, it is impossible to model forces as being due to gradient of potentials. This is often due to macrophysical considerations that yield forces as arising from a macroscopic statistical average of microstates. For example, friction is caused by the gradients of numerous electrostatic potentials between the atoms, but manifests as a force model that is independent of any macroscale position vector. Nonconservative forces other than friction include other contact forces, tension, compression, and drag. However, for any sufficiently detailed description, all these forces are the results of conservative ones since each of these macroscopic forces are the net results of the gradients of microscopic potentials.",1201
1202,"The connection between macroscopic nonconservative forces and microscopic conservative forces is described by detailed treatment with statistical mechanics. In macroscopic closed systems, nonconservative forces act to change the internal energies of the system, and are often associated with the transfer of heat. According to the Second law of thermodynamics, nonconservative forces necessarily result in energy transformations within closed systems from ordered to more random conditions as entropy increases.",1202


In [17]:
%%time

# Build the validation corpus: one list of preprocessed tokens per context string

validation_corpus = [
    preprocess_string(context_text)
    for context_text in validation_unique_documents['context'].tolist()
]

CPU times: user 1.14 s, sys: 7.11 ms, total: 1.14 s
Wall time: 1.14 s


In [18]:
%%time

# Train Word2Vec on the tokenised validation corpus and keep only its word vectors (.wv)

vectorizer = Word2Vec(
    sentences=validation_corpus,
    vector_size=300,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=10,     # tokens seen fewer than 10 times are ignored
    epochs=20,        # training passes over the corpus
    workers=4,        # worker threads used for training
).wv

CPU times: user 5.18 s, sys: 27.5 ms, total: 5.21 s
Wall time: 1.75 s


In [20]:
%%time

# Sanity-check the embeddings: the five tokens most similar to a sample word

vectorizer.most_similar(positive='system', topn=5)

CPU times: user 5.59 ms, sys: 2.11 ms, total: 7.7 ms
Wall time: 2.85 ms


[('complement', 0.9437873363494873),
 ('respond', 0.941889762878418),
 ('innat', 0.9406814575195312),
 ('virus', 0.9395800828933716),
 ('immunolog', 0.9383890628814697)]

In [21]:
%%time

# Text Transformation
# Run a sample question through the text_transformation helper using the Word2Vec vectors.
# The third positional argument is True; the cell output shows the helper printing the
# original text and the resulting token list, so it is presumably a verbose/debug flag
# (NOTE(review): confirm against the helper's definition).

question = 'What are the tourist spots in Spain?'    
text_transformation(vectorizer, question, True)

Text : What are the tourist spots in Spain?
Vector : ['spain']
CPU times: user 226 µs, sys: 16 µs, total: 242 µs
Wall time: 247 µs
