# Question-Answering model - Information Retrival

To run this notebook please install gensim>4.0, pandas, tqdm. Create a file storage system similar to the one in my github repo: https://github.com/kapilsahukp/Question-Answering-Model_v1 .

### Import Libraries

In [8]:
#!pip install tqdm
#!pip install scipy
# !pip install nltk
# !pip install sklearn
# !pip install gensim





Collecting gensim
  Downloading gensim-4.2.0-cp39-cp39-win_amd64.whl (23.9 MB)
     --------------------------------------- 23.9/23.9 MB 11.9 MB/s eta 0:00:00
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
     ------------------------------------- 983.8/983.8 KB 15.5 MB/s eta 0:00:00
Collecting smart-open>=1.8.1
  Downloading smart_open-6.0.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.4/58.4 KB 3.0 MB/s eta 0:00:00
Installing collected packages: smart-open, Cython, gensim
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.0.0


You should consider upgrading via the 'C:\Users\kapil\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools
# import swifter
from scipy.spatial.distance import cosine
from collections import Counter
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
import gensim.downloader
tqdm.pandas()

### Load Dataset

In [2]:
path_dir =  os.path.dirname(os.getcwd())
train_df = pd.read_csv(os.path.join(path_dir,r'data\\interim\\train_data.csv'))
val_df = pd.read_csv(os.path.join(path_dir,r'data\\interim\\val_data.csv'))
train_df.drop('Unnamed: 0',axis=1,inplace = True)
val_df.drop('Unnamed: 0',axis=1,inplace = True)
train_df.head(5)

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False


In [3]:
from IPython.display import HTML as html_print

def cstr(s, color='black'):
    return "<text style=color:{}>{}</text>".format(color, s)

for i in range(3):
    print(f'\033[94m C : {train_df["context"].tolist()[i]}')
    print(f'\033[91m Q : {train_df["question"].tolist()[i]}')
    print(f'\033[92m A : {train_df["answer"].tolist()[i]}')
    print('\033[90m ' + '-'*90)

[94m C : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
[91m Q : When did Beyonce start becoming popular?
[92m A : in the late 1990s
[90m ------------------------------------------------------------------------------------------
[94m C : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, son

### Get whole answer sentences

In [4]:
def get_answer_context(df):
    length_context = 0
    answer = ""

    for sentence in sent_tokenize(df.context):
        length_context += len(sentence) + 1
        if df.answer_start <= length_context:
            if len(sentence) >= len(str(df.answer)):
                if answer == "":
                    return sentence
                else:
                    return answer + " " + sentence
            else:
                answer += sentence

In [5]:
train_df['answer_sentences'] = train_df.progress_apply(lambda row: get_answer_context(row),axis = 1)
val_df['answer_sentences'] = val_df.progress_apply(lambda row: get_answer_context(row),axis = 1)

100%|██████████| 86820/86820 [00:17<00:00, 4876.40it/s]
100%|██████████| 20302/20302 [00:04<00:00, 4839.60it/s]


In [6]:
train_df.head()

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Their hiatus saw the release of Beyoncé's debu...
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,"Born and raised in Houston, Texas, she perform..."
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False,"Born and raised in Houston, Texas, she perform..."


### Preprocess context

In [7]:
context_df = pd.DataFrame(train_df['context'].unique().tolist(),columns=['context'])
context_df['processed'] = context_df['context'].progress_apply(lambda x: simple_preprocess(x))

question_df = pd.DataFrame(train_df['question'].unique().tolist(),columns=['question'])
question_df['processed'] = question_df['question'].progress_apply(lambda x: simple_preprocess(x))


100%|██████████| 18877/18877 [00:07<00:00, 2451.50it/s]
100%|██████████| 86768/86768 [00:03<00:00, 27102.74it/s]


### Training a word2vec model

In [8]:
train_sentences = context_df['processed'].tolist() + question_df['processed'].tolist()
train_words = list(itertools.chain(*train_sentences))
low_word_count = 1
word_count_dict = Counter(train_words)
low_freq_words = [k for k, v in word_count_dict.items() if v == low_word_count]
UNK = '<UNK>'
processed_train_sentences = [[word if word not in low_freq_words else UNK for word in sentence]
                            for sentence in tqdm(train_sentences)]


100%|██████████| 105645/105645 [1:22:46<00:00, 21.27it/s]  


In [9]:
from gensim.models.callbacks import CallbackAny2Vec

# init callback class
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [10]:
vector_size = 300
w2v_model = Word2Vec(min_count=20, 
                     window = 20,
                     vector_size = vector_size,
                     workers=10)

w2v_model.build_vocab(processed_train_sentences)
words = w2v_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# Train Word Embeddings
w2v_model.train(processed_train_sentences, 
                total_examples=w2v_model.corpus_count, 
                epochs=350, 
                report_delay=1,
                compute_loss = True, # set compute_loss = True
                callbacks=[callback()]) 
print(w2v_model.get_latest_training_loss())

Vocab size 10414
Loss after epoch 0: 533307.5
Loss after epoch 1: 417033.25
Loss after epoch 2: 418462.125
Loss after epoch 3: 393721.625
Loss after epoch 4: 403788.5
Loss after epoch 5: 342991.0
Loss after epoch 6: 372567.5
Loss after epoch 7: 329419.0
Loss after epoch 8: 327155.25
Loss after epoch 9: 354881.25
Loss after epoch 10: 355103.5
Loss after epoch 11: 322187.5
Loss after epoch 12: 334823.0
Loss after epoch 13: 332382.0
Loss after epoch 14: 309330.0
Loss after epoch 15: 300277.0
Loss after epoch 16: 285930.0
Loss after epoch 17: 304231.5
Loss after epoch 18: 303569.0
Loss after epoch 19: 293080.0
Loss after epoch 20: 304979.0
Loss after epoch 21: 315790.0
Loss after epoch 22: 304274.5
Loss after epoch 23: 270321.0
Loss after epoch 24: 320940.0
Loss after epoch 25: 297017.0
Loss after epoch 26: 276776.0
Loss after epoch 27: 279544.0
Loss after epoch 28: 269837.0
Loss after epoch 29: 301193.0
Loss after epoch 30: 279987.0
Loss after epoch 31: 289734.0
Loss after epoch 32: 28418

In [11]:
w2v_model.wv.most_similar(positive="time")

[('least', 0.4090263247489929),
 ('expense', 0.2949385643005371),
 ('age', 0.27914103865623474),
 ('level', 0.26621466875076294),
 ('night', 0.26492178440093994),
 ('point', 0.24419565498828888),
 ('end', 0.23826253414154053),
 ('distances', 0.22997166216373444),
 ('beginning', 0.21926996111869812),
 ('rate', 0.21834655106067657)]

In [20]:
'their' in w2v_model.wv.key_to_index

True

In [21]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec

In [378]:
# avg_sentence_vector(train_df['question'].tolist()[0].split(),w2v_model,100) 
# avg_sentence_vector(train_df['question'].tolist()[1].split(),w2v_model,100)

In [22]:
def get_cosine_similarity(context,question,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
#     print(context,question)
    context_sents = sent_tokenize(context)
#     print(context_sents)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
#     print(processed_context)
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
#     print(len(context_vectors[0]))
#     print(cosine_similarity(np.array(context_vectors[0]).reshape(1,-1),np.array(question_vector).reshape(1,-1)))
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
#     print(f"Cosine scores: {cosine_sim_list}")
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [23]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[1]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,w2v_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: What areas did Beyonce compete in when she was growing up?


(array([[0.22090484]], dtype=float32),
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.")

In [24]:
temp_df = train_df.head(5)

In [25]:
temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
temp_df

100%|██████████| 5/5 [00:00<00:00, 72.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.2374527]],Their hiatus saw the release of Beyoncé's debu...
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.22090484]],"Born and raised in Houston, Texas, she perform..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False,Their hiatus saw the release of Beyoncé's debu...,[[0.52102005]],"Born and raised in Houston, Texas, she perform..."
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False,"Born and raised in Houston, Texas, she perform...",[[0.16701093]],Their hiatus saw the release of Beyoncé's debu...
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False,"Born and raised in Houston, Texas, she perform...",[[0.26433152]],"Managed by her father, Mathew Knowles, the gro..."


### Evaluvate results

#### On Train Set

In [26]:
train_df[['consine_sim','predicted_answer']] = train_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
train_df.head(2)

100%|██████████| 86820/86820 [15:07<00:00, 95.66it/s] 


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.2374527]],Their hiatus saw the release of Beyoncé's debu...
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.22090484]],"Born and raised in Houston, Texas, she perform..."


In [27]:
train_df['correct_prediction'] = train_df['answer_sentences'] == train_df['predicted_answer']
train_df['correct_prediction'].value_counts()

True     61828
False    24992
Name: correct_prediction, dtype: int64

In [28]:
print(f"accuracy: {train_df[train_df['correct_prediction']].shape[0]/train_df.shape[0]}")

accuracy: 0.7121400598940336


#### On Validation set

In [29]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],w2v_model),axis=1,result_type="expand")
val_df.head(2)

100%|██████████| 20302/20302 [02:38<00:00, 128.10it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.3155636]],The Normans (Norman: Nourmands; French: Norman...
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.3155636]],The Normans (Norman: Nourmands; French: Norman...


In [30]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14717
False     5585
Name: correct_prediction, dtype: int64

In [31]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7249039503497192


### Download word2vec model google

In [32]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [411]:
google_model = gensim.downloader.load('word2vec-google-news-300')

In [416]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[0]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,google_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: When did Beyonce start becoming popular?
[['<UNK>', '<UNK>', '<UNK>', 'carter', '<UNK>', 'bee', 'yon', 'say', 'born', 'september', 'is', 'an', 'american', 'singer', 'songwriter', 'record', 'producer', '<UNK>', 'actress'], ['born', '<UNK>', 'raised', 'in', 'houston', 'texas', 'she', 'performed', 'in'

(array([[0.6053659]], dtype=float32),
 "Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time.")

In [425]:
google_model['world'].shape

(300,)

### Evaluvate results

In [447]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec
def get_context_vector(context,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    
    return context_vectors
    
def get_cosine_similarity(context,context_vectors,question,model,vector_size=300):
    context_sents = sent_tokenize(context)
    
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
        
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [449]:
temp_df['context_vec'] = temp_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,google_model))

temp_df[['consine_sim','predicted_answer']] = temp_df[['context','context_vec','question']]\
.swifter.progress_bar(enable=True, desc=None)\
.apply(lambda x: get_cosine_similarity(x[0],x[1],x[2],google_model,300),axis=1,result_type="expand")
temp_df.head(2)

Pandas Apply:   0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['context_vec'] = temp_df['context'].swifter\


Pandas Apply:   0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[['consine_sim','predicted_answer']] = temp_df[['context','context_vec','question']]\


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,context_vec
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",[[0.6053659]],"Managed by her father, Mathew Knowles, the gro...","[[[-0.011311122, -0.024881635, -0.053231377, 0..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",[[0.6998993]],"Managed by her father, Mathew Knowles, the gro...","[[[-0.011311122, -0.024881635, -0.053231377, 0..."


In [450]:
# temp_df[['consine_sim','predicted_answer']] = temp_df[['context','question']]\
# .swifter.progress_bar(enable=True, desc=None)\
# .apply(lambda x: get_cosine_similarity(x[0],x[1],google_model,300),axis=1,result_type="expand")
# temp_df.head(2)

#### On val set 

In [457]:
val_df['context_vec'] = val_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,google_model))

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

In [458]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','context_vec','question']]\
.swifter.progress_bar(enable=True, desc=None)\
.apply(lambda x: get_cosine_similarity(x[0],x[1],x[2],google_model,300),axis=1,result_type="expand")
val_df.head(2)

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction,context_vec
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.50003517]],The distinct cultural and ethnic identity of t...,True,"[[[0.064170435, 0.075368784, 0.09860872, 0.118..."
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,[[0.50003517]],The distinct cultural and ethnic identity of t...,True,"[[[0.064170435, 0.075368784, 0.09860872, 0.118..."


In [462]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14660
False     5642
Name: correct_prediction, dtype: int64

In [465]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7220963451876662


### Training a fasttext model

In [30]:
vector_size = 400
fast_text_model = FastText(min_count=1, 
                     window = 5,
                     vector_size = vector_size,
                     sg = 1,
                     hs = 1,
                     workers=10)
fast_text_model.build_vocab(train_sentences)
words = fast_text_model.wv.key_to_index.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)
# Train Word Embeddings
fast_text_model.train(train_sentences, 
                total_examples=fast_text_model.corpus_count, 
                epochs=500, 
                report_delay=1,
                compute_loss = True,) # set compute_loss = True
#                 callbacks=[callback()]) 
print(fast_text_model.get_latest_training_loss())

Vocab size 78414
0.0


In [31]:
fast_text_model.wv.most_similar(positive="time")

[('period', 0.46009957790374756),
 ('gamedaily', 0.3613353967666626),
 ('during', 0.34807538986206055),
 ('beatle', 0.3249432444572449),
 ('same', 0.3225545287132263),
 ('periods', 0.32006415724754333),
 ('johnathon', 0.319020539522171),
 ('span', 0.31888797879219055),
 ('ascession', 0.31213393807411194),
 ('accension', 0.30582237243652344)]

In [32]:
print(fast_text_model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

0.4006621


In [33]:
def get_cosine_similarity(context,question,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    
    processed_context = [simple_preprocess(sent) for sent in context_sents]
#     processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
#                          for processed_context_sent in processed_context]
    processed_question = simple_preprocess(question)
#     print(processed_context)
#     processed_question = [word if word in vocab else UNK for word in processed_question]
    
#     context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
#     question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
#     print(len(context_vectors[0]))
#     print(cosine_similarity(np.array(context_vectors[0]).reshape(1,-1),np.array(question_vector).reshape(1,-1)))
    
#     cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
        #     print(f"Cosine scores: {cosine_sim_list}")
    cosine_sim_list = [model.wv.n_similarity(context_sent,processed_question) for context_sent in processed_context if len(context_sent) > 0]
#     print(cosine_sim_list)
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [34]:
sample_context = train_df['context'].tolist()[0]
sample_question = train_df['question'].tolist()[1]
print(f"C:{sample_context}")
print(f"Q: {sample_question}")
get_cosine_similarity(sample_context,sample_question,fast_text_model)

C:Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Q: What areas did Beyonce compete in when she was growing up?


(0.5806109,
 "Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.")

In [35]:
train_df[['consine_sim','predicted_answer']] = train_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],fast_text_model,100),axis=1,result_type="expand")
train_df.head(2)

100%|███████████████████████████| 86820/86820 [01:01<00:00, 1422.03it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False,"Born and raised in Houston, Texas, she perform...",0.508958,"Born and raised in Houston, Texas, she perform...",True
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False,"Born and raised in Houston, Texas, she perform...",0.580611,"Born and raised in Houston, Texas, she perform...",True


In [36]:
train_df['correct_prediction'] = train_df['answer_sentences'] == train_df['predicted_answer']
train_df['correct_prediction'].value_counts()

True     61594
False    25226
Name: correct_prediction, dtype: int64

In [37]:
print(f"accuracy: {train_df[train_df['correct_prediction']].shape[0]/train_df.shape[0]}")

accuracy: 0.7094448283805574


In [38]:
val_df[['consine_sim','predicted_answer']] = val_df[['context','question']]\
.progress_apply(lambda x: get_cosine_similarity(x[0],x[1],fast_text_model),axis=1,result_type="expand")
val_df.head(2)

100%|████████████████████████████| 20302/20302 [00:22<00:00, 919.60it/s]


Unnamed: 0,id,title,context,question,answer,answer_start,is_impossible,answer_sentences,consine_sim,predicted_answer,correct_prediction
0,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,0.621735,The Normans (Norman: Nourmands; French: Norman...,True
1,56ddde6b9a695914005b9628,Normans,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,159,False,The Normans (Norman: Nourmands; French: Norman...,0.621735,The Normans (Norman: Nourmands; French: Norman...,True


In [39]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14381
False     5921
Name: correct_prediction, dtype: int64

In [40]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7083538567628805


#### Download and use GloVe

In [42]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [43]:
glove_model = gensim.downloader.load('glove-wiki-gigaword-300')



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [44]:
def avg_sentence_vector(words, model, num_features):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        word_vec_model = model.wv
    else:
        word_vec_model = model
    index2word_set = word_vec_model.index_to_key 
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_vec_model[word])

    if nwords>0:
        featureVec = np.divide(featureVec, nwords)
#     print(featureVec)
    return featureVec
def get_context_vector(context,model,vector_size=300):
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
    context_sents = sent_tokenize(context)
    processed_context = [simple_preprocess(sent) for sent in context_sents]
    processed_context = [[word if word in vocab else UNK for word in processed_context_sent]\
                         for processed_context_sent in processed_context]
    context_vectors = [np.array(avg_sentence_vector(processed_context_sent,model,vector_size)).reshape(1,-1) for processed_context_sent in processed_context]
    
    return context_vectors
    
def get_cosine_similarity(context,context_vectors,question,model,vector_size=300):
    context_sents = sent_tokenize(context)
    
    if isinstance(model,gensim.models.word2vec.Word2Vec):
        vocab = model.wv.key_to_index
    else:
        vocab = model.key_to_index
        
    processed_question = simple_preprocess(question)
    processed_question = [word if word in vocab else UNK for word in processed_question]
    
    question_vector  = np.array(avg_sentence_vector(processed_question,model,vector_size)).reshape(1,-1)
    
    cosine_sim_list = [cosine_similarity(context_sent_vector,question_vector) for context_sent_vector in context_vectors]
    
    max_cosine_sim = max(cosine_sim_list)
    predicted_answer = context_sents[np.argmax(cosine_sim_list)]
    return max_cosine_sim, predicted_answer

In [45]:
val_df['context_vec'] = val_df['context'].swifter\
.progress_bar(enable=True, desc=None).apply(lambda x: get_context_vector(x,glove_model))

Pandas Apply:   0%|          | 0/20302 [00:00<?, ?it/s]

In [46]:
val_df['correct_prediction'] = val_df['answer_sentences'] == val_df['predicted_answer']
val_df['correct_prediction'].value_counts()

True     14381
False     5921
Name: correct_prediction, dtype: int64

In [47]:
print(f"accuracy: {val_df[val_df['correct_prediction']].shape[0]/val_df.shape[0]}")

accuracy: 0.7083538567628805
