# About

This notebook reads in the Train Test sets prepared by data_pipeline.ipynb and conducts word2vec modelling

reference: https://www.analyticsvidhya.com/blog/2020/08/information-retrieval-using-word2vec-based-vector-space-model/

# Load libraries

In [5]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Load Train Test queries and passages

In [6]:
# query
query_train = pd.read_csv("./output/query_train_set_with_passage_info.csv")
query_test =pd.read_csv("./output/query_test_set_with_passage_info.csv")

# passage
passage_train = pd.read_csv("./output/train_passage_id_content_cleaned.csv")
passage_test = pd.read_csv("./output/test_passage_id_content_cleaned.csv")

In [7]:
print(query_train.shape)
print(query_test.shape)
print(passage_train.shape)
print(passage_test.shape)

(20000, 9)
(20000, 9)
(19898, 4)
(19893, 4)


In [8]:
display(query_train.head())
display(query_test.head())
display(passage_train.head())
display(passage_test.head())

Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned
0,560129,what are hues,Q0,msmarco_passage_03_496902198,1,8.4601,Anserini,1,what are hues
1,560129,what are hues,Q0,msmarco_passage_35_561149885,2,8.460099,Anserini,1,what are hues
2,560129,what are hues,Q0,msmarco_passage_05_224676265,3,8.1767,Anserini,1,what are hues
3,560129,what are hues,Q0,msmarco_passage_04_168335684,4,8.1125,Anserini,1,what are hues
4,560129,what are hues,Q0,msmarco_passage_02_769341954,5,8.0855,Anserini,1,what are hues


Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned
0,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_05_840839268,1,16.004101,Anserini,1,what us state bears the slogan the land enchan...
1,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_06_203354916,2,15.7155,Anserini,1,what us state bears the slogan the land enchan...
2,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_45_489369159,3,15.715499,Anserini,1,what us state bears the slogan the land enchan...
3,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_50_676325639,4,14.9837,Anserini,1,what us state bears the slogan the land enchan...
4,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_21_464076261,5,14.3472,Anserini,1,what us state bears the slogan the land enchan...


Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized
0,msmarco_passage_03_496902198,Let’s dig a little deeper into each. Hues are ...,let s dig a little deeper into each hues are c...,let s dig little deeply hue color hue dependen...
1,msmarco_passage_35_561149885,Let’s dig a little deeper into each. Hues are ...,let s dig a little deeper into each hues are c...,let s dig little deeply hue color hue dependen...
2,msmarco_passage_05_224676265,Hue: This is what we usually mean when we ask ...,hue this is what we usually mean when we ask w...,hue usually mean ask color property color actu...
3,msmarco_passage_04_168335684,hue = color or a shade of color\nexample sente...,hue color or a shade of color example sentence...,hue color shade color example sentence baby sk...
4,msmarco_passage_02_769341954,Hue: Hue is what we normally think of as color...,hue hue is what we normally think of as color ...,hue hue normally think color technically hue d...


Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized
0,msmarco_passage_05_840839268,New Mexico State Symbols. State Nickname: The ...,new mexico state symbols state nickname the la...,new mexico state symbols state nickname land e...
1,msmarco_passage_06_203354916,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...
2,msmarco_passage_45_489369159,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...
3,msmarco_passage_50_676325639,New Mexico State Slogans. Whereas the New Mexi...,new mexico state slogans whereas the new mexic...,new mexico state slogan new mexico state motto...
4,msmarco_passage_21_464076261,"1 to approximately 99-000. First use of the ""L...",to approximately first use of the land of enc...,approximately use land enchantment slogan em...


# Train and Test set - combine queries with passage

In [9]:
query_train["query_cleaned"].head()

0    what are hues
1    what are hues
2    what are hues
3    what are hues
4    what are hues
Name: query_cleaned, dtype: object

In [10]:
passage_train["passage_cleaned_lemmatized"].head()

0    let s dig little deeply hue color hue dependen...
1    let s dig little deeply hue color hue dependen...
2    hue usually mean ask color property color actu...
3    hue color shade color example sentence baby sk...
4    hue hue normally think color technically hue d...
Name: passage_cleaned_lemmatized, dtype: object

In [11]:
passage_train.rename(columns={"passage_cleaned_lemmatized":"text"}).head()

Unnamed: 0,passage_id,passage,passage_cleaned,text
0,msmarco_passage_03_496902198,Let’s dig a little deeper into each. Hues are ...,let s dig a little deeper into each hues are c...,let s dig little deeply hue color hue dependen...
1,msmarco_passage_35_561149885,Let’s dig a little deeper into each. Hues are ...,let s dig a little deeper into each hues are c...,let s dig little deeply hue color hue dependen...
2,msmarco_passage_05_224676265,Hue: This is what we usually mean when we ask ...,hue this is what we usually mean when we ask w...,hue usually mean ask color property color actu...
3,msmarco_passage_04_168335684,hue = color or a shade of color\nexample sente...,hue color or a shade of color example sentence...,hue color shade color example sentence baby sk...
4,msmarco_passage_02_769341954,Hue: Hue is what we normally think of as color...,hue hue is what we normally think of as color ...,hue hue normally think color technically hue d...


## Remove passage that get null after lemmatization

In [12]:
passage_train[passage_train["passage_cleaned_lemmatized"].isnull()]

Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized
10153,msmarco_passage_25_423645543,what's your take on that. what's yours. What's...,what is your take on that what is yours what i...,


In [13]:
passage_test[passage_test["passage_cleaned_lemmatized"].isnull()]

Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized
10786,msmarco_passage_25_423645543,what's your take on that. what's yours. What's...,what is your take on that what is yours what i...,


In [14]:
passage_train = passage_train[passage_train["passage_id"]!="msmarco_passage_25_423645543"]

In [15]:
passage_test = passage_test[passage_test["passage_id"]!="msmarco_passage_25_423645543"]

In [16]:
#query_train = query_train[query_train["passage_id"]!="msmarco_passage_25_423645543"]

In [17]:
query_train[query_train["query_cleaned"].isnull()]

Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned


In [18]:
# Combining corpus and queries for training
combined_training = (pd.concat([passage_train.rename(columns={"passage_cleaned_lemmatized":"text"})["text"],
                             query_train.rename(columns={"query_cleaned":'text'})['text']])
                             .sample(frac=1) #shuffle
                             .reset_index(drop=True))
print(passage_train.shape)
print(query_train.shape)
print(combined_training.shape)
display(combined_training.head())


(19897, 4)
(20000, 9)
(39897,)


0    think bicycle chain chain protein link chain a...
1                               what is a growing zone
2                            what shade is balboa mist
3       what does a development strategic plan include
4    patrick henry famous speech good essay word pa...
Name: text, dtype: object

In [19]:
# Creating data for the model training
train_data=[]
for i in combined_training:
    train_data.append(i.split())

# Training a word2vec model from the given data set
w2v_model = Word2Vec(train_data
                     , vector_size = 300
                     , min_count = 2
                     , window = 5
                     , sg = 1
                     , workers = 4)

In [20]:
# Vocabulary size
print('Vocabulary size:', len(w2v_model.wv))

Vocabulary size: 19605


In [21]:
# Function returning vector reperesentation of a document
def get_embedding_w2v(doc_tokens):
    embeddings = []
    if len(doc_tokens)<1:
        return np.zeros(300)
    else:
        for tok in doc_tokens:
            if tok in w2v_model.wv:
                embeddings.append(w2v_model.wv.get_vector(tok))
            else:
                embeddings.append(np.random.rand(300))
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0)



In [22]:
"hello" in w2v_model.wv

True

In [23]:
len(w2v_model.wv.get_vector("hello"))

300

In [24]:
# Getting Word2Vec Vectors for Testing Corpus and Queries
passage_test["vector"] = passage_test["passage_cleaned_lemmatized"].apply(lambda x :get_embedding_w2v(x.split()))
query_test["vector"] = query_test["query_cleaned"].apply(lambda x :get_embedding_w2v(x.split()))

In [25]:
passage_test.head()

Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized,vector
0,msmarco_passage_05_840839268,New Mexico State Symbols. State Nickname: The ...,new mexico state symbols state nickname the la...,new mexico state symbols state nickname land e...,"[0.11938315887280374, 0.07699647027280246, 0.1..."
1,msmarco_passage_06_203354916,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...,"[0.03001411608592936, 0.10840147123437748, 0.0..."
2,msmarco_passage_45_489369159,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...,"[0.010797798526635205, 0.12425462115874586, 0...."
3,msmarco_passage_50_676325639,New Mexico State Slogans. Whereas the New Mexi...,new mexico state slogans whereas the new mexic...,new mexico state slogan new mexico state motto...,"[0.050797651260532414, 0.14191664327714773, 0...."
4,msmarco_passage_21_464076261,"1 to approximately 99-000. First use of the ""L...",to approximately first use of the land of enc...,approximately use land enchantment slogan em...,"[0.0844783973082838, 0.12607239512611462, 0.20..."


In [26]:
len(passage_test["vector"][0])

300

In [27]:
query_test.head()

Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned,vector
0,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_05_840839268,1,16.004101,Anserini,1,what us state bears the slogan the land enchan...,"[0.19544760670821437, 0.05241127713214914, 0.1..."
1,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_06_203354916,2,15.7155,Anserini,1,what us state bears the slogan the land enchan...,"[0.19473821635276328, 0.08082901877568266, 0.2..."
2,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_45_489369159,3,15.715499,Anserini,1,what us state bears the slogan the land enchan...,"[0.22112144328101987, 0.05655473228224195, 0.1..."
3,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_50_676325639,4,14.9837,Anserini,1,what us state bears the slogan the land enchan...,"[0.23997162888790827, 0.06643368186228349, 0.2..."
4,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_21_464076261,5,14.3472,Anserini,1,what us state bears the slogan the land enchan...,"[0.17617256554579738, 0.0903300079082119, 0.14..."


In [28]:
passage_test.head()

Unnamed: 0,passage_id,passage,passage_cleaned,passage_cleaned_lemmatized,vector
0,msmarco_passage_05_840839268,New Mexico State Symbols. State Nickname: The ...,new mexico state symbols state nickname the la...,new mexico state symbols state nickname land e...,"[0.11938315887280374, 0.07699647027280246, 0.1..."
1,msmarco_passage_06_203354916,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...,"[0.03001411608592936, 0.10840147123437748, 0.0..."
2,msmarco_passage_45_489369159,Land of Enchantment. Before Land of Enchantmen...,land of enchantment before land of enchantment...,land enchantment land enchantment state slogan...,"[0.010797798526635205, 0.12425462115874586, 0...."
3,msmarco_passage_50_676325639,New Mexico State Slogans. Whereas the New Mexi...,new mexico state slogans whereas the new mexic...,new mexico state slogan new mexico state motto...,"[0.050797651260532414, 0.14191664327714773, 0...."
4,msmarco_passage_21_464076261,"1 to approximately 99-000. First use of the ""L...",to approximately first use of the land of enc...,approximately use land enchantment slogan em...,"[0.0844783973082838, 0.12607239512611462, 0.20..."


## Test the code - add display, only use 1 query to test

In [29]:
# Function for calculating average precision for a query
def average_precision(qid, qvector):
    
    # ========= for each query_id, do the following:
    print(f"Started to work on query_id {qid}")
    # get passage_id (each query_id would have upto 10 passage_id)
    qresult = query_test.loc[query_test["query_id"] == qid, ["passage_id", "rel"]]
    # get passage vector for each passage_id
    qcorpus = passage_test.loc[passage_test["passage_id"].isin(qresult["passage_id"]),["passage_id","vector"]]
    
    # inner join passage_id with passage vector
    # this is a result table of <= 10 rows, 2 columns: passage_id and vector
    qresult = pd.merge(qresult, qcorpus, on = "passage_id", how = "inner")

    # create a new column "similarity", 
    # by comparing similarity between the query vector (input qvector) with passage vector (the "vector" column)
    qresult["similarity"] = qresult["vector"].apply(lambda x: cosine_similarity(
                          np.array(qvector).reshape(1, -1)
                          , np.array(x).reshape(1, -1)).item())
    
    # sort rows so that passages with the highest similarity ranks as top 
    qresult.sort_values(by="similarity", ascending = False, inplace=True)

    # # Taking Top 10 documents for the evaluation
    ranking = qresult.head(10)['rel'].values
    display(qresult.head(15))
    
    # # Calculating precision
    precision=[]
    for i in range(1,11):
      if ranking[i-1]:
        precision.append(np.sum(ranking[:i])/i)
    
    # If no relevant document in list then return 0
    if precision==[]:
      return 0
    print(len(precision))
    print(precision)
    avg_precision = np.mean(precision)
    print("Avg precision: {}".format(avg_precision))
    return avg_precision, len(precision), qresult.shape[0]

In [30]:
query_test.iloc[[0]]

Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned,vector
0,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_05_840839268,1,16.004101,Anserini,1,what us state bears the slogan the land enchan...,"[0.19544760670821437, 0.05241127713214914, 0.1..."


In [31]:
# use one sample to decompose the function
query_test.iloc[[0]].apply(lambda x: average_precision(x['query_id']
                                    , x['vector']),axis=1)

Started to work on query_id 916247


Unnamed: 0,passage_id,rel,vector,similarity
10,msmarco_passage_06_45103616,0,"[0.1421021706711745, 0.08327692914072557, 0.05...",0.673069
17,msmarco_passage_31_763502859,0,"[0.04876301315863561, 0.03618491944910469, 0.0...",0.666931
9,msmarco_passage_00_583662759,1,"[0.19576311429170473, 0.14312731534507056, 0.0...",0.663334
18,msmarco_passage_59_350363899,0,"[0.20352333616803917, 0.16360212203097196, 0.0...",0.655521
14,msmarco_passage_21_306563782,0,"[0.11884961974667733, 0.17699329820126228, 0.0...",0.642346
16,msmarco_passage_32_313105536,0,"[0.03211158510718074, 0.09714098228668096, 0.0...",0.63567
8,msmarco_passage_22_677091138,1,"[0.10020050107642264, 0.15511715263286255, 0.0...",0.625087
1,msmarco_passage_06_203354916,1,"[0.03001411608592936, 0.10840147123437748, 0.0...",0.624896
7,msmarco_passage_32_845400104,1,"[0.10440869072361501, 0.18244040828270194, 0.0...",0.620899
5,msmarco_passage_50_676326055,1,"[0.15773380868831488, 0.12544590586303836, 0.0...",0.620466


5
[0.3333333333333333, 0.2857142857142857, 0.375, 0.4444444444444444, 0.5]
Avg precision: 0.3876984126984127


0    (0.3876984126984127, 5, 20)
dtype: object

## Run the code of average_precision - do not display, run on all query_id

In [35]:
# Function for calculating average precision for a query
def average_precision(qid, qvector):   
    # ========= for each query_id, do the following:
    #print(f"\nStarted to work on query_id {qid}")
    # get passage_id (each query_id would have upto 10 passage_id)
    qresult = query_test.loc[query_test["query_id"] == qid, ["passage_id", "rel"]]
    # get passage vector for each passage_id
    qcorpus = passage_test.loc[passage_test["passage_id"].isin(qresult["passage_id"]),["passage_id","vector"]]
    
    # inner join passage_id with passage vector
    # this is a result table of <= 10 rows, 2 columns: passage_id and vector
    qresult = pd.merge(qresult, qcorpus, on = "passage_id", how = "inner")

    # create a new column "similarity", 
    # by comparing similarity between the query vector (input qvector) with passage vector (the "vector" column)
    qresult["similarity"] = qresult["vector"].apply(lambda x: cosine_similarity(
                          np.array(qvector).reshape(1, -1)
                          , np.array(x).reshape(1, -1)).item())
    
    # sort rows so that passages with the highest similarity ranks as top 
    qresult.sort_values(by="similarity", ascending = False, inplace=True)

    # # Taking Top 10 documents for the evaluation
    ranking = qresult.head(10)['rel'].values
    #display(qresult.head(15))
    
    # # Calculating precision
    precision=[]
    for i in range(1,11):
      if ranking[i-1]:
        precision.append(np.sum(ranking[:i])/i)
    
    # If no relevant document in list then return 0
    if precision==[]:
      return 0
    #print(len(precision))
    #print(precision)
    avg_precision = np.mean(precision)
    #print("Avg precision: {}".format(avg_precision))
    return avg_precision

In [36]:
# Calculating average precision for all queries in the test set
query_test["avg_precision"] = query_test.apply(lambda x: average_precision(x["query_id"]
                                                                           ,x["vector"]),axis=1)

In [37]:
query_test.head()

Unnamed: 0,query_id,query,used,passage_id,rank,score,username,rel,query_cleaned,vector,avg_precision
0,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_05_840839268,1,16.004101,Anserini,1,what us state bears the slogan the land enchan...,"[0.19544760670821437, 0.05241127713214914, 0.1...",0.387698
1,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_06_203354916,2,15.7155,Anserini,1,what us state bears the slogan the land enchan...,"[0.19473821635276328, 0.08082901877568266, 0.2...",0.371032
2,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_45_489369159,3,15.715499,Anserini,1,what us state bears the slogan the land enchan...,"[0.22112144328101987, 0.05655473228224195, 0.1...",0.371032
3,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_50_676325639,4,14.9837,Anserini,1,what us state bears the slogan the land enchan...,"[0.23997162888790827, 0.06643368186228349, 0.2...",0.387698
4,916247,what us state bears the slogan the land enchan...,Q0,msmarco_passage_21_464076261,5,14.3472,Anserini,1,what us state bears the slogan the land enchan...,"[0.17617256554579738, 0.0903300079082119, 0.14...",0.371032


In [38]:
# Finding Mean Average Precision
print('Mean Average Precision=>', query_test["avg_precision"].mean())

Mean Average Precision=> 0.6871618449389015


In [39]:
query_test.drop(columns = ["vector"]).to_csv("./output/query_test_w2v_result.csv", index = False)