# Information Retrieval and Web Analytics

# GROUP NAMES:
- Judith Camacho 218863 
- Jordi Marín    207552 
- Xavier Vives   218900

# **Project 1**

In [None]:
# Mount Google Drive (Colab only) so the project data under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the NLTK stop-word corpus; stopwords.words("english") in build_terms() reads it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import re
import json
import pandas as pd
from datetime import datetime
import numpy as np
from ast import literal_eval
import collections
import time
from numpy import linalg as la

We read the data that we preprocessed in the first part of the project.

In [None]:
# Folder on the mounted shared drive that holds the project data files.
data_path = "/content/drive/Shareddrives/IRWA/Project/Part 2/data/"
docs_path = data_path+'tweet_df.csv'

# The first CSV column becomes the DataFrame index.
tweet_df = pd.read_csv(docs_path, index_col=[0])
# 'processed_text' is read back as text; literal_eval turns each value into a Python list of terms.
tweet_df['processed_text'] = tweet_df['processed_text'].apply(lambda x: literal_eval(x))
tweet_df.head()

Unnamed: 0,Tweet,Username,Date,Hashtags,Likes,Retweets,Url,Doc,processed_text
1575918182698979328,So this will keep spinning over us until 7 pm…...,Suz👻,2022-09-30 18:39:08,['HurricaneIan'],0,0,https://twitter.com/Suz/status/157591818269897...,doc_1,"[keep, spin, us, 7, pmgo, away, alreadi]"
1575918151862304768,Our hearts go out to all those affected by . W...,Lytx,2022-09-30 18:39:01,['HurricaneIan'],0,0,https://twitter.com/Lytx/status/15759181518623...,doc_2,"[heart, go, affect, wish, everyon, road, curre..."
1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n,Christopher Heath,2022-09-30 18:38:58,['HurricaneIan'],0,0,https://twitter.com/Christopher_Heath/status/1...,doc_3,"[kissimme, neighborhood, michigan, ave]"
1575918135009738752,I have this one tree in my backyard that scare...,alex ✨,2022-09-30 18:38:57,"['scwx', 'HurricaneIan']",0,0,https://twitter.com/alex_/status/1575918135009...,doc_4,"[one, tree, backyard, scare, poltergeist, tree..."
1575918119251419136,"I pray for everyone affected by , but...",Tess 💋,2022-09-30 18:38:53,['HurricaneIan'],0,0,https://twitter.com/Tess_/status/1575918119251...,doc_5,"[pray, everyon, affect, associ, winknew, sympa..."


We copy the preprocessing function from the previous part because we will use it later to preprocess the queries.

In [None]:
def build_terms(tweet_text):
    '''
    Turn the raw text of a tweet (or a query) into a list of normalised terms.

    input: the text of a tweet
    function: lowercases the text, strips every character that is neither a word
              character nor whitespace, splits on whitespace, drops English
              stop words and applies Porter stemming to what is left
    output: a list of strings with the stemmed terms, in their original order
    '''
    english_stopwords = set(stopwords.words("english"))
    porter = PorterStemmer()

    # Lowercase first, then remove punctuation (this also removes the '#' of hashtags)
    cleaned_text = re.sub(r'[^\w\s]', '', tweet_text.lower())

    # Tokenise on whitespace, filter stop words and stem in a single pass
    return [porter.stem(token) for token in cleaned_text.split() if token not in english_stopwords]

# **Indexing**

Once we have preprocessed the text, we create the inverted index modifying the code given in class:


## **PART 1**

In [None]:
def create_index(lines):
    """
    Build the inverted index of the tweet collection.

    Argument:
    lines -- collection of tweets: a DataFrame with a 'Doc' column (document id)
             and a 'processed_text' column (list of preprocessed terms)

    Returns:
    index -- defaultdict(list) mapping every term to its postings list. Each posting is
             [doc_id, array('I', positions)], where positions are the offsets of the term
             inside that document. Postings follow the order of the documents in `lines`.
    """
    index = defaultdict(list)

    for doc, terms in zip(lines['Doc'].values, lines['processed_text'].values):
        # Index of the current document only:
        # current_page_index ==> {term: [current_doc, array of positions]}
        current_page_index = {}

        for position, term in enumerate(terms):
            # Explicit membership test instead of a bare try/except, so that genuine
            # errors are not silently treated as "term not seen yet".
            if term in current_page_index:
                current_page_index[term][1].append(position)
            else:
                # 'I' = unsigned int array holding the positions of the term
                current_page_index[term] = [doc, array('I', [position])]

        # Merge the index of the current document into the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
    return index

In [None]:
# Build the inverted index for the whole collection and report the elapsed wall-clock time.
start_time = time.time()
index = create_index(tweet_df)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.19 seconds


## **PART 2**

We propose the following 5 queries:

- power outage in carolina
- school closed
- deaths hurricane florida
- how to help hurricane
- president speech

We choose these ones because they could be made by a real user and the results are easy to classify between relevant/non-relevant.





In [None]:
# The five proposed queries; their position in this list (starting at 1) is used as query id later on.
queries = ['power outage in carolina',
           'school closed',
           'deaths hurricane florida',
           'how to help hurricane',
           'president speech']

Using the previously defined function build_terms(), we also preprocess the queries, so that query terms are normalised in exactly the same way as the indexed tweets.


In [None]:
# Interactive check of the query preprocessing: this cell blocks until a query is typed.
print("Insert your query (i.e.: Computer Science):\n")
query = input()
# Apply the same normalisation used for the tweets (lowercase, no punctuation, no stop words, stemming).
prossesed_query = build_terms(query)

print("Prossesed Query= {}".format(prossesed_query))

Insert your query (i.e.: Computer Science):

Computer Science
Prossesed Query= ['comput', 'scienc']


## **PART 3**

Next we implement the tf-idf algorithm modifying the code given in class:


In [None]:
def create_tfidf(lines, num_documents):
    """
    Compute the normalised term frequencies and the inverse document frequencies
    of the collection.

    Argument:
    lines -- collection of tweets: a DataFrame with a 'Doc' column (document id)
             and a 'processed_text' column (list of preprocessed terms)
    num_documents -- total number of documents

    Returns:
    tf -- defaultdict(list): for each term, the normalised term frequency in every
          document containing it, in the same document order as the postings
          produced by create_index()
    idf -- defaultdict(float): inverse document frequency of each term,
           log(num_documents / df), rounded to 4 decimals
    """
    tf = defaultdict(list)  # term frequencies of terms in documents
    df = defaultdict(int)   # document frequencies of terms in the corpus
    idf = defaultdict(float)

    for doc, terms in zip(lines['Doc'].values, lines['processed_text'].values):
        # Index of the current document only:
        # current_page_index ==> {term: [current_doc, array of positions]}
        current_page_index = {}

        for position, term in enumerate(terms):
            # Explicit membership test instead of a bare try/except
            if term in current_page_index:
                current_page_index[term][1].append(position)
            else:
                current_page_index[term] = [doc, array('I', [position])]

        # Euclidean norm of the raw term counts; it is the same for all terms of a document.
        # A document without terms never reaches the division below.
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in current_page_index.values()))

        for term, posting in current_page_index.items():
            # tf = raw frequency in the current document / norm of the document
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            # one more document contains this term
            df[term] += 1

    # IDF depends only on the final document frequencies, so it is computed once after
    # all documents have been processed (recomputing it inside the document loop costs
    # O(#documents * #terms) and yields the same final values).
    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])), 4)

    return tf, idf

In [None]:
# Compute tf and idf for the whole collection and report the elapsed wall-clock time
# (the printed message says "index", but what is being timed here is create_tfidf).
start_time = time.time()
num_documents = len(tweet_df)
tf, idf = create_tfidf(tweet_df, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 184.0 seconds


### Ranking
Now we want to rank. 
First we look at the 50 most popular words, i.e. the 50 terms with the lowest idf value.
We do this to be able to decide the final queries we want. Hence, after executing the next cell, we changed our original queries based on the result of the execution.

In [None]:
# Number of lowest-idf terms to inspect.
k = 50

# Series of idf values indexed by term, sorted in ascending order (lowest idf first).
sort_df = pd.Series(data=np.fromiter(idf.values(),dtype=float),index=idf.keys())
sort_df = sort_df.sort_values()#ascending=False)
# NOTE(review): the display of the result is commented out, so this cell shows no output.
#sort_df.head(k)

The function rank_documents() returns the ranked list of documents.
The documents on top of the list will be the ones with highest cosine similarity, thus the ones most similar to the query.

In [None]:
def rank_documents(terms, docs, index, idf, tf, sorted=True):
    """
    Score the candidate documents against the query using tf-idf weights.

    The score of a document is the dot product between the query tf-idf vector and the
    document tf-idf vector, both restricted to the query terms.

    Argument:
    terms -- list of (preprocessed) query terms
    docs -- collection of document ids to rank (the documents matching the query)
    index -- inverted index data structure: term -> list of [doc_id, positions]
    idf -- inverse document frequency of each term
    tf -- term frequencies: tf[term][i] belongs to the document of index[term][i]
    sorted -- whether the documents are returned ranked. Yes, by default.

    Returns:
    result_docs -- document ids ordered by decreasing score (empty list if sorted is False)
    doc_scores -- list of [score, doc_id] pairs, ordered by decreasing score when sorted
                  is True and in order of discovery otherwise

    When sorted is True and nothing matches, a message is printed and two empty lists are
    returned. The function does not prompt for a new query: asking again is up to the caller.
    """
    # Set membership makes the "is this document a candidate?" test O(1) per posting
    candidate_docs = set(docs)

    # Only the components corresponding to the query terms matter: every other component
    # would be multiplied by 0 in the dot product with the query vector.
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # Frequency of each term in the query and norm used to normalise the query tf
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex is the position of the term in the query
        if term not in index:
            continue
        # tf * idf of the query term (tf normalised as done with documents)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Fill the matching component of every candidate document containing the term
        for doc_index, (doc, _) in enumerate(index[term]):
            if doc in candidate_docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    if not sorted:
        return [], doc_scores

    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
    return result_docs, doc_scores

The following function, search_tf_idf is the "searching part". For a given query, it preprocesses it and then returns the list of most relevant documents to that query.

In [None]:
def search_tf_idf(query, index):
    """
    Search the collection for a free-text query and rank the matching documents.

    Arguments:
    - query: query we are interested in (raw text, it is preprocessed with build_terms)
    - index -- inverted index data structure: term -> list of [doc_id, positions]

    Returns:
    The ids of the documents that contain any of the query terms (the union of the
    postings of every query term), ranked by rank_documents().

    Uses the notebook-level `idf` and `tf` structures for the ranking.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        # index.get() instead of index[term]: the index is a defaultdict, so subscripting
        # an unknown term would never raise and would insert an empty postings list,
        # growing the index with every out-of-vocabulary query term.
        docs.update(posting[0] for posting in index.get(term, []))
    ranked_docs, _ = rank_documents(query, list(docs), index, idf, tf)
    return ranked_docs

We make a copy of the dataset indexed by doc_id, so that the text of a tweet can be looked up directly from its document id:

In [None]:
# Independent copy of the tweets, indexed by the 'Doc' id, so that
# tweet_df_doc.loc[doc_id] returns the row of a given document.
tweet_df_doc = tweet_df.copy(deep=True)
tweet_df_doc = tweet_df_doc.set_index('Doc')
tweet_df_doc.head()

Unnamed: 0_level_0,Tweet,Username,Date,Hashtags,Likes,Retweets,Url,processed_text
Doc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
doc_1,So this will keep spinning over us until 7 pm…...,Suz👻,2022-09-30 18:39:08,['HurricaneIan'],0,0,https://twitter.com/Suz/status/157591818269897...,"[keep, spin, us, 7, pmgo, away, alreadi]"
doc_2,Our hearts go out to all those affected by . W...,Lytx,2022-09-30 18:39:01,['HurricaneIan'],0,0,https://twitter.com/Lytx/status/15759181518623...,"[heart, go, affect, wish, everyon, road, curre..."
doc_3,Kissimmee neighborhood off of Michigan Ave. \n,Christopher Heath,2022-09-30 18:38:58,['HurricaneIan'],0,0,https://twitter.com/Christopher_Heath/status/1...,"[kissimme, neighborhood, michigan, ave]"
doc_4,I have this one tree in my backyard that scare...,alex ✨,2022-09-30 18:38:57,"['scwx', 'HurricaneIan']",0,0,https://twitter.com/alex_/status/1575918135009...,"[one, tree, backyard, scare, poltergeist, tree..."
doc_5,"I pray for everyone affected by , but...",Tess 💋,2022-09-30 18:38:53,['HurricaneIan'],0,0,https://twitter.com/Tess_/status/1575918119251...,"[pray, everyon, affect, associ, winknew, sympa..."


Test that the searching engine works:

In [None]:
# Interactive test of the search engine: this cell blocks until a query is typed.
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs = search_tf_idf(query, index)
# Maximum number of results to display.
top = 20

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for d_id in ranked_docs[:top]:
    # Look up the original tweet text by document id.
    print("{}: {}".format(d_id, tweet_df_doc.loc[d_id]['Tweet']))

Insert your query (i.e.: Computer Science):

Computer Science

Top 20 results out of 7 for the searched query:

doc_2308: As we await the effects of , here's a look at the science behind predicting hurricanes and why it's so tricky.


doc_264: Oh…so NOW Floridians believe the science and listen to the advice of government officials. Good to know! 
doc_2653: My point is hurricanes are becoming MORE stronger and powerful.
We know it and we see it.
And mostly we know why.
Science.
We have the evidence.
We have temperatures increasing in Earth's atmosphere.
We have the receipts. 🌀🌧🌍
 


 
doc_3331:  "study" sez "man-caused"  added 10% rain to ?
•Computer simulation? Same scam that saw 2.2Μ  dead in US?
•Gov't lab
•Not peer-reviewed
•Pimped by   
•You'd buy eco-dictatorship "to save earth"?

doc_1798: Stuck in WA state due to  - Damage assessment has commenced through the use of  on my 12-yr old’s school computer. 😂 a week trip turned into a two week trip.  
doc_3787: 🌀Charles, a student , 

Now we show the top 20 most relevant documents for each query that we proposed.

In [None]:
# Run the search engine on each proposed query and print its best-ranked tweets.
print("Queries:")
#query = input()
for query in queries:
  ranked_docs = search_tf_idf(query, index)
  # Maximum number of results to display per query.
  top = 20
  print("\n======================\nQuery: {}".format(query))
  print("\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
  for d_id in ranked_docs[:top]:
      print("{}: {}".format(d_id, tweet_df_doc.loc[d_id]['Tweet']))

Queries:

Query: power outage in carolina

Top 20 results out of 552 for the searched query:

doc_755: We have 10,000  responders prepared for outages across the  including &gt;6,000 lineworkers. We will respond to outages as soon as it’s safe to do so. Customers should expect outages and have a plan.     
doc_2607: HOW TO REPORT AN OUTAGE WHERE YOU ARE: 
- North Carolina: 
- South Carolina: 
   
doc_337: Up to 40,000+ power outages in NC now. 

SC has 115,000. 

  
doc_1302: We are having power outages here. Stay safe everyone and check in on family and friends  
doc_686: Landfall of  is imminent just up the coast from Charleston as power outages begin to mount in coastal South Carolina.

Stay with  for the latest. 
doc_1622: With  🌀 passing through today, remember to report any power outages ⚡ directly to your service provider.
➡️ : 
➡️ : 
➡️  
doc_1850: The current electrical customers in Florida without power is 1.8M, after a peak of 2.6M.

Track power outages in real-time: .

    

# *Evaluation*

First we load the evaluation ground truth that we were given and we explore it.

In [None]:
# Ground-truth file provided for the evaluation; note that docs_path is reassigned here.
docs_path = data_path+'evaluation_gt.csv'

evaluation_gt = pd.read_csv(docs_path)
evaluation_gt.head()

Unnamed: 0,doc,query_id,label
0,doc_12,1,1
1,doc_9,1,1
2,doc_18,1,1
3,doc_45,1,1
4,doc_501,1,1


In [None]:
# Number of judged documents per query.
# NOTE(review): only the last expression of a cell is displayed, so this result is not shown.
evaluation_gt.query_id.value_counts()

# Documents judged for query 3.
evaluation_gt[evaluation_gt.query_id == 3].doc.unique()

array(['doc_30', 'doc_65', 'doc_66', 'doc_112', 'doc_148', 'doc_150',
       'doc_198', 'doc_370', 'doc_125', 'doc_306', 'doc_441', 'doc_494',
       'doc_525', 'doc_1002', 'doc_1076', 'doc_1096', 'doc_1195',
       'doc_1233'], dtype=object)

Just some cells above, we have retrieved the top 20 docs for each query. Now we manually score the results of our algorithm. We have given a value from 0 to 3 to each result by reading and interpreting the tweet. Hence the scoring is subjective and depends on our own criteria.

In [None]:
# Manual relevance judgements, one list per query (q1..q5 follow the order of `queries`).
# Each entry is [doc_id, score] with a score from 0 to 3 given by reading the tweet.
q1 = [['doc_755',2], ['doc_2607',2], ['doc_337',3], ['doc_1302',1], ['doc_686',3], ['doc_1622',1], ['doc_1850',0], ['doc_2171',0], ['doc_1589',1], ['doc_3042',0], ['doc_2829',1], ['doc_955',0], ['doc_942',0], ['doc_932',0], ['doc_487',0], ['doc_2552',3], ['doc_3017',0], ['doc_3074',3], ['doc_1760',1], ['doc_380',1]]
q2 = [['doc_1297',2], ['doc_2659',3], ['doc_1144',3], ['doc_2829',2], ['doc_3580',2], ['doc_3533',1], ['doc_933',2], ['doc_515',1], ['doc_3655',0], ['doc_3906',0], ['doc_766',0], ['doc_3994',3], ['doc_3932',1], ['doc_3597',0], ['doc_102',0], ['doc_2456',0], ['doc_228',0], ['doc_1393',0], ['doc_1798',0], ['doc_3871',0]]
q3 = [['doc_2893',3], ['doc_1874',2], ['doc_781',2], ['doc_1356',3], ['doc_3670',3], ['doc_2054',3], ['doc_1719',2], ['doc_850',3], ['doc_1751',3], ['doc_2369',3], ['doc_3198',2], ['doc_1011',0], ['doc_1700',3], ['doc_3129',2], ['doc_2010',1], ['doc_2905',3],[ 'doc_3636',3], ['doc_2700',3], ['doc_3218',1], ['doc_1404',2]]
q4 = [['doc_1333',3], ['doc_1458',3], ['doc_1025',3], ['doc_477',1], ['doc_3495',2], ['doc_1374',0], ['doc_116',3], ['doc_1679',3], ['doc_2142',3], ['doc_3740',1], ['doc_3057',3], ['doc_2183',3], ['doc_2108',3], ['doc_2051',3], ['doc_1993',3], ['doc_2964',3], ['doc_947',3], ['doc_3952',0], ['doc_2027',2], ['doc_1301',3]]
q5 = [['doc_3412',3], ['doc_1043',3], ['doc_1037',3], ['doc_1645',2], ['doc_844',2], ['doc_900',3], ['doc_1769',1], ['doc_429',3], ['doc_391',3], ['doc_388',3], ['doc_2214',3], ['doc_692',1], ['doc_795',1], ['doc_872',1], ['doc_2225',0], ['doc_2202',2], ['doc_2125',3], ['doc_3182',1], ['doc_1715',2], ['doc_1014',3]]

Now we convert the above cell into a Pandas DataFrame that fits the data structure that we have been using until now. The resulting DataFrame is called df_queries.


In [None]:
# One DataFrame per query with the manual scores, tagged with its query id (1..5).
df_q1 = pd.DataFrame(data = q1, columns = ['doc', 'doc_score'])
df_q1['query_id'] = 1
df_q2 = pd.DataFrame(data = q2, columns = ['doc', 'doc_score'])
df_q2['query_id'] = 2
df_q3 = pd.DataFrame(data = q3, columns = ['doc', 'doc_score'])
df_q3['query_id'] = 3
df_q4 = pd.DataFrame(data = q4, columns = ['doc', 'doc_score'])
df_q4['query_id'] = 4
df_q5 = pd.DataFrame(data = q5, columns = ['doc', 'doc_score'])
df_q5['query_id'] = 5

# Stack the five queries into a single frame with a fresh 0..n-1 index.
df_queries = pd.concat([df_q1,df_q2,df_q3,df_q4,df_q5]).reset_index(drop=True)
# Binary relevance label: manual scores of 2 or 3 count as relevant (1), scores of 0 or 1 as non-relevant (0).
df_queries["label"] = df_queries["doc_score"].apply(lambda y: 1 if y >=2 else 0)

df_queries

Unnamed: 0,doc,doc_score,query_id,label
0,doc_755,2,1,1
1,doc_2607,2,1,1
2,doc_337,3,1,1
3,doc_1302,1,1,0
4,doc_686,3,1,1
...,...,...,...,...
95,doc_2202,2,5,1
96,doc_2125,3,5,1
97,doc_3182,1,5,0
98,doc_1715,2,5,1


Now we want to predict the relevance for each document and query.


In [None]:
def computed_predicted_relevance(index, queries, df):
    """
    Attach to every (query, document) row of `df` the score given by the search engine.

    Arguments:
    - index -- inverted index data structure
    - queries: list of queries we are interested in; the query at position i
      (starting at 1) is matched against the rows of `df` whose query_id is i
    - df: dataframe with, at least, the columns 'query_id' and 'doc'

    Returns:
    The same dataframe, which is modified in place, with a new column
    'predicted_relevance' holding the tf-idf score of each document for its query.

    Uses the notebook-level `idf` and `tf` structures for the scoring.
    """
    # query number -> {doc id -> score}
    scores_by_query = {}

    for query_number, query_text in enumerate(queries, start=1):
        # Documents judged for this query
        query_docs = df[df.query_id == query_number].doc.unique()
        # Same preprocessing as the one applied to the tweets
        query_terms = build_terms(query_text)
        # Only the scores are needed here, not the ranking
        _, scored_docs = rank_documents(query_terms, query_docs, index, idf, tf, sorted=False)

        score_by_doc = {doc_id: score for score, doc_id in scored_docs}
        # Documents that share no term with the query get no score from the ranking: use 0
        for doc_id in query_docs:
            score_by_doc.setdefault(doc_id, 0)

        scores_by_query[query_number] = score_by_doc

    # Walk the two columns together to pick the score of each (query, doc) row
    df['predicted_relevance'] = [
        scores_by_query[query_number][doc_id]
        for query_number, doc_id in zip(df.query_id.values, df.doc.values)
    ]

    return df

Now we create the dataframe with the predicted score using the previous function computed_predicted_relevance, and we name the resulting dataframe search_results.

In [None]:
# is_relevant ==> label
# TF-IDF score of the doc for the query ==> predicted_relevance

# Same five queries as above; their order must match the query ids 1..5 used in df_queries.
queries = ['power outage in carolina','school closed','deaths hurricane florida','how to help hurricane','president speech']
#queries = ['Landfall in South Carolina','Help and recovery during the hurricane disaster','Floodings in South Carolina']

# computed_predicted_relevance adds the column to df_queries itself and returns that same frame.
search_results = computed_predicted_relevance(index, queries, df_queries)

search_results.head()

Unnamed: 0,doc,doc_score,query_id,label,predicted_relevance
0,doc_755,2,1,1,6.727996
1,doc_2607,2,1,1,6.703382
2,doc_337,3,1,1,6.178443
3,doc_1302,1,1,0,5.352027
4,doc_686,3,1,1,5.304446


### Precision@K (P@K)

In [None]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Fraction of the k best-scored documents that are relevant.

    Parameters
    ----------
    doc_score: Ground truth (true relevance labels, 1 meaning relevant).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float
        Number of relevant documents among the top k, divided by k
        (also when fewer than k documents are available).
    """
    # Positions of the documents from highest to lowest predicted score
    ranking = np.argsort(y_score)[::-1]
    # True labels of the k best-scored documents
    top_k_labels = np.take(doc_score, ranking[:k])
    hits = (top_k_labels == 1).sum()
    return float(hits) / k

In [None]:
# Precision@k of every query; range(1,6) covers the five query ids 1..5.
k = 5
for i in range(1,6):
  current_query = i
  # Rows (judged documents) of the current query only.
  current_query_res = search_results[search_results["query_id"] == current_query]
  print("==> Query {}:\n\tPrecision@{}: {}\n".format(i, k, precision_at_k(current_query_res["label"], current_query_res["predicted_relevance"], k)))

==> Query 1:
	Precision@5: 0.8

==> Query 2:
	Precision@5: 1.0

==> Query 3:
	Precision@5: 1.0

==> Query 4:
	Precision@5: 0.8

==> Query 5:
	Precision@5: 1.0



In [None]:
# Sanity check for a single query: compare the metric with the top rows sorted by predicted score.
current_query = 1
current_query_res = search_results[search_results["query_id"] == current_query]
k = 5

print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["label"], current_query_res["predicted_relevance"], k)))
print("\nCheck on the dataset sorted by score:\n")

# The 'label' column of these k rows is what Precision@k counts.
current_query_res.sort_values("predicted_relevance", ascending=False).head(k)

==> Precision@5: 0.8


Check on the dataset sorted by score:



Unnamed: 0,doc,doc_score,query_id,label,predicted_relevance
0,doc_755,2,1,1,6.727996
1,doc_2607,2,1,1,6.703382
2,doc_337,3,1,1,6.178443
3,doc_1302,1,1,0,5.352027
4,doc_686,3,1,1,5.304446


### Recall@K (R@K)

In [None]:
def recall_at_k(doc_score, y_score, k=10):
    """
    Fraction of all the relevant documents that appear among the k best-scored ones.

    Parameters
    ----------
    doc_score: Ground truth (true binary relevance labels, 1 meaning relevant).
        Any array-like is accepted (list, numpy array, pandas Series).
    y_score: Predicted scores, aligned position by position with doc_score.
    k : number of doc to consider.

    Returns
    -------
    recall @k : float
        0.0 when there is no relevant document at all (instead of the nan that
        a 0/0 division would produce).
    """
    # Plain arrays: positions are what matters, not the index of a pandas Series
    labels = np.asarray(doc_score)
    # Relevant documents are counted with the same "== 1" rule in numerator and denominator
    total_relevant = int(np.sum(labels == 1))
    if total_relevant == 0:
        return 0.0
    order = np.argsort(np.asarray(y_score))[::-1]
    top_k_labels = labels[order[:k]]
    relevant = int(np.sum(top_k_labels == 1))
    return float(relevant) / total_relevant

In [None]:
# Recall@k of every query; range(1,6) covers the five query ids 1..5.
k = 5
for i in range(1,6):
  current_query = i
  # Rows (judged documents) of the current query only.
  current_query_res = search_results[search_results["query_id"] == current_query]
  print("==> Query {}:\n\tRecall@{}: {}\n".format(i, k, recall_at_k(current_query_res["label"], current_query_res["predicted_relevance"], k)))

==> Query 1:
	Recall@5: 0.6666666666666666

==> Query 2:
	Recall@5: 0.7142857142857143

==> Query 3:
	Recall@5: 0.29411764705882354

==> Query 4:
	Recall@5: 0.25

==> Query 5:
	Recall@5: 0.35714285714285715



In [None]:
# Sanity check for a single query: compare the metric with the top rows sorted by predicted score.
current_query = 1
current_query_res = search_results[search_results["query_id"] == current_query]

k = 5
print("==> Recall@{}: {}\n".format(k, recall_at_k(current_query_res["label"], current_query_res["predicted_relevance"], k)))
print("\nCheck on the dataset sorted by score:\n")

# head() without argument shows the first 5 rows, which matches k = 5 here.
current_query_res.sort_values("predicted_relevance", ascending=False).head()

==> Recall@5: 0.6666666666666666


Check on the dataset sorted by score:



Unnamed: 0,doc,doc_score,query_id,label,predicted_relevance
0,doc_755,2,1,1,6.727996
1,doc_2607,2,1,1,6.703382
2,doc_337,3,1,1,6.178443
3,doc_1302,1,1,0,5.352027
4,doc_686,3,1,1,5.304446


### Average Precision@K - AP@K

In [None]:
def avg_precision_at_k(doc_score, y_score, k=10):
    """
    Average precision over the k best-scored documents.

    Parameters
    ----------
    doc_score: Ground truth (true binary relevance labels, 1 meaning relevant).
        Any array-like is accepted (list, numpy array, pandas Series).
    y_score: Predicted scores, aligned position by position with doc_score.
    k : number of doc to consider.

    Returns
    -------
    average precision @ k : float
        Sum of the precision at each rank (within the top k) that holds a relevant
        document, divided by the total number of relevant documents in doc_score.
        0 when there is no relevant document.
    """
    # Plain arrays: the loop below works by position, which on a pandas Series with a
    # non-default index would be interpreted as a label lookup.
    labels = np.asarray(doc_score)
    gtp = np.sum(labels == 1)
    ## if all documents are not relevant
    if gtp == 0:
        return 0
    order = np.argsort(np.asarray(y_score))[::-1]
    top_k_labels = labels[order[:k]]
    n_relevant_at_i = 0
    prec_at_i = 0
    for rank, label in enumerate(top_k_labels, start=1):
        if label == 1:
            n_relevant_at_i += 1
            # precision at this rank = relevant documents seen so far / rank
            prec_at_i += n_relevant_at_i / rank
    return prec_at_i / gtp

In [None]:
# Average Precision@k of every query; with k = 20 all the judged documents of a query are considered.
k = 20
for i in range(1,6):
  current_query = i
  current_query_res = search_results[search_results["query_id"] == current_query]
  # Columns are converted to numpy arrays before being passed to the metric.
  print("==> Query {}:\n\tAverage Precision@{}: {}\n".format(i, k, avg_precision_at_k(np.array(current_query_res["label"]), np.array(current_query_res["predicted_relevance"]), k)))

==> Query 1:
	Average Precision@20: 0.738048245614035

==> Query 2:
	Average Precision@20: 0.9200680272108842

==> Query 3:
	Average Precision@20: 0.961640598924336

==> Query 4:
	Average Precision@20: 0.8214777512112074

==> Query 5:
	Average Precision@20: 0.8848988837959425



### Average Recall@K - AR@K

In [None]:
def avg_recall_at_k(doc_score, y_score, k=10):
    """
    Average recall over the k best-scored documents.

    Parameters
    ----------
    doc_score: Ground truth (true binary relevance labels, 1 meaning relevant).
        Any array-like is accepted (list, numpy array, pandas Series).
    y_score: Predicted scores, aligned position by position with doc_score.
    k : number of doc to consider.

    Returns
    -------
    average recall @k : float
        Sum of the recall reached at each rank (within the top k) that holds a relevant
        document, divided by the total number of relevant documents in doc_score.
        0 when there is no relevant document.
    """
    # Plain arrays: the loop below works by position, which on a pandas Series with a
    # non-default index would be interpreted as a label lookup.
    labels = np.asarray(doc_score)
    # Total number of relevant documents; used both as the recall denominator and to average
    gtp = np.sum(labels == 1)
    ## if all documents are not relevant
    if gtp == 0:
        return 0
    order = np.argsort(np.asarray(y_score))[::-1]
    top_k_labels = labels[order[:k]]
    n_relevant_at_i = 0
    recall_at_i = 0
    for label in top_k_labels:
        if label == 1:
            n_relevant_at_i += 1
            # recall at this rank = relevant documents seen so far / all relevant documents
            recall_at_i += n_relevant_at_i / gtp
    return recall_at_i / gtp

In [None]:
# Average Recall@k of every query; with k = 20 all the judged documents of a query are considered.
k = 20
for i in range(1,6):
  current_query = i
  current_query_res = search_results[search_results["query_id"] == current_query]
  # Columns are converted to numpy arrays before being passed to the metric.
  print("==> Query {}:\n\tAverage Recall@{}: {}\n".format(i, k, avg_recall_at_k(np.array(current_query_res["label"]), np.array(current_query_res["predicted_relevance"]), k)))

==> Query 1:
	Average Precision@20: 0.5833333333333334

==> Query 2:
	Average Precision@20: 0.5714285714285714

==> Query 3:
	Average Precision@20: 0.5294117647058824

==> Query 4:
	Average Precision@20: 0.53125

==> Query 5:
	Average Precision@20: 0.5357142857142857



### F1-Score

In [None]:
def f1_score(doc_score, y_score):
  """
  Harmonic mean of the average precision and the average recall of a query.

  Both measures are computed over all the documents (k = number of documents),
  with avg_precision_at_k() and avg_recall_at_k().

  Parameters
  ----------
  doc_score: Ground truth (true relevance labels).
  y_score: Predicted scores.

  Returns
  -------
  f1 score : float
      0.0 when both the average precision and the average recall are 0
      (e.g. no relevant document), instead of failing with a division by zero.
  """
  k = len(doc_score)
  P = avg_precision_at_k(doc_score, y_score, k)
  R = avg_recall_at_k(doc_score, y_score, k)
  if P + R == 0:
    return 0.0
  return (2*P*R)/(P+R)

In [None]:
# F1-Score of every query; range(1,6) covers the five query ids 1..5.
for i in range(1,6):
  current_query = i
  current_query_res = search_results[search_results["query_id"] == current_query]
  print("==> Query {}:\n\tF1-Score: {}\n".format(i, f1_score(np.array(current_query_res["label"]), np.array(current_query_res["predicted_relevance"]))))

==> Query 1:
	F1-Score: 0.6516333360993555

==> Query 2:
	F1-Score: 0.7050008144648965

==> Query 3:
	F1-Score: 0.6828785613536137

==> Query 4:
	F1-Score: 0.6452296922868632

==> Query 5:
	F1-Score: 0.6673920580724265



### Mean Average Precision (mAP)

In [None]:
def map_at_k(search_res, k=10):
    """
    Mean of the average precision @ k over all the queries.

    Parameters
    ----------
    search_res: search results dataset containing the columns:
        query_id: query id.
        label: binary relevance of the document for the query (ground truth).
        predicted_relevance: score predicted by the search engine.
    k : number of doc to consider for each query.

    Returns
    -------
    mean average precision @ k : float
    average precision @ k of each query : list, in order of first appearance of the query ids
    """
    def _avg_precision_of(query_id):
        # Rows of the current query only
        query_rows = search_res[search_res["query_id"] == query_id]
        return avg_precision_at_k(np.array(query_rows["label"]),
                                  np.array(query_rows["predicted_relevance"]), k)

    avp = [_avg_precision_of(query_id) for query_id in search_res["query_id"].unique()]
    mean_avp = np.sum(avp) / len(avp)
    return mean_avp, avp

In [None]:
# Mean Average Precision over the five queries, considering the 10 best-scored documents of each.
k=10
# avp keeps the per-query average precisions returned next to the mean.
map_k, avp = map_at_k(search_results, k)

print("==> Mean Average Precision (of k={}): {}\n".format(k, map_k))

==> Mean Average Precision (of k=10): 0.6109562991863411



### Mean Reciprocal Rank (MRR)

In [None]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Reciprocal rank of the first relevant document among the k best-scored ones.

    Parameters
    ----------
    doc_score: Ground truth (true relevance labels, 1 meaning relevant).
        Any array-like is accepted (list, numpy array, pandas Series).
    y_score: Predicted scores, aligned position by position with doc_score.
    k : number of doc to consider.

    Returns
    -------
    Reciprocal Rank for current query: 1 / rank of the first document labelled 1 in
    the top k, or 0 when none of the top k documents is labelled 1.
    """
    labels = np.asarray(doc_score)
    # Positions of the documents from highest to lowest predicted score
    order = np.argsort(np.asarray(y_score))[::-1]
    is_relevant = labels[order[:k]] == 1
    # The "no relevant document" test uses the same "== 1" rule as the lookup below:
    # np.argmax of an all-False array is 0, which would otherwise be read as "rank 1".
    if not is_relevant.any():
        return 0
    return 1 / (np.argmax(is_relevant) + 1)

In [None]:
def mrr_at_k(search_results, k):
    """
    Mean Reciprocal Rank @ k over all the queries of the search results.

    search_results must contain the columns 'query_id', 'label' (binary ground truth)
    and 'predicted_relevance' (score given by the search engine).
    Returns the mean of the per-query reciprocal ranks, rounded to 4 decimals.
    """
    reciprocal_ranks = []
    for query_id in search_results["query_id"].unique():
        # Rows of the current query only
        query_rows = search_results[search_results['query_id'] == query_id]
        query_labels = np.array(query_rows["label"])
        query_scores = np.array(query_rows["predicted_relevance"])
        reciprocal_ranks.append(rr_at_k(query_labels, query_scores, k))
    mean_rr = float(sum(reciprocal_ranks) / len(reciprocal_ranks))
    return np.round(mean_rr, 4)

In [None]:
# Mean Reciprocal Rank over the five queries, looking at the 10 best-scored documents of each.
k=10
mrr_k = mrr_at_k(search_results, k)

print("==> Mean Reciprocal Rank (of k={}): {}\n".format(k, mrr_k))

==> Mean Reciprocal Rank (of k=10): 1.0



### Normalized Discounted Cumulative Gain (NDCG)

In [None]:
def dcg_at_k(doc_score, y_score, k=10):
    """
    Discounted Cumulative Gain of the k best-scored documents.

    doc_score holds the true (graded) relevance of each document and y_score the
    predicted scores. The document at rank r (starting at 1) contributes
    (2 ** relevance - 1) / log2(r + 1).
    """
    # Positions of the k documents with the highest predicted score, best first
    top_k = np.argsort(y_score)[::-1][:k]
    ranked_relevance = np.take(doc_score, top_k)
    # log2(rank + 1) with ranks 1..k, i.e. log2 of 2, 3, ...
    rank_discounts = np.log2(np.arange(len(ranked_relevance)) + 2)
    return np.sum((2 ** ranked_relevance - 1) / rank_discounts)


def ndcg_at_k(doc_score, y_score, k=10):
    """
    Normalized DCG @ k: the DCG of the predicted ranking divided by the DCG of the
    ideal ranking (documents ordered by their true relevance).

    Returns 0 when the ideal DCG is 0; otherwise the ratio rounded to 4 decimals.
    """
    # Ranking the documents by their own true relevance gives the best achievable DCG
    ideal_dcg = dcg_at_k(doc_score, doc_score, k)
    if ideal_dcg:
        return np.round(dcg_at_k(doc_score, y_score, k) / ideal_dcg, 4)
    return 0

In [None]:
# NDCG@k of a single query. NDCG uses the graded manual score ('doc_score', 0-3)
# rather than the binary 'label' used by the previous metrics.
query_id = 2
k = 10
labels = np.array(search_results[search_results['query_id'] == query_id]["doc_score"])
scores = np.array(search_results[search_results['query_id'] == query_id]["predicted_relevance"])
# ndcg_at_k already rounds its result to 4 decimals, so this np.round does not change the value.
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, query_id, ndcg_k))

ndcg@10 for query with query_id=2: 0.7386


In [None]:
# NDCG@k of every query; range(1,6) covers the five query ids 1..5.
k = 10
for i in range(1,6):
  query_id = i
  # Graded manual scores are the ground truth for NDCG.
  labels = np.array(search_results[search_results['query_id'] == query_id]["doc_score"])
  scores = np.array(search_results[search_results['query_id'] == query_id]["predicted_relevance"])
  ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
  print("==> Query {}:\n\tndcg@{}: {}\n".format(i, k, ndcg_k))

==> Query 1:
	ndcg@10: 0.5703

==> Query 2:
	ndcg@10: 0.7386

==> Query 3:
	ndcg@10: 0.8158

==> Query 4:
	ndcg@10: 0.7372

==> Query 5:
	ndcg@10: 0.8343



In [None]:
def avg_ndcg_at_k(search_results, k=10):
  """
  Average of the NDCG @ k over all the queries of the search results.

  search_results must contain the columns 'query_id', 'doc_score' (graded ground
  truth) and 'predicted_relevance' (score given by the search engine).
  Returns the mean of the per-query NDCG values, rounded to 4 decimals.
  """
  per_query_ndcg = []
  for query_id in search_results["query_id"].unique():
    # Rows of the current query only
    query_rows = search_results[search_results['query_id'] == query_id]
    relevance = np.array(query_rows["doc_score"])
    predicted = np.array(query_rows["predicted_relevance"])
    per_query_ndcg.append(np.round(ndcg_at_k(relevance, predicted, k), 4))

  mean_ndcg = float(sum(per_query_ndcg) / len(per_query_ndcg))
  return np.round(mean_ndcg, 4)

In [None]:
# Average NDCG over the five queries, considering the 10 best-scored documents of each.
k = 10

avg_ndcg = avg_ndcg_at_k(search_results, k)
print("==> Average ndcg@{}: {}\n".format(k, avg_ndcg))

==> Average ndcg@10: 0.7392

