<p style="font-size:78px">Final Project IRWA (2024-2025)</p>

<p style="font-size:48px">Part 2: Indexing and Evaluation</p>

In [1]:
# Standard library imports
import os
import sys

# Third-party imports
import pandas as pd
import numpy as np

# Local application imports
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
project_root = os.path.join(current_dir, '..')
if project_root not in sys.path:
    sys.path.append(project_root)
import irwa.loading as ild 
import irwa.preprocessing as ipp
import irwa.indexing as ind
import irwa.ranking as irk

# The following lines allow for autoreload of modules. They allow changes in modules without the need to reload the kernel.
%load_ext autoreload
%autoreload 2

# 1) Indexing

In [2]:
# Loading
file_path = '../data/farmers-protest-tweets.json'
tweets = ild.load_tweets_from_json(file_path)
print(f"Loaded {len(tweets)} tweets")

# Preprocessing
tweet_document_ids_map_df = "../data/tweet_document_ids_map.csv"
docid_to_tweetid, token_tweets = ipp.create_tokenized_dictionary(tweets, tweet_document_ids_map_df)
print(f"Loaded {len(token_tweets)} documents with their corresponding tokenized tweet content")

Loaded 117407 tweets
Loaded 48429 documents with their corresponding tokenized tweet content


In [3]:
inverted_index, tf, idf = ind.create_inverted_index_tf_idf(token_tweets)

In [5]:
# Definition of test queries
query1 = "Indian protest"      # Example given in handout
query2 = "support farmers"     # Example given in handout
query3 = "Delhi farmers"
query4 = "Government corrupt"
query5 = "president India"

#### Query 1

In [16]:
# Ranking results with TF-IDF
ranked_documents_1 = irk.rank_documents(ipp.build_terms(query1), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_1, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_9676: 1.418501666
Content: This is why Indian Farmers are protesting #FarmersProtest https://t.co/9mzFBGQaXL
------------------------------------------------------------
RESULT 2
Document doc_34729: 1.406224306875
Content: Indian farmers' protests: Why they matter to British Indians
#FarmersProtest  https://t.co/kyCWnDVyEm
------------------------------------------------------------
RESULT 3
Document doc_39111: 1.1820847216666666
Content: Indian farmers have right to peacefull protest #FarmersProtest
------------------------------------------------------------
RESULT 4
Document doc_30422: 1.1820847216666666
Content: Why are Indian farmers protesting against the government?
#FarmersProtest  https://t.co/eMUGoXtabZ
------------------------------------------------------------
RESULT 5
Document doc_33904: 1.1249794455000002
Content: Indian farmers' protests: Why they matter to British Indians


#### Query 2

In [17]:
# Ranking results with TF-IDF
ranked_documents_2 = irk.rank_documents(ipp.build_terms(query2), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_2, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_31878: 1.131033092
Content: Support farmers, support humanity #Farmersprotest
------------------------------------------------------------
RESULT 2
Document doc_38864: 1.131033092
Content: support farmers support #FarmersProtest 
#शहीद_जवान_शहीद_किसा
------------------------------------------------------------
RESULT 3
Document doc_45741: 1.0921162466666667
Content: Support Farmers 🙏🙏🙏🙏🙏🙏#FarmersProtest
------------------------------------------------------------
RESULT 4
Document doc_2815: 1.0921162466666667
Content: Support Farmers 🙏🙏🙏🙏🙏🙏#FarmersProtest
------------------------------------------------------------
RESULT 5
Document doc_30390: 1.0921162466666667
Content: Support farmers #FarmersProtest
------------------------------------------------------------


#### Query 3

In [18]:
# Ranking results with TF-IDF
ranked_documents_3 = irk.rank_documents(ipp.build_terms(query3), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_3, docid_to_tweetid, tweets, 5)

Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_14534: 1.1625731736363636
Content: Farmers are in Delhi for their rights, Delhi Police consider them as your fellow countrymen. #DPstopIntimidatingFarmers 
#FarmersProtest https://t.co/9VqzgkG0Sr
------------------------------------------------------------
RESULT 2
Document doc_29107: 0.9837157623076924
Content: Sadly he could not see the lakhs of farmers protesting outside Delhi when he was flying out of Delhi .

#IamAgainstModiGovt 
#FarmersProtest https://t.co/JHrtbfYjOg
------------------------------------------------------------
RESULT 3
Document doc_38281: 0.9837157623076924
Content: I blame the Delhi fog. Otherwise he would have seen protesting farmers when he flew out of Delhi this morning. #FarmersProtest https://t.co/DFMS52Ne0R
------------------------------------------------------------
RESULT 4
Document doc_30534: 0.9837157623076924
Content: Sadly he could not see the lakhs of 

#### Query 4

In [19]:
# Ranking results with TF-IDF
ranked_documents_4 = irk.rank_documents(ipp.build_terms(query4), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_4, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_14701: 3.4573308637499998
Content: Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/7QsxGwbWmr
------------------------------------------------------------
RESULT 2
Document doc_14680: 3.0731829899999994
Content: Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/Oxe9SDB01w
#DPstopIntimidatingFarmers
------------------------------------------------------------
RESULT 3
Document doc_14671: 3.0731829899999994
Content: #MSP_किसान_का_हक 
Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/aTNdwitLS9
------------------------------------------------------------
RESULT 4
Document doc_37261: 1.7815834961111108
Content: @rihanna Shame on India fake media and corrupt gove

#### Query 5

In [20]:
# Ranking results with TF-IDF
ranked_documents_5 = irk.rank_documents(ipp.build_terms(query5), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_5, docid_to_tweetid, tweets, 5)

Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_31745: 2.4150228049999996
Content: it's president's day. 

@joebiden any thoughts about what's been happening in india?? #farmersprotest #nofarmersnofood
------------------------------------------------------------
RESULT 2
Document doc_22798: 1.8112671037499997
Content: @POTUS @JoeBiden Mr. President please support Indian farmers #FarmersProtest democracy killed in India..
------------------------------------------------------------
RESULT 3
Document doc_30908: 1.7125492237499995
Content: US Lawyers write to President #Biden on #Farmers' Protests:
'Your administration comes to office at a time when minority communities across India are in peril,' notes the letter, urging the US President to act.
#FarmersProtest https://t.co/wlQVau3nkw
------------------------------------------------------------
RESULT 4
Document doc_27740: 1.1439581707894735
Content: I request US President @JoeBiden to pr

# Evaluation

In [10]:
query_to_ev_1 = "People's rights"
query_to_ev_2 = "Indian Government"

queryid2text = {
    1: query_to_ev_1,
    2: query_to_ev_2
}

## Given queries

In [44]:
scores_evq1 = irk.tf_idf(inverted_index, query_to_ev_1, token_tweets)
scores_evq2 = irk.tf_idf(inverted_index, query_to_ev_2, token_tweets)

In [52]:
# Convert the dictionaries to DataFrames
df_query_1 = pd.DataFrame(list(scores_evq1.items()), columns=['doc_id', 'doc_score'])
df_query_2 = pd.DataFrame(list(scores_evq2.items()), columns=['doc_id', 'doc_score'])

# Add a column for query_id
df_query_1['query_id'] = 1
df_query_2['query_id'] = 2

# Concatenate the two DataFrames
search_results = pd.concat([df_query_1, df_query_2], ignore_index=True)

In [53]:
evaluation = pd.read_csv("../data/evaluation_gt.csv", delimiter=";")
results = pd.merge(evaluation, search_results, how='left', left_on=['docId', 'query_id'], right_on=['doc_id', 'query_id'])
results.drop(columns=['doc_id'], inplace=True)
# Fill Nan with 0 as it means that is has not found any relevant score for such query
results.fillna(0, inplace=True)

#Rename columns for better usage
results.columns = ["doc_id", "query_id", "is_relevant", "predicted_relevance" ]
mean_predicted_relevance = results['predicted_relevance'].mean()
std_predicted_relevance = results['predicted_relevance'].std()

# Standardizing the predicted_relevance column
results['predicted_relevance'] = (results['predicted_relevance'] - mean_predicted_relevance) / std_predicted_relevance

In [54]:
for query in results['query_id'].unique():
    print(query)
    # Count relevant documents for the current query
    relevant_count = results[results['query_id'] == query]['is_relevant'].sum()
    
    print("-------------------------------------------------------------------------------------------")
    print(f"Out of {len(results[results['query_id'] == query])} documents, {relevant_count} are found relevant for query '{queryid2text.get(query, 'Unknown query')}'")
    print("-------------------------------------------------------------------------------------------")

1
-------------------------------------------------------------------------------------------
Out of 30 documents, 15 are found relevant for query 'people's rights'
-------------------------------------------------------------------------------------------
2
-------------------------------------------------------------------------------------------
Out of 30 documents, 15 are found relevant for query 'Indian Government'
-------------------------------------------------------------------------------------------


### Precision@K (P@K)

In [75]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float
    recall @k : float

    """
    order = np.argsort(y_score)[::-1]
    doc_score = doc_score[order[:k]]
    relevant = sum(doc_score == 1)
    precision = float(relevant) / k

    total_relevant = sum(doc_score)
    if total_relevant == 0:
        recall = 0
    else:
        recall = float(relevant) / total_relevant
    return precision, recall

In [69]:
# Assign the current query
current_query_res = results[search_results["query_id"] == 1]

  current_query_res = results[search_results["query_id"] == 1]


In [76]:
pre_k25, rec_k25 = precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], 25)
pre_k20, rec_k20 = precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], 20)


print("==> Precision@{}: {}\n".format(25, pre_k25))
print("==> Precision@{}: {}\n".format(20, pre_k20))

==> Precision@25: 0.92

==> Precision@20: 0.95



### Recall@K (R@k) 

In [78]:
print("==> Recall@{}: {}\n".format(25, rec_k25))
print("==> Recall@{}: {}\n".format(20, rec_k20))

==> Recall@25: 1.0

==> Recall@20: 1.0



### Avgerage Precision@K (P@K)

In [18]:
def avg_precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    average precision @k : floa
    """
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.

    prec_at_i = 0
    prec_at_i_list = []
    number_of_relevant = 0
    number_to_iterate = min(k, len(order))

    for i in range(number_to_iterate):
        if doc_score[order[i]] == 1:
            number_of_relevant += 1
            prec_at_i = number_of_relevant / (i + 1)
            prec_at_i_list.append(prec_at_i)

    if number_of_relevant == 0:
        return 0
    else:
      return np.sum(prec_at_i_list) / number_of_relevant

In [60]:
avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 25)

np.float64(0.9870474390134127)

In [61]:
from sklearn.metrics import average_precision_score

k = 25
temp = current_query_res.sort_values("predicted_relevance", ascending=False).head(k)
average_precision_score(np.array(temp["is_relevant"]), np.array(temp["predicted_relevance"][:k]))

np.float64(0.9860869565217392)

### F1-Score@K

In [80]:
def f1_score(precision, recall):

    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [87]:
print(f"F-1 Score for k = 20 is: {f1_score(pre_k20, rec_k20)}")
print(f"F-1 Score for k = 25 is: {f1_score(pre_k25, rec_k25)}")


F-1 Score for k = 20 is: 0.9743589743589743
F-1 Score for k = 25 is: 0.9583333333333334


### Mean Average Precision (MAP)

In [62]:
def map_at_k(search_res, k=10):
    """
    Parameters
    ----------
    search_res: search results dataset containing:
        query_id: query id.
        doc_id: document id.
        predicted_relevance: relevance predicted through LightGBM.
        doc_score: actual score of the document for the query (ground truth).

    Returns
    -------
    mean average precision @ k : float
    """
    avp = []
    for q in search_res["query_id"].unique():  # loop over all query id
        curr_data = search_res[search_res["query_id"] == q]  # select data for current query
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]),
                   np.array(curr_data["predicted_relevance"]), k))  #append average precision for current query
    return np.sum(avp) / len(avp), avp  # return mean average precision

In [63]:
map_k, avp = map_at_k(results, 25)
map_k

np.float64(0.9625440630797774)

### Mean Reciprocal Rank (MRR)

In [64]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    Reciprocal Rank for qurrent query
    """

    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    if np.sum(doc_score) == 0:  # if there are not relevant doument return 0
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)  # hint: to get the position of the first relevant document use "np.argmax"


In [65]:
k = 5
labels = np.array(results[results['query_id'] == 1]["is_relevant"])
scores = np.array(results[results['query_id'] == 1]["predicted_relevance"])
np.round(rr_at_k(labels, scores, 13), 4)

np.float64(1.0)

### Normalized Discounted Cumulative Gain (NDCG)

In [66]:
def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2 ** doc_score - 1  # Compute gain (use formula 7 above)
    discounts = np.log2(np.arange(len(doc_score)) + 2)  # Compute denominator
    return np.sum(gain / discounts)  #return dcg@k


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

In [67]:
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 1, ndcg_k))

ndcg@5 for query with query_id=1: 1.0


In [27]:
labels = np.array(results[results['query_id'] == 2]["is_relevant"])
scores = np.array(results[results['query_id'] == 2]["predicted_relevance"])
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 2, ndcg_k))

ndcg@5 for query with query_id=2: 1.0


# Own queries

## Defining Scores

### Precision@K (P@K)

### Recall@K (R@K)

### Average Precision@K (P@K)

### F1-Score@K

### Mean Average Precision (MAP)

### Mean Reciprocal Rank (MRR)

### Normalized Discounted Cumulative Gain (NDCG)

## T-SNE