<p style="font-size:78px">Final Project IRWA (2024-2025)</p>

<p style="font-size:48px">Part 2: Indexing and Evaluation</p>

In [1]:
# Standard library imports
import os
import sys

# Third-party imports
import pandas as pd
import numpy as np

# Local application imports
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
project_root = os.path.join(current_dir, '..')
if project_root not in sys.path:
    sys.path.append(project_root)
import irwa.loading as ild 
import irwa.preprocessing as ipp
import irwa.indexing as ind
import irwa.ranking as irk

# The following lines allow for autoreload of modules. They allow changes in modules without the need to reload the kernel.
%load_ext autoreload
%autoreload 2

# 1) Indexing

In [2]:
# Loading and preprocessing
file_path = '../data/farmers-protest-tweets.json'
tweets = ild.load_tweets_from_json(file_path)
print(f"Loaded {len(tweets)} tweets")
tweet_document_ids_map_df = "../data/tweet_document_ids_map.csv"
docid_to_tweetid, token_tweets = ipp.create_tokenized_dictionary(tweets, tweet_document_ids_map_df)
print(f"Loaded {len(token_tweets)} documents with their corresponding tokenized tweet content")

Loaded 117407 tweets
Loaded 48429 documents with their corresponding tokenized tweet content


In [3]:
# Inverted Index construction
inverted_index = ind.create_inverted_index(token_tweets)

In [4]:
# Definition of test queries
query1 = "indian protest"      # Example given in handout
query2 = "support farmers"     # Example given in handout
query3 = "delhi farmers"
query4 = "government corrupt"
query5 = "president india"

#### Query 1

In [5]:
# Ranking results with TF-IDF
scores_q1 = irk.tf_idf(inverted_index, query1, token_tweets)
irk.sort_scores_tf_idf(scores_q1, docid_to_tweetid, tweets, 5)

Top 5 Results:
Document doc_13095: 13.034242103125635
Content: INDIAN FARMERS are protesting in DELHI for last 3 months. 220+ farmers had died so far in #FarmersProtest .Protests are held all over the world to show solidarity with Indian Farmers.A protest will be held in Australia this Sunday.
#DPstopIntimidatingFarmers
@UNHumanRights
@bbc https://t.co/Ct5hqEEXRE
Document doc_445: 12.573739182581026
Content: Farmers Protest | Pawri Ho Rahi Hai 🌾
Dedicated to The 2020–2021 Indian farmers' protest. #FarmersProtest​ is an ongoing protest against three farm acts which were passed by the Parliament of India in Sep 2020. Millions of farmers are protesting in India.
https://t.co/cR5ltghf6X
Document doc_5374: 11.53260069180757
Content: @VP Dear madam,
Not only Indian farmers need justice but every Indian need justice who love democracy
please save Indian democracy and Indian constitution🙏🙏🙏
#FarmersProtest
Document doc_9022: 11.53260069180757
Content: #modi_rojgar_do - indian youth.
#FarmersPr

#### Query 2

In [6]:
# Ranking results with TF-IDF
scores_q2 = irk.tf_idf(inverted_index, query2, token_tweets)
irk.sort_scores_tf_idf(scores_q2, docid_to_tweetid, tweets, 5)

Top 5 Results:
Document doc_36673: 10.905015117394331
Content: We Support #FarmersProtest
We Support #GretaThunberg 
We Support #Rehanna
We Support #NodeepKaur
We Support #DishaRavi

#ReleaseDishaRavi #ReleaseNovdeepKaur
Document doc_3199: 10.718059609339209
Content: @dhruv_rathee Not a farmer
No fOd
Not a farmer
No greenery
Not a farmer
Not haPiness
Not a farmer
Not healthy
Not a farmer
No Humans and animals
Not a farmer
No employment,business,economy
Not a farmer
No progreS of the nation
If not farmer
Is the country not even the land
#FarmersProtest
Document doc_22998: 10.562281423938803
Content: I support farmers. Supporting farmers is not a anti national deed.Every citizen must support farmers. Please back 3 farm reform bill.#FarmersProtest
Document doc_33745: 10.562281423938803
Content: Increasingly, supporting farmers is becoming a crime. From Journalists to Activists, doesn't matter who you are, if you Support Farmers you will be attacked by the Government. Supporting Farmers Is

#### Query 3

In [7]:
# Ranking results with TF-IDF
scores_q3 = irk.tf_idf(inverted_index, query3, token_tweets)
irk.sort_scores_tf_idf(scores_q3, docid_to_tweetid, tweets, 5)

Top 5 Results:
Document doc_21757: 13.022516656851371
Content: The 'Delhi Chalo' farmers' protest at border points of New Delhi has entered the 84th day today. Thousands of farmers, especially from Punjab and Haryana, are staging a sit-in protest along Delhi borders.
 #RailRokoForFarmers #FarmersProtest
Document doc_941: 11.68275920568397
Content: @anilca95 @ArvindKejriwal Our honorable CM is busy with farmers from outside and handed over DELHI to #FarmersProtest He has no time or attention to problems of Delhi, yamuna continues to suffer, pollute and no govt has any policy or plan to #save yamuna! Sic of you Delhi political circles
Document doc_29928: 11.68275920568397
Content: Today, the 82nd day of the 'Delhi Chalo' demonstrations at New Delhi boundary areas. A sit-in protest along the Delhi border is being staged by thousands of farmers, 

#FarmersProtest  #DelhiChalo  #ProtestTopStories

https://t.co/O67STnJJ0M
Document doc_42270: 11.68275920568397
Content: #FarmersProtest     @Fa

#### Query 4

In [8]:
# Ranking results with TF-IDF
scores_q4 = irk.tf_idf(inverted_index, query4, token_tweets)
irk.sort_scores_tf_idf(scores_q4,docid_to_tweetid, tweets, 5)

Top 5 Results:
Document doc_4328: 14.847119073774913
Content: Nothing different between British government and Modi Government. I think British government was more sensitive than this Modi Government because they repealed the laws but this government is too much ignorant.
#FarmersProtest
#DPstopIntimidatingFarmers
#Pagdi_Sambhal_Jatta
Document doc_37665: 14.45743958052082
Content: Good news for Indian, bad news for Fake #FarmersProtest corrupt #DhruvRathee corrupt #BarkhaDutt corrupt @ndtv Antinational #Sikh #Khalistanis https://t.co/FUeSUyjII8
Document doc_14671: 12.60771686843553
Content: #MSP_किसान_का_हक 
Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/aTNdwitLS9
Document doc_14680: 12.60771686843553
Content: Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/Oxe9SDB01w
#DPstopIntimidatingFarmers
Document doc_14701: 12.607716868

#### Query 5

In [9]:
# Ranking results with TF-IDF
scores_q5 = irk.tf_idf(inverted_index, query5, token_tweets)
irk.sort_scores_tf_idf(scores_q5, docid_to_tweetid, tweets, 5)

Top 5 Results:
Document doc_30908: 14.602910470127531
Content: US Lawyers write to President #Biden on #Farmers' Protests:
'Your administration comes to office at a time when minority communities across India are in peril,' notes the letter, urging the US President to act.
#FarmersProtest https://t.co/wlQVau3nkw
Document doc_30305: 13.059745520898112
Content: India doesn't give a shit about minorities. India doesn't give a shit about minorities. India doesn't give a shit about minorities. India doesn't give a shit about minorities. India doesn't give a shit about minorities. India doesn't give a shit about minorities. #FarmersProtest
Document doc_15774: 12.426286216644513
Content: @mausamii2u @sardesairajdeep @iwpcdelhi @AishPaliwal @kamaljitsandhu Iss tuchha jeevi.. very own ? Amazing 👏

He is "Liar-In-Chief". Has been taken 2 task by many including Ex President Pranab da, current President Kovind. Got suspended frm his own employer for his blatant lies during #FarmersProtest Shame

@

In [58]:
query_to_ev_1 = "people's rights"
query_to_ev_2 = "Indian Government"

queryid2text = {
    1: query_to_ev_1,
    2: query_to_ev_2
}

In [40]:
scores_evq1 = irk.tf_idf(inverted_index, query_to_ev_1, token_tweets)
scores_evq2 = irk.tf_idf(inverted_index, query_to_ev_2, token_tweets)

In [41]:
# Convert the dictionaries to DataFrames
df_query_1 = pd.DataFrame(list(scores_evq1.items()), columns=['doc_id', 'doc_score'])
df_query_2 = pd.DataFrame(list(scores_evq2.items()), columns=['doc_id', 'doc_score'])

# Add a column for query_id
df_query_1['query_id'] = 1
df_query_2['query_id'] = 2

# Concatenate the two DataFrames
search_results = pd.concat([df_query_1, df_query_2], ignore_index=True)

In [87]:
evaluation = pd.read_csv("../data/evaluation_gt.csv", delimiter=";")
results = pd.merge(evaluation, search_results, how='left', left_on=['docId', 'query_id'], right_on=['doc_id', 'query_id'])
results.drop(columns=['doc_id'], inplace=True)
results.dropna(inplace=True)

#Rename columns for better usage
results.columns = ["doc_id", "query_id", "is_relevant", "predicted_relevance" ]
mean_predicted_relevance = results['predicted_relevance'].mean()
std_predicted_relevance = results['predicted_relevance'].std()

# Standardizing the predicted_relevance column
results['predicted_relevance'] = (results['predicted_relevance'] - mean_predicted_relevance) / std_predicted_relevance
results.head(5)


Unnamed: 0,doc_id,query_id,is_relevant,predicted_relevance
0,doc_156,1.0,0.0,-1.054902
1,doc_1039,1.0,0.0,-1.054902
2,doc_1047,1.0,1.0,0.372815
3,doc_1685,1.0,0.0,-1.054902
4,doc_2100,1.0,1.0,1.787434


In [88]:
for query in results['query_id'].unique():
    # Count relevant documents for the current query
    relevant_count = results[results['query_id'] == query]['is_relevant'].sum()
    
    print("-------------------------------------------------------------------------------------------")
    print(f"Out of {len(results[results['query_id'] == query])} documents, {relevant_count} are found relevant for query '{queryid2text.get(query, 'Unknown query')}'")
    print("-------------------------------------------------------------------------------------------")

-------------------------------------------------------------------------------------------
Out of 30 documents, 15.0 are found relevant for query 'people's rights'
-------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------
Out of 26 documents, 15.0 are found relevant for query 'Indian Government'
-------------------------------------------------------------------------------------------


### Precision@K (P@K)

In [89]:
def precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    precision @k : float

    """
    order = np.argsort(y_score)[::-1]
    #doc_score = np.take(doc_score, order[:k])
    doc_score = doc_score[order[:k]]
    relevant = sum(doc_score == 1)
    return float(relevant) / k

In [90]:
# Assign the current query
current_query_res = results[search_results["query_id"] == 1]

  current_query_res = results[search_results["query_id"] == 1]


In [91]:
k = 25
print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))

k = 20
print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))

==> Precision@25: 0.92

==> Precision@20: 0.95



### Avgerage Precision@K (P@K)

In [92]:
def avg_precision_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    average precision @k : floa
    """
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.

    prec_at_i = 0
    prec_at_i_list = []
    number_of_relevant = 0
    number_to_iterate = min(k, len(order))

    for i in range(number_to_iterate):
        if doc_score[order[i]] == 1:
            number_of_relevant += 1
            prec_at_i = number_of_relevant / (i + 1)
            prec_at_i_list.append(prec_at_i)

    if number_of_relevant == 0:
        return 0
    else:
      return np.sum(prec_at_i_list) / number_of_relevant

In [93]:
avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 25)

np.float64(0.9870474390134127)

In [94]:
from sklearn.metrics import average_precision_score

k = 25
temp = current_query_res.sort_values("predicted_relevance", ascending=False).head(k)
average_precision_score(np.array(temp["is_relevant"]), np.array(temp["predicted_relevance"][:k]))

np.float64(0.9860869565217392)

### Mean Average Precision (MAP)

In [95]:
def map_at_k(search_res, k=10):
    """
    Parameters
    ----------
    search_res: search results dataset containing:
        query_id: query id.
        doc_id: document id.
        predicted_relevance: relevance predicted through LightGBM.
        doc_score: actual score of the document for the query (ground truth).

    Returns
    -------
    mean average precision @ k : float
    """
    avp = []
    for q in search_res["query_id"].unique():  # loop over all query id
        curr_data = search_res[search_res["query_id"] == q]  # select data for current query
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]),
                   np.array(curr_data["predicted_relevance"]), k))  #append average precision for current query
    return np.sum(avp) / len(avp), avp  # return mean average precision

In [96]:
map_k, avp = map_at_k(results, 25)
map_k

np.float64(0.9625440630797774)

### Mean Reciprocal Rank (MRR)

In [97]:
def rr_at_k(doc_score, y_score, k=10):
    """
    Parameters
    ----------
    doc_score: Ground truth (true relevance labels).
    y_score: Predicted scores.
    k : number of doc to consider.

    Returns
    -------
    Reciprocal Rank for qurrent query
    """

    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    if np.sum(doc_score) == 0:  # if there are not relevant doument return 0
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)  # hint: to get the position of the first relevant document use "np.argmax"


In [111]:
k = 5
labels = np.array(results[results['query_id'] == 1]["is_relevant"])
scores = np.array(results[results['query_id'] == 1]["predicted_relevance"])
np.round(rr_at_k(labels, scores, 13), 4)

np.float64(1.0)

### Normalized Discounted Cumulative Gain (NDCG)

In [112]:
def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2 ** doc_score - 1  # Compute gain (use formula 7 above)
    discounts = np.log2(np.arange(len(doc_score)) + 2)  # Compute denominator
    return np.sum(gain / discounts)  #return dcg@k


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

In [113]:
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 1, ndcg_k))

ndcg@5 for query with query_id=1: 1.0


In [114]:
labels = np.array(results[results['query_id'] == 2]["is_relevant"])
scores = np.array(results[results['query_id'] == 2]["predicted_relevance"])
ndcg_k = np.round(ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 2, ndcg_k))

ndcg@5 for query with query_id=2: 1.0
