<p style="font-size:78px">Final Project IRWA (2024-2025)</p>

<p style="font-size:48px">Part 2: Indexing and Evaluation</p>

In [67]:
# Standard library imports
import os
import sys

# Third-party imports
import pandas as pd
import numpy as np

# Local application imports
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
project_root = os.path.join(current_dir, '..')
if project_root not in sys.path:
    sys.path.append(project_root)
import irwa.loading as ild 
import irwa.preprocessing as ipp
import irwa.indexing as ind
import irwa.ranking as irk
import irwa.evaluation as eva
#from sklearn.metrics import average_precision_score

# The following lines allow for autoreload of modules. They allow changes in modules without the need to reload the kernel.
%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'sklearn'

# 1) Indexing

In [2]:
# Loading
file_path = '../data/farmers-protest-tweets.json'
tweets = ild.load_tweets_from_json(file_path)
print(f"Loaded {len(tweets)} tweets")

# Preprocessing
tweet_document_ids_map_df = "../data/tweet_document_ids_map.csv"
docid_to_tweetid, token_tweets = ipp.create_tokenized_dictionary(tweets, tweet_document_ids_map_df)
print(f"Loaded {len(token_tweets)} documents with their corresponding tokenized tweet content")

Loaded 117407 tweets
Loaded 48429 documents with their corresponding tokenized tweet content


In [3]:
inverted_index, tf, idf = ind.create_inverted_index_tf_idf(token_tweets)

In [5]:
# Definition of test queries
query1 = "Indian protest"      # Example given in handout
query2 = "support farmers"     # Example given in handout
query3 = "Delhi farmers"
query4 = "Government corrupt"
query5 = "president India"

#### Query 1

In [16]:
# Ranking results with TF-IDF
ranked_documents_1 = irk.rank_documents(ipp.build_terms(query1), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_1, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_9676: 1.418501666
Content: This is why Indian Farmers are protesting #FarmersProtest https://t.co/9mzFBGQaXL
------------------------------------------------------------
RESULT 2
Document doc_34729: 1.406224306875
Content: Indian farmers' protests: Why they matter to British Indians
#FarmersProtest  https://t.co/kyCWnDVyEm
------------------------------------------------------------
RESULT 3
Document doc_39111: 1.1820847216666666
Content: Indian farmers have right to peacefull protest #FarmersProtest
------------------------------------------------------------
RESULT 4
Document doc_30422: 1.1820847216666666
Content: Why are Indian farmers protesting against the government?
#FarmersProtest  https://t.co/eMUGoXtabZ
------------------------------------------------------------
RESULT 5
Document doc_33904: 1.1249794455000002
Content: Indian farmers' protests: Why they matter to British Indians


#### Query 2

In [17]:
# Ranking results with TF-IDF
ranked_documents_2 = irk.rank_documents(ipp.build_terms(query2), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_2, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_31878: 1.131033092
Content: Support farmers, support humanity #Farmersprotest
------------------------------------------------------------
RESULT 2
Document doc_38864: 1.131033092
Content: support farmers support #FarmersProtest 
#शहीद_जवान_शहीद_किसा
------------------------------------------------------------
RESULT 3
Document doc_45741: 1.0921162466666667
Content: Support Farmers 🙏🙏🙏🙏🙏🙏#FarmersProtest
------------------------------------------------------------
RESULT 4
Document doc_2815: 1.0921162466666667
Content: Support Farmers 🙏🙏🙏🙏🙏🙏#FarmersProtest
------------------------------------------------------------
RESULT 5
Document doc_30390: 1.0921162466666667
Content: Support farmers #FarmersProtest
------------------------------------------------------------


#### Query 3

In [18]:
# Ranking results with TF-IDF
ranked_documents_3 = irk.rank_documents(ipp.build_terms(query3), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_3, docid_to_tweetid, tweets, 5)

Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_14534: 1.1625731736363636
Content: Farmers are in Delhi for their rights, Delhi Police consider them as your fellow countrymen. #DPstopIntimidatingFarmers 
#FarmersProtest https://t.co/9VqzgkG0Sr
------------------------------------------------------------
RESULT 2
Document doc_29107: 0.9837157623076924
Content: Sadly he could not see the lakhs of farmers protesting outside Delhi when he was flying out of Delhi .

#IamAgainstModiGovt 
#FarmersProtest https://t.co/JHrtbfYjOg
------------------------------------------------------------
RESULT 3
Document doc_38281: 0.9837157623076924
Content: I blame the Delhi fog. Otherwise he would have seen protesting farmers when he flew out of Delhi this morning. #FarmersProtest https://t.co/DFMS52Ne0R
------------------------------------------------------------
RESULT 4
Document doc_30534: 0.9837157623076924
Content: Sadly he could not see the lakhs of 

#### Query 4

In [19]:
# Ranking results with TF-IDF
ranked_documents_4 = irk.rank_documents(ipp.build_terms(query4), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_4, docid_to_tweetid, tweets, 5)


Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_14701: 3.4573308637499998
Content: Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/7QsxGwbWmr
------------------------------------------------------------
RESULT 2
Document doc_14680: 3.0731829899999994
Content: Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/Oxe9SDB01w
#DPstopIntimidatingFarmers
------------------------------------------------------------
RESULT 3
Document doc_14671: 3.0731829899999994
Content: #MSP_किसान_का_हक 
Corruption Corruption thats what this Government will be Remembered for #DPstopIntimidatingFarmers
#FarmersProtest https://t.co/aTNdwitLS9
------------------------------------------------------------
RESULT 4
Document doc_37261: 1.7815834961111108
Content: @rihanna Shame on India fake media and corrupt gove

#### Query 5

In [20]:
# Ranking results with TF-IDF
ranked_documents_5 = irk.rank_documents(ipp.build_terms(query5), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
irk.display_scores_tf_idf(ranked_documents_5, docid_to_tweetid, tweets, 5)

Top 5 Results:
------------------------------------------------------------
RESULT 1
Document doc_31745: 2.4150228049999996
Content: it's president's day. 

@joebiden any thoughts about what's been happening in india?? #farmersprotest #nofarmersnofood
------------------------------------------------------------
RESULT 2
Document doc_22798: 1.8112671037499997
Content: @POTUS @JoeBiden Mr. President please support Indian farmers #FarmersProtest democracy killed in India..
------------------------------------------------------------
RESULT 3
Document doc_30908: 1.7125492237499995
Content: US Lawyers write to President #Biden on #Farmers' Protests:
'Your administration comes to office at a time when minority communities across India are in peril,' notes the letter, urging the US President to act.
#FarmersProtest https://t.co/wlQVau3nkw
------------------------------------------------------------
RESULT 4
Document doc_27740: 1.1439581707894735
Content: I request US President @JoeBiden to pr

# Evaluation

In [22]:
query_to_ev_1 = "People's rights"
query_to_ev_2 = "Indian Government"

queryid2text = {
    1: query_to_ev_1,
    2: query_to_ev_2
}

## Given queries

In [25]:
scores_evq1 = irk.rank_documents(ipp.build_terms(query_to_ev_1), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)
scores_evq2 = irk.rank_documents(ipp.build_terms(query_to_ev_2), token_tweets, inverted_index, tf, idf, filter=irk.conjunctive_filtering)

[('doc_11633', np.float64(1.0880939042857143)), ('doc_44224', np.float64(0.9520821662500001)), ('doc_47221', np.float64(0.9520821662500001)), ('doc_46347', np.float64(0.8815411888461537)), ('doc_31420', np.float64(0.8462952588888888)), ('doc_2732', np.float64(0.8462952588888888)), ('doc_40001', np.float64(0.8462952588888888)), ('doc_18881', np.float64(0.8462952588888888)), ('doc_16748', np.float64(0.8462952588888888)), ('doc_14469', np.float64(0.8424008744444444)), ('doc_4053', np.float64(0.8185739610714285)), ('doc_45806', np.float64(0.8185739610714285)), ('doc_16330', np.float64(0.8135668953571429)), ('doc_27566', np.float64(0.8135668953571429)), ('doc_2100', np.float64(0.8135668953571429)), ('doc_41111', np.float64(0.7640023636666666)), ('doc_12676', np.float64(0.7640023636666666)), ('doc_27577', np.float64(0.7640023636666666)), ('doc_21418', np.float64(0.7616657330000001)), ('doc_13361', np.float64(0.7616657330000001)), ('doc_10184', np.float64(0.7593291023333333)), ('doc_10185', n

In [42]:
# Convert the lists to DataFrames
df_query_1 = pd.DataFrame(scores_evq1, columns=['doc_id', 'score'])
df_query_2 = pd.DataFrame(scores_evq2, columns=['doc_id', 'score'])

# Add a column for query_id
df_query_1['query_id'] = 1
df_query_2['query_id'] = 2

# Concatenate the two DataFrames
search_results = pd.concat([df_query_1, df_query_2], ignore_index=True)
search_results = search_results[["query_id", "doc_id", "score"]] #reorder cols

evaluation = pd.read_csv("../data/evaluation_gt.csv", delimiter=";")
results = pd.merge(evaluation, search_results, how='left', left_on=['docId', 'query_id'], right_on=['doc_id', 'query_id'])
results.drop(columns=['doc_id'], inplace=True)
# Fill Nan with 0 as it means that is has not found any relevant score for such query
results.fillna(0, inplace=True)

#Rename columns for better usage
results.columns = ["doc_id", "query_id", "is_relevant", "predicted_relevance"]
results = results[["query_id", "doc_id", "is_relevant", "predicted_relevance"]]

In [40]:
for query in results['query_id'].unique():
    print(query)
    # Count relevant documents for the current query
    relevant_count = results[results['query_id'] == query]['is_relevant'].sum()
    
    print("-------------------------------------------------------------------------------------------")
    print(f"Out of {len(results[results['query_id'] == query])} documents, {relevant_count} are found relevant for query '{queryid2text.get(query, 'Unknown query')}'")
    print("-------------------------------------------------------------------------------------------")

1
-------------------------------------------------------------------------------------------
Out of 30 documents, 15 are found relevant for query 'People's rights'
-------------------------------------------------------------------------------------------
2
-------------------------------------------------------------------------------------------
Out of 30 documents, 15 are found relevant for query 'Indian Government'
-------------------------------------------------------------------------------------------


### Precision@K (P@K)

In [69]:
# Assign the current query
current_query_res = results[search_results["query_id"] == 1]

  current_query_res = results[search_results["query_id"] == 1]


In [76]:
pre_k25, rec_k25 = eva.precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], 25)
pre_k20, rec_k20 = eva.precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], 20)


print("==> Precision@{}: {}\n".format(25, pre_k25))
print("==> Precision@{}: {}\n".format(20, pre_k20))

==> Precision@25: 0.92

==> Precision@20: 0.95



### Recall@K (R@k) 

In [78]:
print("==> Recall@{}: {}\n".format(25, rec_k25))
print("==> Recall@{}: {}\n".format(20, rec_k20))

==> Recall@25: 1.0

==> Recall@20: 1.0



### Avgerage Precision@K (P@K)

In [60]:
eva.avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 25)

np.float64(0.9870474390134127)

In [61]:
k = 25
temp = current_query_res.sort_values("predicted_relevance", ascending=False).head(k)
average_precision_score(np.array(temp["is_relevant"]), np.array(temp["predicted_relevance"][:k]))

np.float64(0.9860869565217392)

### F1-Score@K

In [87]:
print(f"F-1 Score for k = 20 is: {eva.f1_score(pre_k20, rec_k20)}")
print(f"F-1 Score for k = 25 is: {eva.f1_score(pre_k25, rec_k25)}")


F-1 Score for k = 20 is: 0.9743589743589743
F-1 Score for k = 25 is: 0.9583333333333334


### Mean Average Precision (MAP)

In [63]:
map_k, avp = eva.map_at_k(results, 25)
map_k

np.float64(0.9625440630797774)

### Mean Reciprocal Rank (MRR)

In [65]:
k = 5
labels = np.array(results[results['query_id'] == 1]["is_relevant"])
scores = np.array(results[results['query_id'] == 1]["predicted_relevance"])
np.round(eva.rr_at_k(labels, scores, 13), 4)

np.float64(1.0)

### Normalized Discounted Cumulative Gain (NDCG)

In [67]:
ndcg_k = np.round(eva.ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 1, ndcg_k))

ndcg@5 for query with query_id=1: 1.0


In [27]:
labels = np.array(results[results['query_id'] == 2]["is_relevant"])
scores = np.array(results[results['query_id'] == 2]["predicted_relevance"])
ndcg_k = np.round(eva.ndcg_at_k(labels, scores, k), 4)
print("ndcg@{} for query with query_id={}: {}".format(k, 2, ndcg_k))

ndcg@5 for query with query_id=2: 1.0


# Own queries

## Defining Scores

In [None]:
query1 = "Indian protest"      # Example given in handout
query2 = "support farmers"     # Example given in handout
query3 = "Delhi farmers"
query4 = "Government corrupt"
query5 = "president India"

# Convert the lists to DataFrames
df_q1 = pd.DataFrame(ranked_documents_1, columns=['doc_id', 'score'])
df_q2 = pd.DataFrame(ranked_documents_2, columns=['doc_id', 'score'])
df_q3 = pd.DataFrame(ranked_documents_3, columns=['doc_id', 'score'])
df_q4 = pd.DataFrame(ranked_documents_4, columns=['doc_id', 'score'])
df_q5 = pd.DataFrame(ranked_documents_5, columns=['doc_id', 'score'])


# Add a column for query_id
df_q1['query_id'] = 1
df_q2['query_id'] = 2
df_q3['query_id'] = 3
df_q4['query_id'] = 4
df_q5['query_id'] = 5

# Concatenate the two DataFrames
search_results = pd.concat([df_q1, df_q2, df_q3, df_q4, df_q5], ignore_index=True)
search_results = search_results[["query_id", "doc_id", "score"]] #reorder cols

evaluation = pd.read_csv("../data/evaluation_gt_5queries.csv", delimiter=";")
results = pd.merge(evaluation, search_results, how='left', left_on=['docId', 'query_id'], right_on=['doc_id', 'query_id'])
results.drop(columns=['doc_id'], inplace=True)
# Fill Nan with 0 as it means that is has not found any relevant score for such query
results.fillna(0, inplace=True)

#Rename columns for better usage
results.columns = ["doc_id", "query_id", "is_relevant", "predicted_relevance"]
results = results[["query_id", "doc_id", "is_relevant", "predicted_relevance"]]

### Precision@K (P@K)

### Recall@K (R@K)

### Average Precision@K (P@K)

### F1-Score@K

### Mean Average Precision (MAP)

### Mean Reciprocal Rank (MRR)

## T-SNE