<p style="font-size:78px">Final Project IRWA (2024-2025)</p>

# Part 3: Ranking

In [20]:
# Standard library imports
import os
import sys

# Third-party imports
import pandas as pd
import numpy as np

# Local application imports
# Make the project root (the parent of this notebook's directory) importable for
# the local `irwa` imports that follow.
# `__file__` is only defined when this code runs as a script/module; when it is
# absent (e.g. in a notebook kernel) fall back to the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
# Resolve to a canonical absolute path instead of leaving a trailing '..':
# this keeps sys.path clean and lets the membership check below recognise the
# directory when it is already present in its canonical form.
project_root = os.path.realpath(os.path.join(current_dir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)
import irwa.loading as ild 
import irwa.preprocessing as ipp
import irwa.indexing as ind
import irwa.ranking as irk
import irwa.evaluation as eva

# Enable autoreload so that edits to the imported modules are picked up automatically, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
# Loading
# --- Loading: read the raw tweets from the JSON dump ---
file_path = "../data/farmers-protest-tweets.json"
tweets = ild.load_tweets_from_json(file_path)
print(f"Loaded {len(tweets)} tweets")

# --- Preprocessing: build the tokenized documents ---
# NOTE: despite its `_df` suffix, this variable holds the CSV *path* (a string),
# not a DataFrame; the name is kept so later cells that use it keep working.
tweet_document_ids_map_df = "../data/tweet_document_ids_map.csv"
# The helper returns two values: a doc-id -> tweet-id mapping and the tokenized
# tweets (presumably keyed by document id -- confirm in irwa.preprocessing).
docid_to_tweetid, token_tweets = ipp.create_tokenized_dictionary(
    tweets, tweet_document_ids_map_df
)
print(f"Loaded {len(token_tweets)} documents with their corresponding tokenized tweet content")

Loaded 117407 tweets
Loaded 48429 documents with their corresponding tokenized tweet content


### Query: What is the indian protest?

In [15]:
# Free-text query used by the ranking cells below.
query = "What is the indian protest?"

## a) TF-IDF

In [16]:
# Create inverted index
# The indexing helper returns three values, unpacked here as the inverted index
# plus the `tf` and `idf` structures; all three are passed to the TF-IDF ranking
# call in the next cell. (Presumably term frequencies and inverse document
# frequencies -- confirm their exact shape in irwa.indexing.)
inverted_index, tf, idf = ind.create_inverted_index_tf_idf(token_tweets)

In [17]:
# Rank the tokenized tweets against the query with TF-IDF.
# The raw query string is first converted into terms by `ipp.build_terms`, and
# `irk.conjunctive_filtering` is supplied as the document-filtering strategy
# (presumably it keeps only documents containing every query term -- confirm
# in irwa.ranking).
ranked_documents_tf_idf = irk.rank_documents_tf_idf(
    ipp.build_terms(query),
    token_tweets,
    inverted_index,
    tf,
    idf,
    document_filtering=irk.conjunctive_filtering,
)

# Show the top 20 results together with the original tweet content.
irk.display_scores_tf_idf(ranked_documents_tf_idf, docid_to_tweetid, tweets, 20)

Top 20 Results:
------------------------------------------------------------
RESULT 1
Document doc_34729: 5.031061026856039
Content: Indian farmers' protests: Why they matter to British Indians
#FarmersProtest  https://t.co/kyCWnDVyEm
------------------------------------------------------------
RESULT 2
Document doc_19653: 4.865420574825832
Content: @PunYaab Farmers are Indian ... Each n every person in protest is Indian first ... So think before tweets against protesters 

#farmersprotest
------------------------------------------------------------
RESULT 3
Document doc_33904: 4.592709354323596
Content: Indian farmers' protests: Why they matter to British Indians

#FarmersStandingFirm #FarmersProtest #StandWithFarmers 

https://t.co/ywgPhLCvm9
------------------------------------------------------------
RESULT 4
Document doc_9676: 4.485696129303427
Content: This is why Indian Farmers are protesting #FarmersProtest https://t.co/9mzFBGQaXL
-----------------------------------------------

## b) Our score

In [38]:
# Compute the custom ranking score, taking the TF-IDF ranking as input.
# `alpha` and `k0`..`k3` are tuning weights forwarded unchanged to
# `irk.rank_documents_our_score`; their meaning is defined in irwa.ranking
# (TODO: document what each weight controls next to this call).
ranked_documents_our_score = irk.rank_documents_our_score(
    tweets,
    docid_to_tweetid,
    ranked_documents_tf_idf,
    alpha=0.15,
    k0=0.5,
    k1=0.5,
    k2=1,
    k3=0.5,
)

# The TF-IDF display helper is reused to show the top 20 custom-score results.
irk.display_scores_tf_idf(ranked_documents_our_score, docid_to_tweetid, tweets, 20)

Top 20 Results:
------------------------------------------------------------
RESULT 1
Document doc_23286: 1.0
Content: Arresting #ClimateChange activist #DishaRavi is a huge mistake by Indian police ! It has now taken the #FarmersProtest to another level &amp; audience ! 

The human rights of all activists &amp; protesters must be respected ! 

#FreeDishaRavi !
------------------------------------------------------------
RESULT 2
Document doc_37376: 0.20108788453315748
Content: Indian #climate activist, Disha Ravi, aged 22, has been arrested after sharing a document intended to help farmers protest against new agricultural laws @fridays_india #FarmersProtest #FarmersProtests via @BBCWorld https://t.co/WgvMNMhbnj
------------------------------------------------------------
RESULT 3
Document doc_14687: 0.19865092408436588
Content: Indian 
#FarmersProtest
Largest protest in the world..
Support Farmers
✊✊
#MSP_किसान_का_हक
#DPstopIntimidatingFarmers https://t.co/NA5kGo

## c) BM25

# Word2vec