<p style="font-size:78px">Final Project IRWA (2024-2025)</p>

# Part 3: Ranking

## 0) Loading and setup

In [None]:
# Standard library imports
import os
import sys

# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Local application imports
# Resolve the directory this code runs from: the file's own directory when
# `__file__` is defined, otherwise the current working directory.
if '__file__' in locals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# The parent of that directory is added to the import path (once) so that the
# `irwa` package imported below can be found.
project_root = os.path.join(current_dir, '..')
if project_root not in sys.path:
    sys.path.append(project_root)
import irwa.loading as ild 
import irwa.preprocessing as ipp
import irwa.indexing as ind
import irwa.ranking as irk
import irwa.evaluation as eva
import irwa.saving as sa
from gensim.models.word2vec import Word2Vec

# The following lines allow for autoreload of modules. They allow changes in modules without the need to reload the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Loading: read the raw tweets from the JSON dump (path is relative to the working directory)
file_path = '../data/farmers-protest-tweets.json'
tweets = ild.load_tweets_from_json(file_path)
print(f"Loaded {len(tweets)} tweets")

# Preprocessing: tokenize the tweets.
# NOTE(review): despite the `_df` suffix, this variable holds the CSV *path* string, not a DataFrame.
tweet_document_ids_map_df = "../data/tweet_document_ids_map.csv"
# Two objects come back: `token_tweets` (documents with their tokenized tweet content) and
# `docid_to_tweetid` (presumably a document-id -> tweet-id mapping; verify against irwa.preprocessing).
docid_to_tweetid, token_tweets = ipp.create_tokenized_dictionary(tweets, tweet_document_ids_map_df)
print(f"Loaded {len(token_tweets)} documents with their corresponding tokenized tweet content")

In [None]:
# The five evaluation queries, in order; each one is also bound to its own
# `queryN` name because later cells refer to them individually.
queries = [
    "What is the indian protest?",
    "Where to support the farmers?",
    "Who are the Delhi farmers?",
    "Is the government corrupt?",
    "What do farmers fight for?",
]
query1, query2, query3, query4, query5 = queries

## 1) Tf-idf with cosine similarity, custom score and BM25 

### a) Tf-idf

In [None]:
# Create inverted index from the tokenized tweets.
# The helper returns three objects, bound here as `inverted_index`, `tf` and `idf`;
# all three are passed to the TF-IDF and BM25 rankers below.
inverted_index, tf, idf = ind.create_inverted_index_tf_idf(token_tweets)

In [None]:
# Rank the documents for query1 with TF-IDF, using irk.conjunctive_filtering
# as the document filter, then display the results (display argument: 20).
ranked_documents_tf_idf = irk.rank_documents_tf_idf(
    ipp.build_terms(query1),
    token_tweets,
    inverted_index,
    tf,
    idf,
    document_filtering=irk.conjunctive_filtering,
)
irk.display_scores_tf_idf(ranked_documents_tf_idf, docid_to_tweetid, tweets, 20)

# Persist the ranking; the comparison section reloads it from this CSV.
sa.save_scores_to_csv(ranked_documents_tf_idf, filename="../data/tf_idf_ranking.csv")

### b) Our score

In [None]:
# Re-score the TF-IDF ranking with the custom score, alpha = 0.15.
ranked_documents_our_score_15 = irk.rank_documents_our_score(
    tweets,
    docid_to_tweetid,
    ranked_documents_tf_idf,
    alpha=0.15,
    k0=0.5,
    k1=0.5,
    k2=1,
    k3=0.5,
)
irk.display_scores_tf_idf(ranked_documents_our_score_15, docid_to_tweetid, tweets, 20)

# Persist the ranking; the comparison section reloads it from this CSV.
sa.save_scores_to_csv(ranked_documents_our_score_15, filename="../data/our_score15_ranking.csv")

In [None]:
# Re-score the TF-IDF ranking with the custom score, alpha = 0.7
# (same k0..k3 weights as the alpha = 0.15 run).
ranked_documents_our_score_70 = irk.rank_documents_our_score(
    tweets,
    docid_to_tweetid,
    ranked_documents_tf_idf,
    alpha=0.7,
    k0=0.5,
    k1=0.5,
    k2=1,
    k3=0.5,
)
irk.display_scores_tf_idf(ranked_documents_our_score_70, docid_to_tweetid, tweets, 20)

# Persist the ranking; the comparison section reloads it from this CSV.
sa.save_scores_to_csv(ranked_documents_our_score_70, filename="../data/our_score70_ranking.csv")

### c) BM25

#### b = 0.7

In [None]:
# Rank the documents for query1 with BM25.
# NOTE(review): `b` is not passed here, so the function's default is used —
# confirm that the default of irk.rank_documents_bm25 matches the b = 0.7 heading.
ranked_documents_bm25_b7 = irk.rank_documents_bm25(
    ipp.build_terms(query1),
    token_tweets,
    inverted_index,
    tf,
    idf,
)
irk.display_scores_tf_idf(ranked_documents_bm25_b7, docid_to_tweetid, tweets, 20)

# Persist the ranking; the comparison section reloads it from this CSV.
sa.save_scores_to_csv(ranked_documents_bm25_b7, filename="../data/bm25_ranking_b7.csv")

#### b = 0.15

In [None]:
# Rank the documents for query1 with BM25, explicitly setting b = 0.15.
ranked_documents_bm25_b15 = irk.rank_documents_bm25(
    ipp.build_terms(query1),
    token_tweets,
    inverted_index,
    tf,
    idf,
    b=0.15,
)
irk.display_scores_tf_idf(ranked_documents_bm25_b15, docid_to_tweetid, tweets, 20)

# Persist the ranking; the comparison section reloads it from this CSV.
sa.save_scores_to_csv(ranked_documents_bm25_b15, filename="../data/bm25_ranking_b15.csv")

### Comparison

In [None]:
# Read the CSV files written by the ranking cells above and keep only the
# first 20 rows of each one (the top-20 window that the comparison plots use).
tfidf_df = pd.read_csv("../data/tf_idf_ranking.csv").head(20)
our_score_15_df = pd.read_csv("../data/our_score15_ranking.csv").head(20)
our_score_70_df = pd.read_csv("../data/our_score70_ranking.csv").head(20)
bm25_b7_df = pd.read_csv("../data/bm25_ranking_b7.csv").head(20)
bm25_b15_df = pd.read_csv("../data/bm25_ranking_b15.csv").head(20)

In [None]:
def plot_rank_comparison(dfs, score_cols, token_tweets, top_n=20):
    """
    Plots accumulated position points for documents across selected scoring methods.

    Every selected ranking awards `top_n` points to its first document,
    `top_n - 1` to its second, and so on; a document that does not appear in a
    ranking receives 0 points from it. The documents are then drawn as stacked
    horizontal bars, ordered by their total points, and every bar segment is
    annotated with the rank position the document held in that method.

    The input dataframes are never modified, so the function can be called
    repeatedly with the same `dfs` dictionary.

    Parameters:
    - dfs: Dictionary where keys are method names and values are dataframes whose
      rows are already sorted best-first and that have a 'Document ID' column
      (a column already named 'document_id' is accepted as well).
    - score_cols: List of keys from dfs to specify which methods to compare.
    - token_tweets: Mapping from document id to its token list; the token count
      is shown in parentheses next to each document label.
    - top_n: Size of the ranking window and number of points given to the first
      position. Rankings longer than `top_n` are truncated; shorter ones are
      accepted and simply award fewer positions.

    Raises:
    - ValueError: if `score_cols` is empty or `top_n` is smaller than 1.
    - KeyError: if a name in `score_cols` is missing from `dfs`, a dataframe has
      no document-id column, or a document id is missing from `token_tweets`.
    """
    if not score_cols:
        raise ValueError("score_cols must contain at least one method name")
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Build one fresh (document_id, points) frame per selected method. Working on
    # copies keeps the caller's dataframes (and their score columns) untouched.
    per_method = []
    for name in score_cols:
        ranking = dfs[name].rename(columns={'Document ID': 'document_id'}).head(top_n)
        points = ranking[['document_id']].reset_index(drop=True)
        # Rank position points: 1st = top_n, 2nd = top_n - 1, ...
        points[f'points_{name}'] = range(top_n, top_n - len(points), -1)
        per_method.append(points)

    # Outer-merge so that a document present in any ranking is kept
    merged_df = per_method[0]
    for points in per_method[1:]:
        merged_df = merged_df.merge(points, on='document_id', how='outer')

    # Documents missing from a ranking get 0 points for that method
    point_cols = [f'points_{name}' for name in score_cols]
    merged_df[point_cols] = merged_df[point_cols].fillna(0)

    # Order documents by their accumulated points, best first
    merged_df = (
        merged_df
        .assign(total_points=merged_df[point_cols].sum(axis=1))
        .sort_values(by='total_points', ascending=False)
    )

    colors = ['skyblue', 'lightgreen', 'salmon', 'plum', 'orange']  # Cycled if more than 5 methods
    color_map = {name: colors[i % len(colors)] for i, name in enumerate(score_cols)}

    # Labels of the form "<document id> (<number of tokens>)"
    formatted_documents = [f"{doc} ({len(token_tweets[doc])})" for doc in merged_df['document_id']]

    fig, ax = plt.subplots(figsize=(9, 7))

    # Stack the per-method points; `left_values` is where the next segment starts
    left_values = pd.Series(0.0, index=merged_df.index)
    for name in score_cols:
        widths = merged_df[f'points_{name}']
        bars = ax.barh(formatted_documents, widths, left=left_values, color=color_map[name], label=name)

        # Annotate each non-empty segment with the rank position it stands for
        for bar, value in zip(bars, widths):
            if value > 0:
                rank_position = top_n + 1 - value
                ax.text(
                    bar.get_x() + bar.get_width() / 2,
                    bar.get_y() + bar.get_height() / 2,
                    f'{int(rank_position)}',
                    ha='center', va='center', color='black', fontsize=8, weight='bold'
                )

        left_values = left_values + widths

    # Add legend and labels
    ax.set_xlabel('Total Points (Based on Ranking Position)')
    ax.set_title(f'Accumulated Position Points by Document in Selected Top {top_n} Rankings')
    ax.legend(loc='lower right')

    ax.invert_yaxis()  # Highest accumulated score at the top
    fig.tight_layout()
    plt.show()

In [None]:
# Dataframes in a dictionary with custom names (the keys are the method names
# passed to plot_rank_comparison and shown in the plot legend)
dfs = {
    'tfidf': tfidf_df,
    'our_score_15': our_score_15_df,
    'our_score_70': our_score_70_df,
    'bm25_b7': bm25_b7_df,
    'bm25_b15': bm25_b15_df

}

# Plot all five rankings together: TF-IDF, both BM25 variants and both custom-score variants
plot_rank_comparison(dfs, ['tfidf','bm25_b7','bm25_b15','our_score_15', 'our_score_70'], token_tweets)

In [None]:
# Plot only TF-IDF and the two BM25 variants
plot_rank_comparison(dfs, ['tfidf','bm25_b7', 'bm25_b15'], token_tweets)

In [None]:
# Plot TF-IDF against the 'bm25_b7' ranking only
plot_rank_comparison(dfs, ['tfidf', 'bm25_b7'], token_tweets)

In [None]:
# Plot TF-IDF against the 'bm25_b15' ranking only
plot_rank_comparison(dfs, ['tfidf', 'bm25_b15'], token_tweets)

In [None]:
# Plot only the two custom-score rankings (alpha = 0.15 vs alpha = 0.7)
plot_rank_comparison(dfs, ['our_score_15', 'our_score_70'], token_tweets)

## 2) Word2vec with cosine similarity

In [None]:
# Train a Word2Vec model on the tokenized tweets (vector_size=100, window=10, min_count=50).
# NOTE(review): no `seed` is passed and workers=4 is used, so the embeddings presumably
# differ between runs — confirm whether reproducible rankings are required here.
model = Word2Vec(token_tweets.values(), workers=4, vector_size=100, min_count=50, window=10, sample=1e-3)
# Presumably builds one vector per tweet from the trained model — verify against irk.create_tweet2vec
tweet2vec_dict = irk.create_tweet2vec(token_tweets, model)

In [None]:
# Rank and display the results for every query in `queries`. Iterating over
# the list itself (instead of a hard-coded range of 5) keeps the loop correct
# if queries are added or removed. The printed header is unchanged.
for query_number, query_text in enumerate(queries, start=1):
    print(f"\n\n*RESULTS FOR query{query_number} *\n")
    scores = irk.tweet2vec_cossim(tweet2vec_dict, model, ipp.build_terms(query_text))
    irk.display_scores_tf_idf(scores, docid_to_tweetid, tweets, n=20)
