In [1]:
import string
import contractions
import nltk
import numpy as np
import pandas as pd
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import ir_datasets

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ahmad17/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def expand_contractions(text):
    return contractions.fix(text)

def preprocess(text):
    text = expand_contractions(text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Assuming the necessary imports and NLTK downloads are already done as shown in the previous example

def prepare_dataset(dataset,output_prefix):
    docs = [preprocess(doc.text) for doc in dataset.docs_iter()]
    docs_ids = [doc.doc_id for doc in dataset.docs_iter()]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs)

    num_clusters = 10 
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    cluster_labels = kmeans.labels_

    docs_df = pd.DataFrame({'doc_id': docs_ids, 'text': docs, 'clusters': cluster_labels})
    docs_df.to_csv(f'{output_prefix}/docs_df.csv', index=False)
    joblib.dump(tfidf_matrix, f'{output_prefix}/tfidf_matrix.pkl')
    joblib.dump(vectorizer, f'{output_prefix}/vectorizer.pkl')
    joblib.dump(kmeans, f'{output_prefix}/kmeans.pkl')



# DON'T run the following 2 cells

In [8]:
antique_dataset = ir_datasets.load("antique/train")
prepare_dataset(antique_dataset, "models/antique")


In [6]:
wiki_dataset = ir_datasets.load("wikir/en1k/training")
prepare_dataset(wiki_dataset, "models/wiki")

[INFO] If you have a local copy of https://zenodo.org/record/3565761/files/wikIR1k.zip, you can symlink it here to avoid downloading it again: /home/ahmad17/.ir_datasets/downloads/554299bca984640cb283d6ba55753608
[INFO] [starting] https://zenodo.org/record/3565761/files/wikIR1k.zip
[INFO] [finished] https://zenodo.org/record/3565761/files/wikIR1k.zip: [01:54] [165MB] [1.45MB/s]
                                                                               

In [3]:
def tfidf_search(query, tfidf_matrix, vectorizer, top_n=10):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return top_indices, cosine_similarities[top_indices]


In [4]:
def search_with_clustering(query, tfidf_matrix, vectorizer, kmeans, top_n=10):
    query_vec = vectorizer.transform([preprocess(query)])
    top_cluster = kmeans.predict(query_vec)[0]
    cluster_indices = np.where(kmeans.labels_ == top_cluster)[0]
    cluster_similarities = cosine_similarity(query_vec, tfidf_matrix[cluster_indices]).flatten()
    cluster_top_indices = cluster_indices[cluster_similarities.argsort()[-top_n:][::-1]]
    return cluster_top_indices, cluster_similarities



In [5]:

def evaluate_tfidf_search(queries, qrels, tfidf_matrix, vectorizer, docs_df):
    average_precisions = []
    reciprocal_ranks = []
    precision_at_10 = []
    recall_scores = []

    for query_id, query_text in queries.items():
        relevant_docs = {doc_id for doc_id, relevance in qrels.get(query_id, []) if relevance > 0}

        if not relevant_docs:
            continue

        top_indices, _ = tfidf_search(query_text, tfidf_matrix, vectorizer, top_n=10)
        retrieved_docs = set(docs_df.iloc[top_indices].doc_id)

        # Calculate Precision@10
        top_10_retrieved = set(docs_df.iloc[top_indices[:10]].doc_id)
        precision_10 = len(top_10_retrieved & relevant_docs) / 10.0
        precision_at_10.append(precision_10)

        # Calculate Recall
        recall = len(relevant_docs & retrieved_docs) / len(relevant_docs)
        recall_scores.append(recall)

        # Calculate Average Precision
        num_relevant = 0
        sum_precisions = 0.0
        for i, doc_id in enumerate(docs_df.iloc[top_indices].doc_id):
            if doc_id in relevant_docs:
                num_relevant += 1
                sum_precisions += num_relevant / (i + 1.0)
        if num_relevant > 0:
            average_precisions.append(sum_precisions / num_relevant)

        # Calculate Reciprocal Rank
        rr = 0.0
        for i, doc_id in enumerate(docs_df.iloc[top_indices].doc_id):
            if doc_id in relevant_docs:
                rr = 1.0 / (i + 1.0)
                break
        reciprocal_ranks.append(rr)

    MAP = np.mean(average_precisions) if average_precisions else 0.0
    MRR = np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0
    mean_precision_10 = np.mean(precision_at_10) if precision_at_10 else 0.0
    mean_recall = np.mean(recall_scores) if recall_scores else 0.0

    return MAP, MRR, mean_precision_10, mean_recall

def evaluate_tfidf_search_with_clustering(queries, qrels, tfidf_matrix, vectorizer, docs_df, kmeans):
    average_precisions = []
    reciprocal_ranks = []
    precision_at_10 = []
    recall_scores = []

    for query_id, query_text in queries.items():
        relevant_docs = {doc_id for doc_id, relevance in qrels.get(query_id, []) if relevance > 0}

        if not relevant_docs:
            continue

        top_indices, _ = search_with_clustering(query_text, tfidf_matrix, vectorizer, kmeans, top_n=10)
        retrieved_docs = set(docs_df.iloc[top_indices].doc_id)

        # Calculate Precision@10
        top_10_retrieved = set(docs_df.iloc[top_indices[:10]].doc_id)
        precision_10 = len(top_10_retrieved & relevant_docs) / 10.0
        precision_at_10.append(precision_10)

        # Calculate Recall
        recall = len(relevant_docs & retrieved_docs) / len(relevant_docs)
        recall_scores.append(recall)

        # Calculate Average Precision
        num_relevant = 0
        sum_precisions = 0.0
        for i, doc_id in enumerate(docs_df.iloc[top_indices].doc_id):
            if doc_id in relevant_docs:
                num_relevant += 1
                sum_precisions += num_relevant / (i + 1.0)
        if num_relevant > 0:
            average_precisions.append(sum_precisions / num_relevant)

        # Calculate Reciprocal Rank
        rr = 0.0
        for i, doc_id in enumerate(docs_df.iloc[top_indices].doc_id):
            if doc_id in relevant_docs:
                rr = 1.0 / (i + 1.0)
                break
        reciprocal_ranks.append(rr)

    MAP = np.mean(average_precisions) if average_precisions else 0.0
    MRR = np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0
    mean_precision_10 = np.mean(precision_at_10) if precision_at_10 else 0.0
    mean_recall = np.mean(recall_scores) if recall_scores else 0.0

    return MAP, MRR, mean_precision_10, mean_recall




In [6]:
antique_tfidf_matrix = joblib.load("./models/antique/tfidf_matrix.pkl")
antique_vectorizer = joblib.load("./models/antique/vectorizer.pkl")
antique_kmeans = joblib.load("./models/antique/kmeans.pkl")
antique_docs_df = pd.read_csv("./models/antique/docs_df.csv")


In [7]:
wiki_tfidf_matrix = joblib.load("./models/wiki/tfidf_matrix.pkl")
wiki_vectorizer = joblib.load("./models/wiki/vectorizer.pkl")
wiki_kmeans = joblib.load("./models/wiki/kmeans.pkl")
wiki_docs_df = pd.read_csv("./models/wiki/docs_df.csv")

In [8]:
antique_dataset = ir_datasets.load("antique/train")
antique_queries = {query.query_id: query.text for query in antique_dataset.queries_iter()}
antique_qrels = {}
for qrel in antique_dataset.qrels_iter():
    if qrel.query_id not in antique_qrels:
        antique_qrels[qrel.query_id] = []
    antique_qrels[qrel.query_id].append((qrel.doc_id, qrel.relevance))



In [9]:
wiki_dataset = ir_datasets.load("wikir/en1k/training")
wiki_queries = {query.query_id: query.text for query in wiki_dataset.queries_iter()}
wiki_qrels = {}
for qrel in wiki_dataset.qrels_iter():
    if qrel.query_id not in wiki_qrels:
        wiki_qrels[qrel.query_id] = []
    wiki_qrels[qrel.query_id].append((qrel.doc_id, qrel.relevance))

# DON'T run the following 4 cells

In [37]:
print("Evaluating TF-IDF System ( Antique ) ...")
tfidf_antique_results = evaluate_tfidf_search(antique_queries, antique_qrels, antique_tfidf_matrix, antique_vectorizer, antique_docs_df)
print(f"TF-IDF Results: {tfidf_antique_results}")

Evaluating TF-IDF System ( Antique ) ...
TF-IDF Results: (0.14731879736223874, 0.08120193050879908, 0.0304204451772465, 0.11765890734283808)


In [72]:
print("Evaluating clustered TF-IDF System ( Antique ) ...")
antique_clustered_results = evaluate_tfidf_search_with_clustering(antique_queries, antique_qrels, antique_tfidf_matrix, antique_vectorizer, antique_docs_df,  antique_kmeans)
print(f"clustered TF-IDF Results: {antique_clustered_results}")

Evaluating clustered TF-IDF System ( Antique ) ...
clustered TF-IDF Results: (0.5787161922272746, 0.2408775958858399, 0.07061005770816159, 0.08247039539032426)


In [11]:
print("Evaluating TF-IDF System ( Wiki ) ...")
tfidf_wiki_results = evaluate_tfidf_search(wiki_queries, wiki_qrels, wiki_tfidf_matrix, wiki_vectorizer, wiki_docs_df)
print(f"TF-IDF Results: {tfidf_wiki_results}")

Evaluating TF-IDF System ( Wiki ) ...


In [10]:
print("Evaluating clustered TF-IDF System ( Wiki ) ...")
wiki_clustered_results = evaluate_tfidf_search_with_clustering(wiki_queries, wiki_qrels, wiki_tfidf_matrix, wiki_vectorizer, wiki_docs_df, wiki_kmeans)
print(f"clustered TF-IDF Results: {wiki_clustered_results}")

Evaluating clustered TF-IDF System ( Wiki ) ...
