In [None]:
!pip install ir-datasets
import ir_datasets

dataset = ir_datasets.load('cranfield')

In [None]:
pip install nltk

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd

# Fetch the NLTK resources used by preprocess_text. Newer NLTK releases make
# word_tokenize load 'punkt_tab' instead of the older 'punkt' models, so both
# are requested to keep the notebook runnable across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Shared by every preprocess_text call: one lemmatizer instance and the
# English stop-word list as a set for constant-time membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    The text is lowercased, punctuation is replaced by spaces, the result is
    tokenized with ``word_tokenize``, and every token that is a stop word or
    is not purely alphabetic is dropped. Remaining tokens are lemmatized.

    Args:
        text: The text to process. Any value that is not a ``str`` is
            treated as empty.

    Returns:
        list: The lemmatized tokens in their original order; ``[]`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    # Replace punctuation with a space rather than deleting it. Deleting glues
    # the neighbours together ("boundary-layer" -> "boundarylayer") and turns
    # contractions such as "don't" into "dont", which the stop-word filter
    # below would no longer recognise.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.lower().translate(punctuation_to_space)

    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words and word.isalpha()
    ]

# One space-joined string of pre-processed tokens per document, in the order
# the dataset yields its documents.
processed_docs = [
    ' '.join(tokens)
    for tokens in map(preprocess_text, (doc.text for doc in dataset.docs_iter()))
]

In [None]:
# Show only the first few pre-processed documents; displaying the whole list
# would flood the notebook output.
processed_docs[:5]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the pre-processed documents and keep the
# resulting matrix (one row per entry of processed_docs).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_docs)
# Feature names as reported by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pre-process every query once; query_ids[i] is the id of processed_queries[i].
processed_queries, query_ids = [], []
for query in dataset.queries_iter():
    query_ids.append(query.query_id)
    processed_queries.append(' '.join(preprocess_text(query.text)))

# Project the queries with the vectorizer that was fitted on the documents.
tfidf_queries = vectorizer.transform(processed_queries)

# query_id -> set of doc_ids judged relevant (only judgements with relevance > 0).
qrels_dict = {}
for qrel in dataset.qrels_iter():
    if not qrel.relevance > 0:
        continue
    qrels_dict.setdefault(qrel.query_id, set()).add(qrel.doc_id)

# Position in the document iteration -> doc_id, used to translate ranked
# indices back into document ids.
doc_list = list(dataset.docs_iter())
index_to_doc_id = [document.doc_id for document in doc_list]

# Cut-off for Precision@N and the position (in query_ids) of the query to inspect.
N = 10
query_index_to_evaluate = 0
query_id = query_ids[query_index_to_evaluate]

# Cosine similarity of the selected query against every document, then the
# document indices ordered from most to least similar.
query_doc_similarity = cosine_similarity(
    tfidf_queries[query_index_to_evaluate],
    tfidf_matrix,
)
sorted_doc_indices = query_doc_similarity.flatten().argsort()[::-1]

top_n_indices = sorted_doc_indices[:N]
retrieved_doc_ids = [index_to_doc_id[position] for position in top_n_indices]

# Queries without any positive judgement get an empty relevant set.
relevant_doc_ids = qrels_dict.get(query_id, set())

hits = len([doc_id for doc_id in retrieved_doc_ids if doc_id in relevant_doc_ids])
if N > 0:
    precision_at_N = hits / N
else:
    precision_at_N = 0

print(
    f"\nAvaliação para o ID da consulta: {query_id}",
    f"Número de documentos relevantes para esta consulta: {len(relevant_doc_ids)}",
    f"Top {N} IDs de documentos recuperados: {retrieved_doc_ids}",
    f"Número de documentos relevantes nos top {N}: {hits}",
    f"Precisão@{N}: {precision_at_N:.4f}",
    sep="\n",
)

In [None]:
def calculate_interpolated_metrics(ranked_doc_indices, relevant_doc_ids, doc_index_to_id, recall_levels):
    """Interpolated precision, recall and F1 at the given recall levels for one query.

    The ranking is walked from best to worst, recording precision and recall
    after every retrieved document. The interpolated precision at a recall
    level is the highest precision observed at any point whose recall is at
    least that level; levels the ranking never reaches get precision 0.0.

    Args:
        ranked_doc_indices: Document indices ordered from most to least relevant.
        relevant_doc_ids: Collection of doc ids judged relevant for the query
            (must support ``len`` and ``in``).
        doc_index_to_id: Sequence mapping a document index to its doc id.
        recall_levels: Recall levels at which to report the metrics.

    Returns:
        tuple: ``(precisions, recalls, f1s)``, three lists with one entry per
        recall level. ``recalls`` echoes ``recall_levels``. When there are no
        relevant documents all three lists contain only zeros.
    """
    num_levels = len(recall_levels)
    num_relevant = len(relevant_doc_ids)
    if num_relevant == 0:
        return [0.0] * num_levels, [0.0] * num_levels, [0.0] * num_levels

    # (precision, recall) after each rank position.
    points = []
    hits = 0
    for rank, doc_idx in enumerate(ranked_doc_indices, start=1):
        if doc_index_to_id[doc_idx] in relevant_doc_ids:
            hits += 1
        points.append((hits / rank, hits / num_relevant))

    # Running maximum of precision taken from the tail of the ranking. No
    # artificial (precision=1, recall=0) anchor is added: it would force the
    # interpolated precision at recall 0 to 1.0 regardless of the ranking.
    # Built by appending and reversing once, which keeps this step linear.
    envelope = []
    best_precision = 0.0
    for precision, recall in reversed(points):
        best_precision = max(best_precision, precision)
        envelope.append((best_precision, recall))
    envelope.reverse()

    interpolated_precisions_at_levels = []
    interpolated_recalls_at_levels = []
    interpolated_f1_at_levels = []

    for recall_level in recall_levels:
        # First point reaching the level carries the max precision from there on;
        # 0.0 when the ranking never reaches this recall level.
        found_precision = next(
            (prec for prec, rec in envelope if rec >= recall_level),
            0.0,
        )
        interpolated_precisions_at_levels.append(found_precision)
        interpolated_recalls_at_levels.append(recall_level)

        denominator = found_precision + recall_level
        if denominator > 0:
            f1 = 2 * found_precision * recall_level / denominator
        else:
            f1 = 0.0
        interpolated_f1_at_levels.append(f1)

    return interpolated_precisions_at_levels, interpolated_recalls_at_levels, interpolated_f1_at_levels

In [None]:
def calculate_interpolated_precision(ranked_doc_indices, relevant_doc_ids, doc_index_to_id, recall_levels):
    """Calculates interpolated precision at given recall levels for a single query.

    The ranking is walked from best to worst, recording precision and recall
    after every retrieved document. The interpolated precision at a recall
    level is the highest precision observed at any point whose recall is at
    least that level.

    Args:
        ranked_doc_indices: Document indices ordered from most to least relevant.
        relevant_doc_ids: Collection of doc ids judged relevant for the query
            (must support ``len`` and ``in``).
        doc_index_to_id: Sequence mapping a document index to its doc id.
        recall_levels: Recall levels at which to report the precision.

    Returns:
        list: One interpolated precision per recall level. All zeros when
        there are no relevant documents; 0.0 for any level the ranking never
        reaches.
    """
    num_relevant = len(relevant_doc_ids)
    if num_relevant == 0:
        return [0.0] * len(recall_levels)

    # (precision, recall) after each rank position.
    points = []
    hits = 0
    for rank, doc_idx in enumerate(ranked_doc_indices, start=1):
        if doc_index_to_id[doc_idx] in relevant_doc_ids:
            hits += 1
        points.append((hits / rank, hits / num_relevant))

    # Running maximum of precision taken from the tail of the ranking. The
    # curve is deliberately NOT padded with (last_precision, 1.0) when full
    # recall is never reached: that would credit a non-zero precision to
    # recall levels the ranking does not attain.
    # Built by appending and reversing once, which keeps this step linear.
    envelope = []
    best_precision = 0.0
    for precision, recall in reversed(points):
        best_precision = max(best_precision, precision)
        envelope.append((best_precision, recall))
    envelope.reverse()

    # For each level, the first point reaching it carries the max precision
    # from there on; levels that are never reached fall back to 0.0.
    return [
        next((prec for prec, rec in envelope if rec >= recall_level), 0.0)
        for recall_level in recall_levels
    ]

In [None]:
# numpy is needed for np.mean below; on a fresh kernel no earlier cell imports it.
import numpy as np

# Standard 11-point recall grid: 0.0, 0.1, ..., 1.0. Each level is computed as
# a single division so it compares exactly against hits / num_relevant
# (a grid built from repeated 0.1 steps yields e.g. 0.30000000000000004).
recall_levels_11pt = [step / 10 for step in range(11)]

tfidf_interpolated_precisions_per_query = []
num_queries_for_11pt_eval_tfidf = 0

for query_idx, query_id in enumerate(query_ids):
    current_relevant_doc_ids = qrels_dict.get(query_id, set())
    # Queries without any relevant document cannot contribute to the average.
    if not current_relevant_doc_ids:
        continue

    num_queries_for_11pt_eval_tfidf += 1
    # Rank every document for this query, most similar first.
    query_similarity = cosine_similarity(tfidf_queries[query_idx], tfidf_matrix)
    tfidf_ranked_doc_indices = query_similarity.flatten().argsort()[::-1]
    interpolated_precisions = calculate_interpolated_precision(
        tfidf_ranked_doc_indices, current_relevant_doc_ids, index_to_doc_id, recall_levels_11pt
    )
    tfidf_interpolated_precisions_per_query.append(interpolated_precisions)

# Mean interpolated precision per recall level across the evaluated queries.
if num_queries_for_11pt_eval_tfidf > 0:
    avg_tfidf_interpolated_precision = np.mean(tfidf_interpolated_precisions_per_query, axis=0)
else:
    avg_tfidf_interpolated_precision = np.zeros(len(recall_levels_11pt))

# Recall coordinate paired with each averaged precision value: the recall
# level itself. Later cells read this name when computing F1 and exporting.
avg_tfidf_rec = np.array(recall_levels_11pt)

In [None]:
import numpy as np

def calculate_interpolated_f1(interpolated_precisions, rec):
    """Pairwise F1 of precision and recall values.

    Args:
        interpolated_precisions: Iterable of precision values.
        rec: Iterable of recall values, paired positionally with the
            precisions (extra items in the longer iterable are ignored).

    Returns:
        list: ``2 * p * r / (p + r)`` for every pair, or ``0.0`` where
        ``p + r`` equals zero.
    """
    def _f1(precision, recall):
        # Guard the division: F1 is defined as 0.0 when both inputs sum to zero.
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    return [_f1(precision, recall) for precision, recall in zip(interpolated_precisions, rec)]

# F1 per recall level from the averaged interpolated precision and
# avg_tfidf_rec; both names must already be defined when this cell runs.
avg_tfidf_f1 = calculate_interpolated_f1(avg_tfidf_interpolated_precision, avg_tfidf_rec)
# Bare expression so the notebook shows the resulting list as the cell output.
avg_tfidf_f1

In [None]:
# Gather the averaged recall / precision / F1 series into one table
# (one column per metric) and write it to CSV without the index column.
avg_metrics_df = pd.DataFrame(dict(
    interpolated_recalls_at_levels=avg_tfidf_rec,
    interpolated_precisions_at_levels=avg_tfidf_interpolated_precision,
    interpolated_f1_at_levels=avg_tfidf_f1,
))

avg_metrics_df.to_csv('avg_tfidf_metrics.csv', index=False)