# Information Retrieval
## Assignment 10

Team members:

1: Pouria Sadr

2: Kimia Mahdinejad

3: Saleh Ebrahimian

4: Mobin Tasnimi

In [None]:
import pandas as pd
from google.colab import drive
import numpy as np

In [None]:
# Mount Google Drive into the Colab runtime so the dataset files stored there can be read.
drive.mount('/content/drive')
# Folder on the mounted drive that holds the assignment data. The trailing slash is
# required: file names are appended to this value by plain string concatenation.
dataset_path = '/content/drive/MyDrive/IR-Dataset/DS-10/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Both files are headerless and whitespace-delimited, so the column names are supplied here.
# The separator is a raw string: "\s" is not a valid Python escape sequence and a
# non-raw "\s+" triggers an invalid-escape warning on recent Python versions.
# `qrels` holds the relevance judgments (one row per judged query/document pair).
qrels = pd.read_csv(dataset_path + 'qrels.txt', sep=r"\s+", names=["query", "q0", "docid", "rel"])
# `run` holds the system output being evaluated. The metric functions below take the
# first k rows per query in file order, so rows are expected to be listed best-first.
run = pd.read_csv(dataset_path + 'run.txt', sep=r"\s+", names=["query", "q0", "docid", "rank", "score", "system"])

In [None]:
def calculate_precision(k, qrels_df=None, run_df=None):
  """Return Precision@k averaged over every query that appears in the run.

  For each query the first `k` rows of the run (in frame order) are taken, the
  number of those documents judged with `rel > 0` is counted, and the count is
  divided by `k`. The per-query values are averaged over the number of distinct
  queries in the run. Unjudged documents count as non-relevant.

  Args:
    k: cut-off depth.
    qrels_df: judgments with columns "query", "docid", "rel". Defaults to the
      notebook-level `qrels` frame.
    run_df: system output with columns "query", "docid". Defaults to the
      notebook-level `run` frame. Rows of each query are assumed to already be
      ordered best-first, because only row order decides the top k.

  Returns:
    Mean Precision@k as a float; 0.0 when the run contains no queries.
  """
  # Fall back to the notebook globals so existing `calculate_precision(k)` calls keep working.
  qrels_df = qrels if qrels_df is None else qrels_df
  run_df = run if run_df is None else run_df

  query_count = len(run_df["query"].unique())
  if query_count == 0:
    return 0.0

  # Left join keeps every retrieved document, in run order; unjudged ones get rel = NaN.
  judged = pd.merge(
      run_df[["query", "docid"]],
      qrels_df[["query", "docid", "rel"]],
      on=["query", "docid"],
      how="left",
  )
  top_k = judged.groupby("query").head(k)

  # NaN > 0 is False, so unjudged documents are not counted as hits.
  relevant_in_top_k = (top_k["rel"] > 0).sum()

  # sum_q(hits_q / k) / |Q|  ==  total hits / (k * |Q|)
  return float(relevant_in_top_k / (k * query_count))

In [None]:
def calculate_recall(k, qrels_df=None, run_df=None):
  """Return Recall@k averaged over every query that appears in the run.

  Per query: (# documents with `rel > 0` among the first `k` run rows) divided by
  (# documents with `rel > 0` in the judgments). Queries with no relevant
  document retrieved, or with no relevant judgments at all, contribute 0 to the
  average, whose denominator is the number of distinct queries in the run.

  Args:
    k: cut-off depth.
    qrels_df: judgments with columns "query", "docid", "rel". Defaults to the
      notebook-level `qrels` frame.
    run_df: system output with columns "query", "docid". Defaults to the
      notebook-level `run` frame. Rows of each query are assumed to already be
      ordered best-first, because only row order decides the top k.

  Returns:
    Mean Recall@k as a float; 0.0 when the run contains no queries.
  """
  # Fall back to the notebook globals so existing `calculate_recall(k)` calls keep working.
  qrels_df = qrels if qrels_df is None else qrels_df
  run_df = run if run_df is None else run_df

  query_count = len(run_df["query"].unique())
  if query_count == 0:
    return 0.0

  # Only documents judged rel > 0 may count as hits. Filtering on "has a judgment"
  # instead would also count rel == 0 documents and can push recall above 1.
  relevant = qrels_df.loc[qrels_df["rel"] > 0, ["query", "docid"]]
  total_relevant_per_query = relevant.groupby("query").size()

  top_k = run_df[["query", "docid"]].groupby("query").head(k)

  # Inner join keeps exactly the retrieved documents that are relevant.
  retrieved_relevant = pd.merge(top_k, relevant, on=["query", "docid"], how="inner")
  hits_per_query = retrieved_relevant.groupby("query").size()

  # Index alignment yields NaN for queries missing on either side; sum() skips
  # NaN, so those queries add 0 while still being counted in query_count.
  recall_per_query = hits_per_query / total_relevant_per_query

  return float(recall_per_query.sum() / query_count)

In [None]:
def calculate_map(k, qrels_df=None, run_df=None):
  """Return Mean Average Precision at cut-off k over every query in the run.

  Per query, average precision is the sum of precision values at each rank
  (within the first `k` run rows) that holds a document with `rel > 0`, divided
  by the total number of `rel > 0` documents judged for that query. Queries
  without any relevant document in the top k, or without relevant judgments,
  contribute 0. The mean is taken over the distinct queries in the run.

  Args:
    k: cut-off depth.
    qrels_df: judgments with columns "query", "docid", "rel". Defaults to the
      notebook-level `qrels` frame.
    run_df: system output with columns "query", "docid". Defaults to the
      notebook-level `run` frame. Rows of each query are assumed to already be
      ordered best-first; ranks are derived from row order, not from a column.

  Returns:
    MAP@k as a float; 0.0 when the run contains no queries.
  """
  # Fall back to the notebook globals so existing `calculate_map(k)` calls keep working.
  qrels_df = qrels if qrels_df is None else qrels_df
  run_df = run if run_df is None else run_df

  query_count = len(run_df["query"].unique())
  if query_count == 0:
    return 0.0

  # Graded judgments are treated as binary: any rel > 0 is "relevant".
  relevant = qrels_df.loc[qrels_df["rel"] > 0, ["query", "docid"]]
  total_relevant_per_query = relevant.groupby("query").size()

  top_k = run_df[["query", "docid"]].groupby("query").head(k)
  # 1-based position of each document inside its query's result list.
  top_k = top_k.assign(rank=top_k.groupby("query").cumcount() + 1)

  # Keep only relevant retrieved documents, ordered by rank so that the running
  # hit count below equals "number of relevant documents seen up to this rank".
  hits = pd.merge(top_k, relevant, on=["query", "docid"], how="inner")
  hits = hits.sort_values(["query", "rank"])
  hits = hits.assign(hit_number=hits.groupby("query").cumcount() + 1)
  hits = hits.assign(precision_at_hit=hits["hit_number"] / hits["rank"])

  precision_sum_per_query = hits.groupby("query")["precision_at_hit"].sum()

  # Index alignment yields NaN for queries missing on either side; sum() skips
  # NaN, so those queries add 0 while still being counted in query_count.
  average_precision_per_query = precision_sum_per_query / total_relevant_per_query

  return float(average_precision_per_query.sum() / query_count)

In [None]:
def calculate_mrr(k, qrels_df=None, run_df=None):
  """Return Mean Reciprocal Rank at cut-off k over every query in the run.

  Per query, the reciprocal rank is 1 / (position of the first document with
  `rel > 0` among the first `k` run rows). Queries with no relevant document in
  the top k contribute 0. The mean is taken over the distinct queries in the run.

  Args:
    k: cut-off depth.
    qrels_df: judgments with columns "query", "docid", "rel". Defaults to the
      notebook-level `qrels` frame.
    run_df: system output with columns "query", "docid". Defaults to the
      notebook-level `run` frame. Rows of each query are assumed to already be
      ordered best-first; ranks are derived from row order, not from a column.

  Returns:
    MRR@k as a float (always a float, including the no-hit case);
    0.0 when the run contains no queries.
  """
  # Fall back to the notebook globals so existing `calculate_mrr(k)` calls keep working.
  qrels_df = qrels if qrels_df is None else qrels_df
  run_df = run if run_df is None else run_df

  query_count = len(run_df["query"].unique())
  if query_count == 0:
    return 0.0

  relevant = qrels_df.loc[qrels_df["rel"] > 0, ["query", "docid"]]

  top_k = run_df[["query", "docid"]].groupby("query").head(k)
  # 1-based position of each document inside its query's result list.
  top_k = top_k.assign(rank=top_k.groupby("query").cumcount() + 1)

  hits = pd.merge(top_k, relevant, on=["query", "docid"], how="inner")

  # The smallest rank among a query's hits is the position of its first relevant result.
  first_hit_rank = hits.groupby("query")["rank"].min()

  # Queries without a hit are absent here, so they add 0 to the sum but still
  # count in the denominator. An empty Series sums to 0.0.
  return float((1.0 / first_hit_rank).sum() / query_count)

In [None]:
def calculate_ndcg(k, qrels_df=None, run_df=None):
  """Return NDCG@k averaged over every query that appears in the run.

  Gain of a document is `2**rel - 1` and the discount at 1-based rank r is
  `1 / log2(r + 1)`. DCG sums gain * discount over the relevant (`rel > 0`)
  documents among the first `k` run rows; IDCG does the same for the query's
  relevant judgments sorted by `rel` descending and cut at `k`. Queries with no
  relevant document in the top k contribute 0. The mean is taken over the
  distinct queries in the run.

  Args:
    k: cut-off depth.
    qrels_df: judgments with columns "query", "docid", "rel". Defaults to the
      notebook-level `qrels` frame.
    run_df: system output with columns "query", "docid". Defaults to the
      notebook-level `run` frame. Rows of each query are assumed to already be
      ordered best-first; ranks are derived from row order, not from a column.

  Returns:
    Mean NDCG@k as a float; 0.0 when the run contains no queries.
  """
  # Fall back to the notebook globals so existing `calculate_ndcg(k)` calls keep working.
  qrels_df = qrels if qrels_df is None else qrels_df
  run_df = run if run_df is None else run_df

  query_count = len(run_df["query"].unique())
  if query_count == 0:
    return 0.0

  relevant = qrels_df.loc[qrels_df["rel"] > 0, ["query", "docid", "rel"]]

  # --- DCG of the actual ranking -------------------------------------------
  top_k = run_df[["query", "docid"]].groupby("query").head(k)
  top_k = top_k.assign(rank=top_k.groupby("query").cumcount() + 1)

  hits = pd.merge(top_k, relevant, on=["query", "docid"], how="inner")
  hits = hits.assign(gain=(2.0 ** hits["rel"] - 1.0) / np.log2(hits["rank"] + 1))
  dcg_per_query = hits.groupby("query")["gain"].sum()

  # --- IDCG of the best possible ranking -----------------------------------
  # Highest grades first within each query, then the same cut-off and discount.
  ideal = relevant.sort_values(["query", "rel"], ascending=[True, False])
  ideal = ideal.groupby("query").head(k)
  ideal = ideal.assign(rank=ideal.groupby("query").cumcount() + 1)
  ideal = ideal.assign(gain=(2.0 ** ideal["rel"] - 1.0) / np.log2(ideal["rank"] + 1))
  idcg_per_query = ideal.groupby("query")["gain"].sum()

  # Index alignment yields NaN for queries missing on either side; sum() skips
  # NaN, so those queries add 0 while still being counted in query_count.
  ndcg_per_query = dcg_per_query / idcg_per_query

  return float(ndcg_per_query.sum() / query_count)

In [None]:
# Interactive driver: ask for the cut-off K and which metrics to report, then
# compute and print the requested ones.
k = int(input ("Enter K: "))
if k < 1:
  # Every metric divides by K or takes the top-K rows, so K must be positive.
  raise ValueError("K must be a positive integer, got " + str(k))

# (display name, metric function) pairs in the order they are asked and printed.
# A table replaces five copy-pasted if-blocks and avoids binding a notebook-level
# variable called `map`, which would hide Python's builtin `map` for later cells.
metric_functions = [
  ("Precision", calculate_precision),
  ("Recall", calculate_recall),
  ("MAP", calculate_map),
  ("MRR", calculate_mrr),
  ("NDCG", calculate_ndcg),
]

# Collect every answer before computing anything so prompts and results are not
# interleaved. Answers are trimmed and case-folded so "y" and " Y " also mean yes.
requested_metrics = [
  (metric_name, metric_function)
  for metric_name, metric_function in metric_functions
  if input ("Need " + metric_name + "? (Y/N) ").strip().upper() == "Y"
]

print("\n")

# Computed values are kept by display name for use in later cells.
metric_results = {}
for metric_name, metric_function in requested_metrics:
  metric_results[metric_name] = metric_function(k)
  print(metric_name + " = " + str(metric_results[metric_name]))

Enter K: 10
Need Precision? (Y/N) N
Need Recall? (Y/N) N
Need MAP? (Y/N) N
Need MRR? (Y/N) Y
Need NDCG? (Y/N) Y


MRR = 0.35515929867649065
NDCG = 0.4184448718088329
