# Download and extract

In [1]:
# Fetch the dataset archive from Google Drive by file id (saved as dataframe_hamshahri2.tar).
!gdown 1N5V40HQsnCHHytdmG98YfXvN_3DCjV67

Downloading...
From (original): https://drive.google.com/uc?id=1N5V40HQsnCHHytdmG98YfXvN_3DCjV67
From (redirected): https://drive.google.com/uc?id=1N5V40HQsnCHHytdmG98YfXvN_3DCjV67&confirm=t&uuid=e240dedd-0a7b-4245-a534-579a26328cc6
To: /content/dataframe_hamshahri2.tar
100% 160M/160M [00:02<00:00, 60.1MB/s]


In [2]:
# Unpack the downloaded archive into the current working directory.
!tar -xf dataframe_hamshahri2.tar

# Load data

In [3]:
import pandas as pd

# Read the three gzip-compressed CSV tables: documents, relevance judgments, queries.
df_docs = pd.read_csv(
    '/content/hamshahri2/docs.csv.gz',
    compression='gzip',
)
df_judgments = pd.read_csv(
    '/content/hamshahri2/judgments_dataframe.csv.gz',
    compression='gzip',
)
df_queries_fa = pd.read_csv(
    '/content/hamshahri2/queries_fa.csv.gz',
    compression='gzip',
)

In [4]:
# Build the relevance judgments as {query_id (str): {doc_id: relevancy}},
# keeping only the rows whose relevancy equals 1.
relevant_judgments = df_judgments[df_judgments['relevancy'] == 1]
qrels = {}
for query_id, group in relevant_judgments.groupby('query_id')[['doc_id', 'relevancy']]:
    qrels[str(query_id)] = {
        doc_id: int(relevancy)
        for doc_id, relevancy in zip(group['doc_id'], group['relevancy'])
    }

# Create TF-IDF model and do search

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Tuple

class RankerTFIDF:
    """TF-IDF retrieval over a fixed document collection, ranked by cosine similarity."""

    def __init__(self, docs) -> None:
        """
        Initialize TF-IDF vectorizer and fit it to the provided documents.

        Args:
            docs: Iterable of document strings (e.g. a list or a pandas Series).
        """
        # Materialize to a list so documents are addressed by position. A pandas
        # Series is indexed by label, which only coincides with the matrix row
        # number when the Series carries the default 0..n-1 index.
        self.docs = list(docs)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.docs)

    @staticmethod
    def _top_k_indices(similarities, k: int):
        """
        Return the positions of the k highest scores, best first.

        Args:
            similarities: 1-D array of scores, one per document.
            k: Maximum number of positions to return.

        Raises:
            ValueError: If k is negative.
        """
        if k < 0:
            raise ValueError(f"k must be non-negative, got {k}")
        # Reverse first, then truncate: slicing with `[-k:]` would select
        # *every* position when k == 0 because `-0` is just `0`.
        return similarities.argsort()[::-1][:k]

    def search(self, query: str, k: int = 5) -> List[Tuple[float, str]]:
        """
        Return the top-k most similar documents to a single query.

        Args:
            query: Input query string.
            k: Number of top results to return (0 yields an empty list).

        Returns:
            List of (score, document) tuples, sorted by score (descending).

        Raises:
            ValueError: If k is negative.
        """
        query_vec = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        return [
            (float(similarities[i]), self.docs[i])
            for i in self._top_k_indices(similarities, k)
        ]

    def batch_search(
        self,
        queries: List[str],
        k: int = 5
    ) -> Dict[str, List[Tuple[int, float]]]:
        """
        Return top-k results for multiple queries in the format:
        {query: [(doc_id, score), ...]}

        Here doc_id is the row position of the document in the fitted
        collection. Because the result is keyed by the query string, repeated
        query strings share a single entry.

        Args:
            queries: List of query strings.
            k: Number of top results per query (0 yields empty lists).

        Returns:
            Dictionary mapping each query to its ranked results (doc_id, score).

        Raises:
            ValueError: If k is negative.
        """
        queries = list(queries)
        if not queries:
            # Nothing to score; avoid handing an empty batch to the similarity routine.
            return {}

        query_vecs = self.vectorizer.transform(queries)
        sim_matrix = cosine_similarity(query_vecs, self.tfidf_matrix)

        results = {}
        for query, similarities in zip(queries, sim_matrix):
            results[query] = [
                # Convert numpy scalars to plain Python int / float.
                (int(doc_id), float(similarities[doc_id]))
                for doc_id in self._top_k_indices(similarities, k)
            ]
        return results

In [6]:
# Initialize TF-IDF scorer
# Fit the ranker on the document titles only.
docs = df_docs['TITLE']
scorer = RankerTFIDF(docs=docs)

In [7]:
# Batch search for multiple queries
# Rank up to 1000 documents for every query title in a single batch.
queries = df_queries_fa['TITLE'].tolist()
results = scorer.batch_search(queries, k=1000)

# Evaluation

In [8]:
!pip install beir

Collecting beir
  Downloading beir-2.2.0-py3-none-any.whl.metadata (28 kB)
Collecting pytrec-eval-terrier (from beir)
  Downloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Downloading beir-2.2.0-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytrec_eval_terrier-0.5.10-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (304 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.8/304.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytrec-eval-terrier, beir
Successfully installed beir-2.2.0 pytrec-eval-terrier-0.5.10


In [9]:
import numpy as np
# Re-key the search output for evaluation: {query number (str): {DOCID: score}}.
# NOTE(review): queries are numbered 1..N in the iteration order of `results`;
# this presumably lines up with the `query_id` values behind `qrels` — confirm
# against the query table.
ranked_results = {
    str(query_number): {
        df_docs.at[doc_id, 'DOCID']: float(score)
        for doc_id, score in doc_scores
    }
    for query_number, doc_scores in enumerate(results.values(), start=1)
}

In [10]:
from beir.retrieval.evaluation import EvaluateRetrieval
# Cut-offs passed to the evaluator.
top_k_values = [1, 3, 5, 10, 100, 1000]
metrics = EvaluateRetrieval.evaluate(
    qrels,
    ranked_results,
    top_k_values,
)

In [11]:
# Convert metrics to DataFrame
def get_metrics_dataframe(metrics):
    """
    Reshape evaluation scores into a table with one row per cut-off k.

    Args:
        metrics: Iterable of dicts mapping names of the form
            '<metric>@<k>' to a score.

    Returns:
        DataFrame with a 'k' column plus one column per metric name.
    """
    def _to_row(metric_name, score):
        # Names look like 'NDCG@10': the part before '@' is the metric, the rest is k.
        metric, k = metric_name.split('@')
        return {'Metric': metric, 'k': int(k), 'Score': score}

    rows = [
        _to_row(metric_name, score)
        for metric_group in metrics
        for metric_name, score in metric_group.items()
    ]

    long_df = pd.DataFrame(rows)
    wide_df = long_df.pivot(index='k', columns='Metric', values='Score')
    return wide_df.reset_index()

In [12]:
# Tabulate the scores; the bare last expression displays the table.
metrics_df = get_metrics_dataframe(metrics)
metrics_df

Metric,k,MAP,NDCG,P,Recall
0,1,0.00641,0.48,0.48,0.00641
1,3,0.01711,0.52614,0.53333,0.02006
2,5,0.02576,0.53291,0.54,0.0327
3,10,0.04043,0.4962,0.484,0.05555
4,100,0.113,0.31309,0.2216,0.2221
5,1000,0.14576,0.40397,0.05,0.46653


# Save metrics to GitHub

Save the `metrics_df` DataFrame to a CSV file named `metrics.csv` in the current Colab environment, then push it to your GitHub account.


In [13]:
# Write the metrics table to metrics.csv, leaving out the row index.
metrics_df.to_csv('metrics.csv', index=False)
print('metrics_df saved to metrics.csv')

metrics_df saved to metrics.csv


In [39]:
!rm -r irws

your_github_name= input('Please enter your GitHub name: ')

your_github_id = input('Please enter your GitHub Personal Access Token (PAT): ')

your_email="you@example.com"

homework_id='hm01'

!git clone https://{your_github_id}@github.com/{your_github_name}/irws

rm: cannot remove 'irws': No such file or directory
Please enter your GitHub Personal Access Token (PAT): [REDACTED — a token was exposed in this saved output; revoke it and issue a new one]
Cloning into 'irws'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 10 (delta 1), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 5.40 KiB | 5.40 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [23]:
!cp metrics.csv irws/irws_{homework_id}_best_value.csv
%cd irws/
!git add irws_{homework_id}_best_value.csv
!git config --global user.email {your_email}
!git config --global user.name {your_name}
!git commit -m "{homework_id}: add irws_{homework_id}_best_value.csv"
!git push
%cd ../

/content/irws
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
/content


# Download the .ipynb file (File → Download → .ipynb) and upload it manually to update the GitHub repo.

In [None]:
from google.colab import files

# Open the upload dialog; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

for fn, payload in uploaded.items():
  print(f'User uploaded file "{fn}" with length {len(payload)} bytes')


In [27]:
# Copy the uploaded notebook into the checkout, then commit and push it.
# NOTE: GitHub push protection rejects a push whose contents include secrets,
# so clear any cell output that shows a token before committing the notebook.
!cp IRWS_HM01.ipynb irws/IRWS_HM01.ipynb
%cd irws/
!git add IRWS_HM01.ipynb
!git commit -m "{homework_id}: commit .ipynb file from colab"
!git push
%cd ../

/content/irws
On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 10.69 KiB | 1.78 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
remote: [1;31merror[m: GH013: Repository rule violations found for refs/heads/main.[K
remote: 
remote: - GITHUB PUSH PROTECTION[K
remote:   —————————————————————————————————————————[K
remote:     Resolve the following violations before pushing again[K
remote: 
remote:     - Push cannot contain secrets[K
remote: 
remote:     [K
remote:      (?) Learn how to resolve a blocked push[K
remote:      https://docs.github.com/code-security/secret-scanning/working-with-secret-scanning-and-push-protection/working-with-push-protection-from-the-command-line#resolving-a-blo