# Information Retrieval Lab WiSe 2024/2025: Baseline Retrieval System

The following second test retrieval systems builds onto the baseline system from https://github.com/irgroup-classrooms/wir-2024

In [30]:
!pip3 install 'tira>=0.0.139' ir-datasets 'python-terrier==0.10.0'



In [31]:
# Create an API client to interact with the TIRA platform
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

ensure_pyterrier_is_loaded()
tira = Client()

In [32]:
# Load dataset
from pyterrier import get_dataset

pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training')

# Data Cleaning & Preprocessing

In [33]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
stop_words = set(stopwords.words('english'))

# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Create Index

In [36]:
# Create indexer
indexer = IterDictIndexer(
    "../data/clean_index",
    meta={'docno': 50, 'text': 4096},
    overwrite=True
)

# Create clean document iterator
def clean_docs_iter():
    for doc in pt_dataset.get_corpus_iter():
        yield {'docno': doc['docno'], 'text': clean_text(doc['text'])}

# Build index
index = indexer.index(clean_docs_iter())

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

07:00:19.054 [ForkJoinPool-3-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


# Sentence Embeddings using Sentence Transformer

In [37]:
!pip install transformers torch



In [39]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example query
query = "who is aziz hashim"

# Generate an embedding for the query
query_embedding = model.encode(query)

# BM25 Retrieval

In [40]:
from pyterrier import BatchRetrieve

bm25 = BatchRetrieve(index, wmodel="BM25")

# Get original queries
queries = pt_dataset.get_topics('text')

# Run BM25 on original queries
bm25_results = bm25(queries)

In [41]:
print(queries.columns)  # Check the columns in the queries DataFrame

Index(['qid', 'query'], dtype='object')


# Retrieve Topics

In [42]:
topics = pt_dataset.get_topics('text')
print(topics)

        qid                                              query
0   1030303                                 who is aziz hashim
1   1037496                                 who is rep scalise
2   1043135                   who killed nicholas ii of russia
3   1051399                          who sings monk theme song
4   1064670              why do hunters pattern their shotguns
..      ...                                                ...
92   405717                       is cdg airport in main paris
93   182539                      example of monotonic function
94  1113437             what is physical description of spruce
95  1129237        hydrogen is a liquid below what temperature
96   146187  difference between a mcdouble and a double che...

[97 rows x 2 columns]


# Retrieval using Semantic Search

* Converts all documents in the corpus to embeddings.
* Converts the queries to embeddings.
* Uses cosine similarity to find top-k similar document


In [43]:
import torch

In [44]:
from sentence_transformers import SentenceTransformer, util
from pyterrier import BatchRetrieve

# Load the pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Assuming pt_dataset.get_topics('query') gives a DataFrame with columns ['qid', 'query']
queries = pt_dataset.get_topics('query')

# Generate embeddings for queries
query_embeddings = model.encode(queries['query'], convert_to_tensor=True)

# Assuming BM25 is already set up
bm25 = BatchRetrieve(index, wmodel="BM25")

# Perform retrieval using BM25 (baseline)
bm25_results = bm25(queries)

# For semantic search with Sentence-BERT, compare cosine similarities
documents = [doc['text'] for doc in pt_dataset.get_corpus_iter()]
document_embeddings = model.encode(documents, convert_to_tensor=True)

# Compare the query embeddings with document embeddings using cosine similarity
cosine_scores = util.pytorch_cos_sim(query_embeddings, document_embeddings)

# You can retrieve the top K most relevant documents for each query
top_k_results = []
top_k = 10

for i, scores in enumerate(cosine_scores):
    sorted_indices = torch.argsort(scores, descending=True)[:top_k]
    top_k_results.append([documents[idx] for idx in sorted_indices])


ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

# Wrapper Class for Semantic Search

PyTerrier Transformer that:
* Takes precomputed document embeddings and a Sentence-BERT model.
* For each query in a dataframe, encodes it, does cosine similarity, and picks top-k docs.
* Returns a DataFrame of results with columns ['qid', 'docno', 'score'].


In [45]:
from pyterrier import Transformer
from sentence_transformers import SentenceTransformer, util
import pandas as pd

class SemanticSearchWrapper(Transformer):
    def __init__(self, query_embeddings, document_embeddings, k=10):
        self.query_embeddings = query_embeddings
        self.document_embeddings = document_embeddings
        self.k = k
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def transform(self, topics):
        all_documents = [doc['text'] for doc in pt_dataset.get_corpus_iter()]
        all_docnos = [doc['docno'] for doc in pt_dataset.get_corpus_iter()]  # Get all docnos

        results_list = []
        for i, query in enumerate(topics['query']):
            query_embedding = self.model.encode(query, convert_to_tensor=True)
            cosine_scores = util.pytorch_cos_sim(query_embedding, self.document_embeddings)

            # Get top k indices and ensure the length is at least k
            sorted_indices = torch.argsort(cosine_scores, descending=True)[:self.k].flatten().tolist()
            num_results = len(sorted_indices) # get actual number of results

            # Create a DataFrame for this query's results, ensuring all columns have the same length
            df = pd.DataFrame({
                'qid': [topics['qid'][i]] * num_results,  # Repeat qid for each result
                'docno': [all_docnos[idx] for idx in sorted_indices],  # Use corresponding docnos
                'score': cosine_scores[0][sorted_indices].cpu().tolist()  # Add cosine similarity scores
            })
            results_list.append(df)

        # Concatenate all query results into a single DataFrame
        final_results_df = pd.concat(results_list, ignore_index=True)
        return final_results_df  # Return a DataFrame

# Evaluation

In [46]:
from pyterrier import Experiment

# Create an instance of the SemanticSearchWrapper
semantic_search_instance = SemanticSearchWrapper(query_embeddings, document_embeddings)

# Perform evaluation for both BM25 and semantic search pipelines
results = Experiment([bm25, semantic_search_instance],  # bm25 and semantic_search_instance are now two retrieval pipelines
                     queries,                        # List of queries or topics
                     pt_dataset.get_qrels(),         # Ground truth relevance data (qrels)
                     eval_metrics=["map", "ndcg_cut_10", "P_1", "P_5", "P_10"])

# Print the results to compare BM25 and semantic search performance
print(results)

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

                                                name       map  ndcg_cut_10  \
0                                           BR(BM25)  0.415656     0.491229   
1  <__main__.SemanticSearchWrapper object at 0x79...  0.531523     0.652962   

        P_1       P_5      P_10  
0  0.711340  0.612371  0.576289  
1  0.876289  0.793814  0.737113  


# Upload to TIRA

In [None]:
import os
from tira.third_party_integrations import persist_and_normalize_run

# Define the directory path for saving runs
run_dir = '../data/runs'

# Create the directory if it does not exist
os.makedirs(run_dir, exist_ok=True)

# Assign the results to the 'run' variable
run = bm25_rm3(pt_dataset.get_topics('text'))

# Persist and normalize the run
persist_and_normalize_run(
    run,
    system_name='bm25+SemSearch-relevancers',
    default_output=run_dir,
    upload_to_tira=pt_dataset,
)