# Test System Semantic Search Team "Relevancers" TH Köln

The following second test retrieval systems builds onto the baseline system from https://github.com/irgroup-classrooms/wir-2024

In [2]:
!pip3 install 'tira>=0.0.139' ir-datasets 'python-terrier==0.10.0'

Collecting tira>=0.0.139
  Downloading tira-0.0.143-py3-none-any.whl.metadata (4.6 kB)
Collecting ir-datasets
  Downloading ir_datasets-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting python-terrier==0.10.0
  Downloading python-terrier-0.10.0.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier==0.10.0)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier==0.10.0)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting matchpy (from python-terrier==0.10.0)
  Downloading matchpy-0.5.5-py3-none-any.whl.metadata (12 kB)
Collecting chest (from python-terrier==0.10.0)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [3]:
# Create an API client to interact with the TIRA platform
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

ensure_pyterrier_is_loaded()
tira = Client()

terrier-assemblies 5.7 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.7 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [4]:
# Load dataset
from pyterrier import get_dataset
from pyterrier import IterDictIndexer

pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training')

# Data Cleaning & Preprocessing

In [5]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
stop_words = set(stopwords.words('english'))

# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Create Index

In [7]:
# Create indexer
indexer = IterDictIndexer(
    "../data/clean_index",
    meta={'docno': 50, 'text': 4096},
    overwrite=True
)

# Create clean document iterator
def clean_docs_iter():
    for doc in pt_dataset.get_corpus_iter():
        yield {'docno': doc['docno'], 'text': clean_text(doc['text'])}

# Build index
index = indexer.index(clean_docs_iter())

Download from Zenodo: https://zenodo.org/records/14600777/files/subsampled-ms-marco-rag-20250105-training-inputs.zip


Download: 100%|██████████| 79.7M/79.7M [00:07<00:00, 11.4MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training/


ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents:   0%|          | 0/113227 [00:00<?, ?it/…

# Sentence Embeddings using Sentence Transformer

In [13]:
!pip install transformers torch



In [14]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example query
query = "who is aziz hashim"

# Generate an embedding for the query
query_embedding = model.encode(query)

# BM25 Retrieval

In [9]:
from pyterrier import BatchRetrieve

bm25 = BatchRetrieve(index, wmodel="BM25")

# Get original queries
queries = pt_dataset.get_topics('text')

# Run BM25 on original queries
bm25_results = bm25(queries)

In [10]:
print(queries.columns)  # Check the columns in the queries DataFrame

Index(['qid', 'query'], dtype='object')


# Retrieve Topics

In [15]:
topics = pt_dataset.get_topics('text')
print(topics)

            qid                                              query
0    2024-79081   how taylor swift s age affects her relationships
1    2024-40863          how does bee sting affect quality of life
2   2024-224279  why should teachers always be reviewing their ...
3   2024-223358                 why is there a watermelon shortage
4   2024-158677  what was entertainment like in the 1990s in th...
..          ...                                                ...
81   2024-18963                did old dominion commit voter fraud
82  2024-149459  what percent of students are bullied because o...
83   2024-88894  how to help students understand number of deat...
84  2024-153051       what target stors s policies for shoplifting
85  2024-224926               why was salsa not noticed in america

[86 rows x 2 columns]


# Retrieval using Semantic Search

* Converts all documents in the corpus to embeddings.
* Converts the queries to embeddings.
* Uses cosine similarity to find top-k similar document


In [11]:
import torch

In [12]:
from sentence_transformers import SentenceTransformer, util
from pyterrier import BatchRetrieve

# Load the pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Assuming pt_dataset.get_topics('query') gives a DataFrame with columns ['qid', 'query']
queries = pt_dataset.get_topics('query')

# Generate embeddings for queries
query_embeddings = model.encode(queries['query'], convert_to_tensor=True)

# Assuming BM25 is already set up
bm25 = BatchRetrieve(index, wmodel="BM25")

# Perform retrieval using BM25 (baseline)
bm25_results = bm25(queries)

# For semantic search with Sentence-BERT, compare cosine similarities
documents = [doc['text'] for doc in pt_dataset.get_corpus_iter()]
document_embeddings = model.encode(documents, convert_to_tensor=True)

# Compare the query embeddings with document embeddings using cosine similarity
cosine_scores = util.pytorch_cos_sim(query_embeddings, document_embeddings)

# You can retrieve the top K most relevant documents for each query
top_k_results = []
top_k = 10

for i, scores in enumerate(cosine_scores):
    sorted_indices = torch.argsort(scores, descending=True)[:top_k]
    top_k_results.append([documents[idx] for idx in sorted_indices])


ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents:   0%|          | 0/113227 [00:00<?, ?it/…

# Wrapper Class for Semantic Search

PyTerrier Transformer that:
* Takes precomputed document embeddings and a Sentence-BERT model.
* For each query in a dataframe, encodes it, does cosine similarity, and picks top-k docs.
* Returns a DataFrame of results with columns ['qid', 'docno', 'score'].


In [16]:
from pyterrier import Transformer
from sentence_transformers import SentenceTransformer, util
import pandas as pd

class SemanticSearchWrapper(Transformer):
    def __init__(self, query_embeddings, document_embeddings, k=10):
        self.query_embeddings = query_embeddings
        self.document_embeddings = document_embeddings
        self.k = k
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def transform(self, topics):
        all_documents = [doc['text'] for doc in pt_dataset.get_corpus_iter()]
        all_docnos = [doc['docno'] for doc in pt_dataset.get_corpus_iter()]  # Get all docnos

        results_list = []
        for i, query in enumerate(topics['query']):
            query_embedding = self.model.encode(query, convert_to_tensor=True)
            cosine_scores = util.pytorch_cos_sim(query_embedding, self.document_embeddings)

            # Get top k indices and ensure the length is at least k
            sorted_indices = torch.argsort(cosine_scores, descending=True)[:self.k].flatten().tolist()
            num_results = len(sorted_indices) # get actual number of results

            # Create a DataFrame for this query's results, ensuring all columns have the same length
            df = pd.DataFrame({
                'qid': [topics['qid'][i]] * num_results,  # Repeat qid for each result
                'docno': [all_docnos[idx] for idx in sorted_indices],  # Use corresponding docnos
                'score': cosine_scores[0][sorted_indices].cpu().tolist()  # Add cosine similarity scores
            })
            results_list.append(df)

        # Concatenate all query results into a single DataFrame
        final_results_df = pd.concat(results_list, ignore_index=True)
        return final_results_df  # Return a DataFrame

# Evaluation

In [18]:
# Create an instance of the SemanticSearchWrapper
semantic_search_instance = SemanticSearchWrapper(query_embeddings, document_embeddings)

In [15]:
from pyterrier import Experiment


# Create the combined pipelines (doesn't work because running out of RAM)
# bm25_semantic =  bm25 >> semantic_search_instance

# Perform evaluation for both BM25 and semantic search pipelines
results = Experiment([semantic_search_instance],
                     queries,                        # List of queries or topics
                     pt_dataset.get_qrels(),         # Ground truth relevance data (qrels)
                     eval_metrics=["map", "ndcg_cut_10", "P_1", "P_5", "P_10"])

# Print the results of semantic search performance
print(results)

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

                                                name       map  ndcg_cut_10  \
0  <__main__.SemanticSearchWrapper object at 0x7b...  0.531523     0.652962   

        P_1       P_5      P_10  
0  0.876289  0.793814  0.737113  


# Upload to TIRA

In [19]:
import os
from tira.third_party_integrations import persist_and_normalize_run

# Define the directory path for saving runs
run_dir = '../data/runs'

# Create the directory if it does not exist
os.makedirs(run_dir, exist_ok=True)

# Assign the results to the 'run' variable
run = semantic_search_instance(pt_dataset.get_topics('text'))

# Persist and normalize the run
persist_and_normalize_run(
    run,
    system_name='SemSearch-relevancers',
    default_output=run_dir,
    upload_to_tira=pt_dataset,
)

ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents:   0%|          | 0/113227 [00:00<?, ?it/…

ir-lab-wise-2024/subsampled-ms-marco-rag-20250105-training documents:   0%|          | 0/113227 [00:00<?, ?it/…

The run file is normalized outside the TIRA sandbox, I will store it at "../data/runs".
Done. run file is stored under "../data/runs/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/3d9d7293-a5fa-4bdb-b131-eb9f3c803c0e
