<a href="https://colab.research.google.com/github/len-rtz/wir-2024-relevancers/blob/main/relevancers_baseline_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline System Team "Relevancers" TH Köln

The following first draft retrieval systems builds onto the baseline system from https://github.com/irgroup-classrooms/wir-2024

In [5]:
# Install and import libraries
!pip3 install 'tira>=0.0.139' ir-datasets 'python-terrier==0.10.0'



In [6]:
# Create an API client to interact with the TIRA platform
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

ensure_pyterrier_is_loaded()
tira = Client()

In [7]:
# Load Dataset
from pyterrier import get_dataset

pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training')

In [8]:
# Build an index

from pyterrier import IterDictIndexer

indexer = IterDictIndexer(
    # Store the index in the `index` directory.
    "../data/index",
    meta={'docno': 50, 'text': 4096},
    # If an index already exists there, then overwrite it.
    overwrite=True,
)
index = indexer.index(pt_dataset.get_corpus_iter())

Download from Zenodo: https://zenodo.org/records/14254044/files/subsampled-ms-marco-deep-learning-20241201-training-inputs.zip


Download: 100%|██████████| 9.51M/9.51M [00:01<00:00, 7.75MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training/


ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

17:59:07.555 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


# Data Cleaning & Preprocessing

In [9]:
# Import required libraries
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

# Function to preprocess text (lowercase and remove punctuation)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Function to validate documents (filter out very short documents)
def is_valid_document(doc):
    return len(doc['text'].split()) > 5

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Generate clean_docs by applying preprocessing functions
clean_docs = [
    {**doc, 'text': preprocess_text(remove_stopwords(doc['text']))}
    for doc in pt_dataset.get_corpus_iter() if is_valid_document(doc)
]

# Check the number of cleaned documents and inspect a sample
print("Number of cleaned documents:", len(clean_docs))
print("Sample document:", clean_docs[0])

def is_valid_document(doc):
    cleaned_text = preprocess_text(remove_stopwords(doc['text']))
    return len(cleaned_text.split()) > 5  # Keep only documents with more than 5 words

ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

Number of cleaned documents: 68253
Sample document: {'text': 'voe im really sure voe know need get permit exactly get voe', 'docno': '4459825'}


# Create Index

In [11]:
import os
import pyterrier as pt

# Initialize PyTerrier
if not pt.started():
    pt.init()

# Define the index path
index_path = "/content/wir-2024-relevancers/data/clean_index"

# Create the directory if it doesn't exist
os.makedirs(index_path, exist_ok=True)

# Example function: Validate documents for indexing
def is_valid_document(doc):
    # Ensure the document is not empty after cleaning
    cleaned_text = preprocess_text(remove_stopwords(doc['text']))
    return len(cleaned_text.split()) > 5  # Keep only documents with more than 5 words

# Create the index using IterDictIndexer
indexer = pt.IterDictIndexer(index_path, meta={'docno': 50, 'text': 4096}, overwrite=True)

# Replace `clean_docs` with your iterable of documents
# Ensure `clean_docs` is an iterable of dictionaries with at least `docno` and `text` keys
index = indexer.index(iter(clean_docs))

# Print success message
print("Index created successfully!")

# Load the created index
index_object = pt.IndexFactory.of(index_path)

# Validate the index by printing collection statistics
print(index_object.getCollectionStatistics().toString())

# Convert the index path to an IndexRef
index_ref = pt.IndexRef.of(index_path)

# Print to confirm the type of index_ref
print("IndexRef type:", type(index_ref))  # Should print <class 'pyterrier.querying.IndexRef'>

17:59:49.573 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents
Index created successfully!
Number of documents: 68253
Number of terms: 95646
Number of postings: 1565316
Number of fields: 1
Number of tokens: 2196421
Field names: [text]
Positions:   false

IndexRef type: <class 'jnius.reflect.org.terrier.querying.IndexRef'>


# Retrieval with RM3 Query Expansion

In [12]:
# Step 1: Define BM25 Retrieval Pipeline
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")

# Step 2: Chain RM3 with BM25
bm25_rm3 = bm25 >> pt.rewrite.RM3(index_ref) >> bm25

# Step 3: Test RM3 Pipeline
topics = pt_dataset.get_topics('text')  # Load topics
rm3_results = bm25_rm3.transform(topics)

print("BM25+RM3 Results:")
print(rm3_results.head())
print("Columns in RM3 Results:", rm3_results.columns)

Download from Zenodo: https://zenodo.org/records/14254044/files/subsampled-ms-marco-deep-learning-20241201-training-truths.zip


Download: 100%|██████████| 61.7k/61.7k [00:00<00:00, 231kiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training/
BM25+RM3 Results:
       qid  docid    docno  rank      score             query_0  \
0  1030303  53844  8726436     0  38.501560  who is aziz hashim   
1  1030303  62108  8726435     1  30.684877  who is aziz hashim   
2  1030303  56033  8726433     2  27.497540  who is aziz hashim   
3  1030303  35862  8726437     3  26.724978  who is aziz hashim   
4  1030303  32178  8726429     4  24.604161  who is aziz hashim   

                                               query  
0  applypipeline:off partner^0.028783683 capit^0....  
1  applypipeline:off partner^0.028783683 capit^0....  
2  applypipeline:off partner^0.028783683 capit^0....  
3  applypipeline:off partner^0.028783683 capit^0....  
4  applypipeline:off partner^0.028783683 capit^0....  
Columns in RM3 Results: Index(['qid', 'docid', 'docno', 'rank', 'score', 'query_0', 'query'], dt