<a href="https://colab.research.google.com/github/len-rtz/wir-2024-relevancers/blob/main/relevancers_baseline_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline System Team "Relevancers" TH Köln

The following first-draft retrieval system builds on the baseline system from https://github.com/irgroup-classrooms/wir-2024

In [1]:
# Install required libraries
!pip3 install 'tira>=0.0.139' ir-datasets 'python-terrier==0.10.0'

# Import necessary libraries
import pyterrier as pt
import pandas as pd
from nltk.corpus import stopwords
import re
import nltk

Collecting tira>=0.0.139
  Downloading tira-0.0.143-py3-none-any.whl.metadata (4.6 kB)
Collecting ir-datasets
  Downloading ir_datasets-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting python-terrier==0.10.0
  Downloading python-terrier-0.10.0.tar.gz (107 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.6/107.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier==0.10.0)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier==0.10.0)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting matchpy (from python-terrier==0.10.0)
  Downloading matchpy-0.5.5-py3-none-any.whl.metadata (12 kB)
Collecting chest (from python-terrier==0.10.0)
  Downloading chest-0.2.3.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [20]:
# Create an API client to interact with the TIRA platform
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

# NOTE(review): presumably starts/initialises PyTerrier if it is not already
# running -- confirm against the tira documentation.
ensure_pyterrier_is_loaded()
# `tira` is not referenced again in the cells visible here.
tira = Client()

In [18]:
# Load Dataset
# `pt_dataset` is the single handle the rest of the notebook uses: it supplies the
# corpus iterator for indexing, the topics, the qrels for evaluation, and is also
# passed as the upload target when the run is persisted.
from pyterrier import get_dataset

pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training')

# Data Cleaning & Preprocessing

In [21]:
# Download stopwords
# Fetches the NLTK 'stopwords' corpus (the recorded output shows it is a no-op
# when the package is already up to date).
nltk.download('stopwords')
# Kept as a set because clean_text() tests membership once per token.
stop_words = set(stopwords.words('english'))

# Text preprocessing
def clean_text(text, stop_word_set=None):
    """Normalise a piece of text for indexing.

    Steps, in order:
      1. lower-case the text,
      2. replace every character that is neither a word character nor
         whitespace with a single space (so ``"quick-brown"`` becomes two
         tokens rather than one fused token),
      3. split on whitespace and drop every token found in the stop-word set,
      4. re-join the surviving tokens with single spaces.

    Args:
        text: The raw text. ``None`` is treated as empty text so that a
            document with a missing body does not abort a whole indexing run.
        stop_word_set: Optional collection of tokens to remove. When omitted
            (the default) the module-level ``stop_words`` set is used, which
            keeps existing ``clean_text(text)`` calls working unchanged.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when
        nothing survives the cleaning.
    """
    if text is None:
        return ''
    if stop_word_set is None:
        # Resolved at call time so the notebook-level set is only required
        # when the caller does not supply their own.
        stop_word_set = stop_words

    text = text.lower()
    # Punctuation becomes a space (not an empty string) to keep word boundaries.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(word for word in text.split() if word not in stop_word_set)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Create Index

In [13]:
from pyterrier import IterDictIndexer

In [22]:
# Create indexer
# The index is written to a path relative to the notebook's working directory.
indexer = IterDictIndexer(
    "../data/clean_index",
    # NOTE(review): presumably the metadata fields to store and their maximum
    # lengths -- confirm against the PyTerrier IterDictIndexer documentation.
    meta={'docno': 50, 'text': 4096},
    # NOTE(review): presumably replaces any index already at this path, so every
    # run of this cell re-indexes the corpus -- confirm.
    overwrite=True
)

# Create clean document iterator
def clean_docs_iter():
    """Lazily yield one ``{'docno', 'text'}`` record per corpus document.

    Documents are read from ``pt_dataset.get_corpus_iter()``. The ``docno`` is
    passed through unchanged and the ``text`` is run through ``clean_text``.
    """
    yield from (
        {'docno': record['docno'], 'text': clean_text(record['text'])}
        for record in pt_dataset.get_corpus_iter()
    )

# Build index
# Consumes the cleaned-document generator; the resulting `index` is what the
# retrieval cell hands to both BatchRetrieve and the RM3 rewriter.
index = indexer.index(clean_docs_iter())

Download from Zenodo: https://zenodo.org/records/14254044/files/subsampled-ms-marco-deep-learning-20241201-training-inputs.zip


Download: 100%|██████████| 9.51M/9.51M [00:00<00:00, 36.0MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training/


ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

14:45:08.451 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


# Retrieve Topics

In [23]:
# Retrieve topics
# NOTE(review): 'text' presumably selects which query field/variant to load --
# confirm against the dataset documentation. `topics` is the query set passed to
# pt.Experiment in the retrieval cell.
topics = pt_dataset.get_topics('text')

Download from Zenodo: https://zenodo.org/records/14254044/files/subsampled-ms-marco-deep-learning-20241201-training-truths.zip


Download: 100%|██████████| 61.7k/61.7k [00:00<00:00, 1.44MiB/s]

Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training/





# Retrieval

In [24]:
# First-stage ranker: BM25 over the cleaned index.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

# RM3 query expansion around that single BM25 model: the first BM25 pass
# supplies the feedback documents, RM3 rewrites the query from them, and the
# second BM25 pass ranks with the expanded query.
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25

# Evaluate the pipeline on the dataset's topics against its qrels.
results = pt.Experiment(
    [bm25_rm3],
    topics,
    pt_dataset.get_qrels(),
    eval_metrics=["map", "recip_rank", "ndcg_cut_10", "P_1", "P_5", "P_10"],
    names=["BM25+RM3"]
)

# Bare last expression so the notebook renders the DataFrame as a table
# instead of printing its plain-text repr.
results

       name       map  recip_rank  ndcg_cut_10       P_1       P_5      P_10
0  BM25+RM3  0.452199    0.768722     0.512417  0.680412  0.653608  0.612371


# Upload to TIRA

In [25]:
import os
from tira.third_party_integrations import persist_and_normalize_run

# Define the directory path for saving runs (relative to the working directory)
run_dir = '../data/runs'

# Create the directory if it does not exist; exist_ok=True keeps this cell
# safe to re-run.
os.makedirs(run_dir, exist_ok=True)

# Execute the BM25+RM3 pipeline on the 'text' topics to obtain the ranking to
# submit. The topics are fetched again here rather than reusing `topics`.
run = bm25_rm3(pt_dataset.get_topics('text'))

# Persist and normalize the run
# The recorded output shows the run file written under `run_dir` and then
# uploaded to TIRA.
# NOTE(review): presumably every execution of this cell creates a new
# submission -- confirm before re-running.
persist_and_normalize_run(
    run,
    system_name='bm25+rm3-relevancers-baseline',
    default_output=run_dir,
    upload_to_tira=pt_dataset,
)

The run file is normalized outside the TIRA sandbox, I will store it at "../data/runs".
Done. run file is stored under "../data/runs/run.txt.gz".
Run uploaded to TIRA. Claim ownership via: https://www.tira.io/claim-submission/9a651923-11d3-428e-8824-5d621a9c6393
