<a href="https://colab.research.google.com/github/len-rtz/wir-2024-relevancers/blob/main/relevancers_baseline_system_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline System Team "Relevancers" TH Köln

The following first draft retrieval systems builds onto the baseline system from https://github.com/irgroup-classrooms/wir-2024

In [14]:
# Install required libraries
!pip3 install 'tira>=0.0.139' ir-datasets 'python-terrier==0.10.0'
!pip install transformers

# Import necessary libraries
import pyterrier as pt
import pandas as pd
from nltk.corpus import stopwords
import re
import nltk
from transformers import pipeline
from pyterrier import IterDictIndexer



In [15]:
# Create an API client to interact with the TIRA platform
from tira.third_party_integrations import ensure_pyterrier_is_loaded
from tira.rest_api_client import Client

ensure_pyterrier_is_loaded()
tira = Client()

In [16]:
# Load Dataset
from pyterrier import get_dataset

pt_dataset = get_dataset('irds:ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training')

# Data Cleaning & Preprocessing

In [17]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Text preprocessing
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Create Index

In [18]:
# Create indexer
indexer = IterDictIndexer(
    "../data/clean_index",
    meta={'docno': 50, 'text': 4096},
    overwrite=True
)

# Create clean document iterator
def clean_docs_iter():
    for doc in pt_dataset.get_corpus_iter():
        yield {'docno': doc['docno'], 'text': clean_text(doc['text'])}

# Build index
index = indexer.index(clean_docs_iter())

Download from Zenodo: https://zenodo.org/records/14254044/files/subsampled-ms-marco-deep-learning-20241201-training-inputs.zip


Download: 100%|██████████| 9.51M/9.51M [00:01<00:00, 6.12MiB/s]


Download finished. Extract...
Extraction finished:  /root/.tira/extracted_datasets/ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training/


ir-lab-wise-2024/subsampled-ms-marco-deep-learning-20241201-training documents:   0%|          | 0/68261 [00:0…

11:14:40.392 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 1 empty documents


# Query Rewriting

In [25]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the query reformulation model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("prhegde/t5-query-reformulation-RL")
model = AutoModelForSeq2SeqLM.from_pretrained("prhegde/t5-query-reformulation-RL")

def reformulate_query(query, num_beams=5, temperature=0.7):
    # Ensure query is a string
    text = str(query)

    # Encode text
    encoding = tokenizer.encode_plus(
        text,
        max_length=128,
        padding='longest',
        truncation=True,
        return_tensors="pt"
    )

    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Generate reformulated outputs
    outputs = model.generate(
        input_ids=encoding['input_ids'],
        attention_mask=encoding['attention_mask'],
        max_length=128,
        min_length=5,
        num_beams=num_beams,
        num_return_sequences=3,  # Generate 3 different versions
        temperature=temperature,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        repetition_penalty=1.5,
        length_penalty=1.0,
        no_repeat_ngram_size=2
    )

    # Decode outputs
    reformulations = []
    for output in outputs:
        reformulated = tokenizer.decode(output, skip_special_tokens=True)
        reformulations.append(reformulated)

    return reformulations

# Test the function
original_query = "how does climate change affect wildlife?"
reformulated_queries = reformulate_query(original_query)

print(f"Original Query: {original_query}")
print("\nReformulated versions:")
for i, reform in enumerate(reformulated_queries, 1):
    print(f"{i}. {reform}")

Original Query: how does climate change affect wildlife?

Reformulated versions:
1. how does climate change affect wildlife
2. impacts of climate change on wildlife
3. how climate change affects wildlife


In [26]:
class T5QueryRewriter(pt.Transformer):
    def __init__(self, num_beams=5, temperature=0.7):
        super().__init__()
        self.num_beams = num_beams
        self.temperature = temperature

    def transform(self, topics):
        # Create a copy of the input topics
        new_topics = topics.copy()

        # Determine which column contains the query
        query_column = 'text' if 'text' in new_topics.columns else 'query'

        # Paraphrase each query
        new_topics[query_column] = new_topics[query_column].apply(
            lambda x: paraphrase_query(x, self.num_beams, self.temperature)
        )

        return new_topics

In [27]:
# used to print some example rewrites to verify the transformer
def print_example_rewrites(topics, t5_rewriter, num_examples=3):
    print("\nExample Query Rewrites:")
    sample_topics = topics.head(num_examples)
    rewrites = t5_rewriter.transform(sample_topics)
    query_column = 'text' if 'text' in topics.columns else 'query'

    for idx, row in sample_topics.iterrows():
        print(f"\nOriginal: {row[query_column]}")
        print(f"Rewritten: {rewrites.loc[idx, query_column]}")

# Retrieve Topics

In [28]:
# Retrieve topics
topics = pt_dataset.get_topics('text')

# Retrieval

In [29]:
# Define retrieval models
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
bm25_rm3 = bm25 >> pt.rewrite.RM3(index) >> bm25

# Create T5 rewriter pipeline
t5_rewriter = T5QueryRewriter(num_beams=5, temperature=0.7)

# Print some example rewrites first
print_example_rewrites(topics, t5_rewriter)

# Create the combined pipelines
bm25_t5 = t5_rewriter >> bm25
bm25_t5_rm3 = t5_rewriter >> bm25_rm3

# Evaluate all models
results = pt.Experiment(
    [bm25_rm3, bm25_t5, bm25_t5_rm3],
    topics,
    pt_dataset.get_qrels(),
    eval_metrics=["map", "recip_rank", "ndcg_cut_10", "P_1", "P_5", "P_10"],
    names=["BM25+RM3", "BM25+T5", "BM25+T5+RM3"]
)

print("\nEvaluation Results:")
print(results)


Example Query Rewrites:

Original: who is aziz hashim
Rewritten: ['aziz hashim was born in israel', 'aziz hashim is known as the leader of israel.', 'aziz hashim was born in israel.']

Original: who is rep scalise
Rewritten: ['who is rep. scalise', 'who is rep. scalise?', 'who is rep scalise?']

Original: who killed nicholas ii of russia
Rewritten: ['who killed nicholas ii of russia', 'who killed nicholas ii of russia?', 'who shot nicholas ii']


JavaException: No methods called newSearchRequest in org/terrier/querying/Manager matching your arguments, requested: ('1030303', ['aziz hashim was born in israel', 'aziz hashim is known as the leader of israel.', 'aziz hashim was born in israel.']), available: ['(Ljava/lang/String;Ljava/lang/String;)Lorg/terrier/querying/SearchRequest;', '(Ljava/lang/String;)Lorg/terrier/querying/SearchRequest;', '()Lorg/terrier/querying/SearchRequest;']

# Upload to TIRA

In [None]:
import os
from tira.third_party_integrations import persist_and_normalize_run

# Define the directory path for saving runs
run_dir = '../data/runs'

# Create the directory if it does not exist
os.makedirs(run_dir, exist_ok=True)

# Assign the results to the 'run' variable
run = bm25_rm3(pt_dataset.get_topics('text'))

# Persist and normalize the run
persist_and_normalize_run(
    run,
    system_name='bm25+rm3-relevancers',
    default_output=run_dir,
    upload_to_tira=pt_dataset,
)