# Introduction to PyTerrier

_IN4325: Information retrieval lecture, TU Delft_

**Part 6: Learning to rank**

In this part, we'll dive into learning-to-rank (LTR) models. Specifically, we'll cover how to use PyTerrier transformers to

- compute query-document features and
- train and evaluate LTR models.

In order to run everything in this notebook, you'll need [NLTK](https://www.nltk.org/), [scikit-learn](https://scikit-learn.org/), and [LightGBM](https://github.com/microsoft/LightGBM/tree/master/python-package) installed:


We'll use NLTK for tokenization later. This requires some data that we need to download first:


In [None]:
!pip install gensim
!pip install irds
!pip install python-terrier==0.10.0 nltk scikit-learn lightgbm

In [None]:
import pyterrier as pt

# Initialise PyTerrier only when it has not been started yet, so re-running
# this cell does not call init() a second time.
if not pt.started():
    pt.init(tqdm="notebook")

In [None]:
import nltk

# Fetch the NLTK resources used further down (word_tokenize and the English
# stop-word list).
for resource_name in ("punkt", "stopwords"):
    nltk.download(resource_name)

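We'll use the `nfcorpus` dataset again, as before. In this notebook, we'll use a subset of the queries (`nontopic`). The only reason for this is that it makes the computations faster. Note that the code cells below actually load the ANTIQUE collection (`irds:antique/train/split200-train`) instead; the `nfcorpus` line is commented out.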


In [None]:
# dataset = pt.get_dataset("irds:nfcorpus")

----------------------------------------------------------------------------------------------------------------------------------------------------------------THIS IS WHERE OUR CODE IS ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from gensim.models import Word2Vec
from pathlib import Path


# DATASET = pt.datasets.get_dataset("irds:antique/test/non-offensive")
# Dataset object used by every later cell (corpus, topics and qrels are read from it).
DATASET = pt.get_dataset('irds:antique/train/split200-train')
# Absolute path of the on-disk index directory.
IDX_PATH = Path("index").absolute()
# Build the index only when no data.properties file exists in IDX_PATH yet,
# so re-running this cell reuses a previously built index.
if not (IDX_PATH / "data.properties").is_file():
    pt.index.IterDictIndexer(
        str(IDX_PATH),
        # Metadata fields kept in the index with a numeric limit per field
        # (presumably the maximum stored length - confirm against the PyTerrier docs).
        meta={
            "docno": 32,
            "text": 131072,
        },
    ).index(DATASET.get_corpus_iter())

## Word2Vec


In [None]:
# Used for ranking: maps each docno to the document's original text.
# Despite the name, the values are the raw (untokenized) 'text' strings.
tokenized_dict = {
    entry['docno']: entry['text'] for entry in DATASET.get_corpus_iter()
}

In [None]:
from gensim.models import KeyedVectors
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics import precision_score, recall_score, f1_score

# Query subset: the first 80 topics of the dataset. evaluate_model looks up the
# query text for a qid in this frame.
antique_queries = DATASET.get_topics().head(80)

def cosine_similarity(a, b):
    """
    Compute the cosine similarity between two vectors.

    :param a: First vector (1-D array-like of numbers).
    :param b: Second vector, same length as ``a``.
    :return: ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has
        zero norm.
    """
    denominator = norm(a) * norm(b)
    # vectorize_document returns an all-zero vector for text without any
    # in-vocabulary word; dividing by a zero norm would yield NaN, and NaN
    # scores make the similarity sort order unpredictable.
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

def pad_vectors(vec1, vec2):
    """
    Bring two vectors to a common length by zero-padding the shorter one.

    Zeros are appended at the end of the shorter vector; vectors that already
    have equal length are returned unchanged.

    :return: The pair ``(vec1, vec2)`` after padding.
    """
    target_length = max(len(vec1), len(vec2))
    if len(vec2) < target_length:
        vec2 = np.pad(vec2, (0, target_length - len(vec2)))
    elif len(vec1) < target_length:
        vec1 = np.pad(vec1, (0, target_length - len(vec1)))
    return vec1, vec2


def vectorize_document(document, model):
    """
    Turn a text into a single vector: the mean of its known word vectors.

    The text is split on whitespace; words missing from ``model.wv`` are
    skipped. When no word is known, a zero vector of length
    ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in document.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


def rank_documents(query, documents_dict, model):
    """
    Score every document against the query and return them best-first.

    :param query: Query text.
    :param documents_dict: Mapping from document key to document text.
    :param model: Model passed on to ``vectorize_document``.
    :return: List of ``(key, similarity)`` pairs sorted by descending
        cosine similarity.
    """
    query_vector = vectorize_document(query, model)

    scores = {}
    for doc_key, doc_text in documents_dict.items():
        doc_vector = vectorize_document(doc_text, model)
        padded_query, padded_doc = pad_vectors(query_vector, doc_vector)
        scores[doc_key] = cosine_similarity(padded_query, padded_doc)

    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


def rank_documents_pd(query, qid, document_pd, model):
    """
    Rank the documents of a qrels frame against a query and return their labels.

    Every distinct ``docno`` in ``document_pd`` is scored by the cosine
    similarity between its averaged word vector and the query vector. The
    document text is read from the module-level ``tokenized_dict``.

    :param query: The query text.
    :param qid: Id of the query; only judgments with this qid contribute labels.
    :param document_pd: A pandas DataFrame with columns 'docno', 'qid' and 'label'.
    :param model: Model passed on to ``vectorize_document``.
    :return: A list with one label per distinct document, ordered by
        descending similarity to the query. Documents that have no judgment
        for ``qid`` get label 0.
    """
    query_vector = vectorize_document(query, model)

    sim = {}
    labels_for_query = {}
    rows = zip(document_pd['docno'], document_pd['qid'], document_pd['label'])
    for docno, row_qid, label in rows:
        # Remember the label per docno for this query in the same pass, so no
        # DataFrame filtering is needed per ranked document afterwards. Keying
        # on the qid also keeps the right label when a docno is judged for
        # more than one query.
        if row_qid == qid and docno not in labels_for_query:
            labels_for_query[docno] = label

        # A docno that occurs in several rows only needs to be scored once.
        if docno in sim:
            continue
        doc_vector = vectorize_document(tokenized_dict[docno], model)
        q, d = pad_vectors(query_vector, doc_vector)
        sim[docno] = cosine_similarity(q, d)

    # Sort documents by similarity, best first.
    sorted_docnos = sorted(sim.items(), key=lambda item: item[1], reverse=True)

    # Labels in ranked order; documents not judged for this query count as 0.
    return [labels_for_query.get(docno, 0) for docno, _ in sorted_docnos]


def filter_corpus_by_idf(corpus, threshold, idf_per_word):
    """
    Keep, per document, only the words whose IDF reaches the threshold.

    Words without an entry in ``idf_per_word`` are treated as having IDF 0.

    :param corpus: Iterable of documents, each an iterable of words.
    :param threshold: Minimum IDF a word needs to be kept.
    :param idf_per_word: Mapping from word to IDF weight.
    :return: List of filtered documents (lists of words), in corpus order.
    """
    def reaches_threshold(word):
        return idf_per_word.get(word, 0) >= threshold

    return [list(filter(reaches_threshold, doc)) for doc in corpus]

def ndcg(retrieved_relevance, gt_relevance, k=30):
    """
    Compute normalised discounted cumulative gain for one ranking.

    :param retrieved_relevance: Relevance labels in the order the system
        ranked the documents.
    :param gt_relevance: Ground-truth relevance labels of the query's judged
        documents (any order).
    :param k: Maximum rank considered. The effective cutoff is
        ``min(len(gt_relevance), k)``. Defaults to 30.
    :return: DCG of the ranking divided by the ideal DCG, or 0 when the ideal
        DCG is 0 (no relevant document, or an empty ground truth).
    """
    cutoff = min(len(gt_relevance), k)

    # Rank positions are 0-based, hence log2(position + 2).
    dcg = sum(
        relevance / np.log2(position + 2)
        for position, relevance in enumerate(retrieved_relevance[:cutoff])
    )

    # The ideal ranking lists the ground-truth labels from best to worst.
    ideal_sorted_relevance = sorted(gt_relevance, reverse=True)[:cutoff]
    idcg = sum(
        relevance / np.log2(position + 2)
        for position, relevance in enumerate(ideal_sorted_relevance)
    )
    return dcg / idcg if idcg else 0

def evaluate_model(model, filtered_test_corpus, qrel_test_corpus_pd, qids):
    """
    Evaluate the model on the test set.

    For every query id, the documents in ``qrel_test_corpus_pd`` are ranked
    with ``rank_documents_pd`` and the ranking is scored with ``ndcg`` against
    that query's judged labels.

    :param model: Model passed on to ``rank_documents_pd``.
    :param filtered_test_corpus: Not used by this function; kept so existing
        callers keep working.
    :param qrel_test_corpus_pd: DataFrame of judgments with columns 'qid',
        'docno' and 'label'.
    :param qids: Iterable of query ids to evaluate; each must be present in
        the module-level ``antique_queries``.
    :return: List with one nDCG score per query that has at least one
        judgment in ``qrel_test_corpus_pd``.
    """
    ndcg_scores = []
    for qid in qids:
        # Select the label column directly: this yields an empty list for a
        # query without judgments instead of relying on a row-wise apply over
        # an empty frame.
        is_this_query = qrel_test_corpus_pd["qid"] == qid
        groundtruth_ranking = qrel_test_corpus_pd.loc[is_this_query, "label"].tolist()
        if not groundtruth_ranking:
            continue

        query_text = antique_queries.loc[antique_queries['qid'] == qid].iloc[0]['query']
        ranked_documents_w2v = rank_documents_pd(query_text, qid, qrel_test_corpus_pd, model)

        ndcg_scores.append(ndcg(ranked_documents_w2v, groundtruth_ranking))

    return ndcg_scores

In [None]:
from nltk.tokenize import word_tokenize

corpus = [doc['text'] for doc in DATASET.get_corpus_iter()]
# Tokenize the lower-cased text. The IDF table used to filter these tokens is
# built with TfidfVectorizer, which lower-cases its vocabulary by default, and
# the NLTK stop-word list is lower-case as well; tokens that keep their
# capitals would never match either and would be dropped from the corpus.
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Fit TF-IDF on the raw corpus texts.
tfidf_model = TfidfVectorizer()
tfidf_vector = tfidf_model.fit_transform(corpus)

# One IDF weight per vocabulary term of the fitted vectorizer.
tfidf_terms = tfidf_model.get_feature_names_out()
idf_per_word = dict(zip(tfidf_terms, tfidf_model.idf_))

## Cross-validation

The following code cross-validates the model to select the best IDF threshold based on the nDCG score (note that the `ndcg` helper above cuts the ranking off at 30 results, not 10). This part isn't fully working yet: the validation results are poor, so it hasn't been incorporated into our main function.

In [None]:
threshold_values = [0.05, 0.1, 0.5]
# NOTE(review): with TfidfVectorizer's default smooth_idf=True every IDF weight
# is >= 1, so none of these thresholds removes a word - confirm the intended range.

# Initialize KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)

best_threshold = None
best_score = -np.inf
qids = DATASET.get_topics()[:3]["qid"].unique()
qrels = DATASET.get_qrels()

# docnos in corpus order, read once. tokenized_corpus was built from the same
# corpus iterator, so position i is taken to refer to the same document in both
# (assumes the iterator order is stable between passes).
corpus_docnos = [doc["docno"] for doc in DATASET.get_corpus_iter()]

# Train and evaluate models for each threshold
for threshold in threshold_values:
    print(f"Evaluating threshold {threshold}...")

    # Keep words that have an IDF weight of at least `threshold` and are not
    # stop words. idf_per_word (the full-corpus IDF table) is only read here,
    # never reassigned, so every threshold and the later training cell filter
    # with the same weights.
    filtered_corpus = [
        [
            word
            for word in doc
            if word in idf_per_word
            and idf_per_word[word] >= threshold
            and word not in stop_words
        ]
        for doc in tokenized_corpus
    ]
    scores = []

    # Instead of converting the entire corpus to an array, work with indices directly
    for train_index, test_index in kf.split(filtered_corpus):

        train_corpus = [filtered_corpus[i] for i in train_index]
        test_corpus = [filtered_corpus[i] for i in test_index]

        # docnos of the test fold, looked up by position
        docnos = [corpus_docnos[i] for i in test_index]
        # qrels whose document belongs to the test fold
        qrel_test_corpus_pd = qrels[qrels["docno"].isin(docnos)]

        model = Word2Vec(train_corpus, vector_size=100, window=5, min_count=1, sg=1)
        # NOTE(review): gensim's constructor appears to train already when it
        # is given sentences, so this call adds further epochs - confirm intent.
        model.train(train_corpus, total_examples=len(train_corpus), epochs=10)

        scores.extend(evaluate_model(model, test_corpus, qrel_test_corpus_pd, qids))

    # No query had judgments in any test fold: report NaN explicitly rather
    # than taking the mean of an empty list. NaN never beats best_score.
    average_score = np.mean(scores) if scores else float("nan")
    print(f"Threshold {threshold} average score: {average_score}")

    # Update best threshold based on evaluation score
    if average_score > best_score:
        best_score = average_score
        best_threshold = threshold

print(f"Best threshold: {best_threshold} with score: {best_score}")

## Training and Testing

Train the model again with the best threshold and evaluate it on several single queries.

In [None]:
# Apply the selected IDF threshold to the whole tokenized corpus: a word is
# kept when it has an IDF weight of at least best_threshold and is not a stop
# word.
filtered_corpus = [
    [
        token
        for token in tokens
        if token in idf_per_word
        and idf_per_word[token] >= best_threshold
        and token not in stop_words
    ]
    for tokens in tokenized_corpus
]


# Word2Vec hyper-parameters
epochs = 10
vector_size = 100  # dimensionality of the feature vectors (alternative tried: 300)
window = 5  # maximum distance between the current and predicted word within a sentence
mincount = 1  # ignores all words with total absolute frequency lower than this

model = Word2Vec(
    filtered_corpus,
    vector_size=vector_size,
    window=window,
    min_count=mincount,
    sg=1,
)
# NOTE(review): gensim's constructor appears to train already when it is given
# sentences, so this call adds `epochs` further passes - confirm intent.
model.train(filtered_corpus, total_examples=len(filtered_corpus), epochs=epochs)

# Persist the trained model and continue with the copy loaded from disk.
model_name = "word2vec_best_threshold_stopwords.model"
model.save(model_name)
model = Word2Vec.load(model_name)

import pandas as pd

# Example usage: rank the whole corpus for one query and show the best hits.
query = "dog"
N_TOP = 10

ranked_documents = rank_documents(query, tokenized_dict, model)

# Slice to exactly N_TOP hits. tokenized_dict already maps docno to the
# document text, so no further pass over the corpus is needed per hit.
top_rows = []
for rank, (docno, score) in enumerate(ranked_documents[:N_TOP]):
    print(docno, score)
    top_rows.append(
        {
            'docno': docno,
            'text': tokenized_dict[docno],
            'rank': rank,  # 0-based position in the ranking
            'score': score,
            'query': query,
        }
    )

# Build the frame once instead of concatenating row by row.
word2vec_df = pd.DataFrame(top_rows, columns=['docno', 'text', 'rank', 'score', 'query'])

word2vec_df