## Information Retrieval Passage (Re)ranking
This notebook contains a pipeline that takes the results of a passage full ranker, creates feature vectors with various basic and extended features and then trains models such as Ranknet and XGBOOST using these feature vectors. The results of the fullrankers are stored in the 'output' folder. For this notebook, the index, the passage length index, tokenised queries and tokenised passages are required. Some of these are quite big and as such will be saved to disk after running the notebook once.

## Module imports and pip installations

In [2]:
!python3 -V # please make sure this is python 3
%pip install torch numpy nltk textblob wheel memory_profiler pysos contractions gensim
%load_ext memory_profiler

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


Note: you may need to restart the kernel to use updated packages.


Contains all module imports

In [None]:
# file management
import os
import codecs
import importlib
import pickle
import datetime
import pysos
from pathlib import Path

# data structures
import json
import argparse
from collections import defaultdict, Counter
from zipfile import ZipFile
import pandas as pd

# math computations
import numpy as np
import torch.utils.data as data
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# NLP
import nltk
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from textblob import TextBlob as tb
from nltk.corpus import wordnet as wn
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
stopwords = set(stopwords.words("english"))
stemmer = PorterStemmer()

# computational processes
import multiprocessing
from process import process_single_passage
from functools import partial

# indexing and preprocessing
import pass_len_index
import pysos
import small_index
import process
import passage_preprocess

from small_index import SmallIndex
from pass_len_index import PassageLengthIndex
from passage_preprocess import PassagePP

# full rankers
import create_vector
import full_ranker

from full_ranker import Fullranker

# rerankers
import xgboost_ranker
import ranknet
import reranker

from ranknet import train, inference
from xgboost_ranker import transform_dict, transform_dict_experimental, XGBoostRanker
from reranker import Reranker

# feature vectors
import gensim
import gensim.downloader as api
from create_vector import VectorCreator
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from additional_features import get_awe_similarities, extend_features

# evaluation metrics
import evaluation_methods as evals

## Preprocessing

Load queries and labels into memory

In [None]:
def passage_loader(path):
    print("load passages from: {}".format(path))   
    passages = json.load(open(path, 'r', encoding="utf-8", errors="ignore"))    
    return passages

def query_loader(path):    
    print("load queries from: {}".format(path))
    queries = json.load(open(path, 'r'))    
    return queries


def label_loader(path):
    print("Load labels from: {}".format(path))
    labels = json.load(open(path, 'r'))    
    return labels

# NOTE: passages only get loaded in a seperate cell, if required for followup operations
queries_training = query_loader("../../data/training_queries.json")
queries_validation = query_loader("../../data/validation_queries.json")
queries_test = query_loader("../../data/test_queries.json")

labels_training = label_loader("../../data/training_labels.json")
labels_validation = label_loader("../../data/validation_labels.json")

Create a data class that holds all our data such as queries, labels and passages

In [4]:
class Data:
    def __init__(self, queries_training, queries_validation, 
                 queries_test, labels_training, labels_validation):
        # when creating index, passages could be provided
        self.passages = None
        self.tokenised_passages = None
        self.queries_training = queries_training
        self.queries_validation = queries_validation
        self.queries_test = queries_test
        self.labels_training = labels_training
        self.labels_validation = labels_validation
        

data = Data(queries_training, queries_validation,
                 queries_test, labels_training, labels_validation)

We also create a Search Engine wrapper class for the full ranker

In [None]:
class SearchEngine:
    def __init__(self, data):
        self.full_ranker = None
        self.name = "None"
        self.data = data

    def create_file(self, filename):
        """Generate a filename specific to this named search engine
        """
        assert isinstance(self.name, str)
        assert isinstance(filename, str)
        return self.name + "_" + filename
    
    def sort_scores(self, scores):
        """rank the calclulated scores from largest to smallest"""
        for q_id, p2score in tqdm(scores.items()):
            sorted_p2score=sorted(p2score.items(), key=lambda x:x[1], reverse = True)
            scores[q_id]=sorted_p2score
            
        return scores
    
    def write_full_rank_results(self, scores, data_name="training"):
        """Convert a dictionary of queries and the scored passages to a text file"""

        # create result file name
        timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%d')
        result_file = self.create_file(f"{timestamp}full_ranking_{data_name}_result.text")

        # open text file and write top 100 results into file together with its features
        with codecs.open(f"../../pipeline/output/{result_file}", "w", "utf-8") as file:
            for q_id, p2score in scores.items():
                ranking=0

                # loop through top 100 for each query and write the data for each passage
                for (p_id, score) in p2score[:100]:
                    ranking+=1         
                    feature_1 = score
                    feature_2 = 0  # len(self.data.passages[p_id]) -> this is for the reranker later           
                    file_title = self.create_file(f"full_ranking_on_the_{data_name}_set")
                    file.write('\t'.join([q_id, p_id, str(ranking), str(feature_1), str(feature_2), file_title])+os.linesep) 

        # output the result file
        print("Produce file {}".format(f"output/{result_file}"))

Functions for preprocessing the queries

In [5]:
def process_queries(queries, lemmatizer, stemmer):
    """Preprocesses all queries"""
    
    print("Start preprocessing the queries.")
    # initialize the preprocessing results dict
    queries_tokenised = {}
    
    # for every query
    for query_id in tqdm(queries.keys()):
        # create a text blob of the current query
        text_blob = tb(queries[query_id]).words
        
        # lower words and remove stopwords from text blob
        no_stop = [stemmer.stem(word.lower()) for word in text_blob if not word in stopwords]
        
        # store the preprocessed result
        queries_tokenised[query_id] = no_stop
    return queries_tokenised


def process_all_query_data(data, stemmer=None, lemmatizer=None):
    data.tokenised_queries_training = process_queries(data.queries_training, lemmatizer, stemmer)
    data.tokenised_queries_validation = process_queries(data.queries_validation, lemmatizer, stemmer)
    data.tokenised_queries_test = process_queries(data.queries_test, lemmatizer, stemmer)
    return data

# We use the PorterStemmer from nltk
stemmer = PorterStemmer()

# The queries are preprocessed and stored in our data class
data = process_all_query_data(data, stemmer=stemmer)



Start preprocessing the queries.


100%|██████████| 8000/8000 [00:01<00:00, 6153.69it/s]


Start preprocessing the queries.


100%|██████████| 200/200 [00:00<00:00, 5881.09it/s]


Start preprocessing the queries.


100%|██████████| 200/200 [00:00<00:00, 5713.07it/s]


Cell for preprocessing passages and loading them into memory (also saving to disk if there is none on disk yet)

In [None]:
get_tokenised_passages = True

if get_tokenised_passages == True:
        try:
            # try to load tokenised passages from disk
            with open("tokenised_passages.pickle", 'rb') as f:
                        tokenised_passages = pickle.load(f)
        except:
            # using a seperate main cell in order to perform multiprocessing during the preprocessing of passages
            if __name__ ==  '__main__': 
                            # load passages into data class
                            data.passages = passage_loader("../../data/passages_small.json")
                            index_name = "small"

                            # create passage preprocess object
                            indexer = PassagePP(data)
            
                            # call preprocessing function and load passages into memory
                            tokenised_passages = indexer.process_passages(stopwords, stemmer)

                            # save passages to disk using pickling
                            with open("tokenised_passages.pickle", 'wb') as f:
                                pickle.dump(tokenised_passages, f)

## Index loading

Attempt to load the indexes, if they are not on disk create them new and save them to disk

In [7]:
# create a document length index for calculating the BM25 score
if __name__ ==  '__main__': 
    try:
        # init passage length index object
        pass_len = PassageLengthIndex(data)

        # attempt to load index from disk
        pass_len_index = pass_len.load_pass_len_index("small")
    except FileNotFoundError:

        # load passages using loader
        print("Failed to load, because no stored passage length index is found. Will now create a new index.")
        if not data.passages:
            data.passages = passage_loader("../../data/passages_small.json")

        # init passage length index object
        pass_len_index = PassageLengthIndex(data)
        pickle_name = 'small'

        # create passage length index, save to disk and load into memory
        pass_len_index = pass_len_index.create_pass_len_index(stopwords, pickle_name, tokenised_pass=tokenised_passages, stemmer=stemmer,  result=True)


# use a seperate main call, for the multiprocessing part of create index
if __name__ ==  '__main__': 
    try:
        # init inverted index object
        indexer = SmallIndex(data)

        # attempt to load inverted index to memory
        index = indexer.load_index("small")
    except FileNotFoundError:
        print("Failed to load, because no stored index is found. Will now create a new index.")
        if not data.passages:
            data.passages = passage_loader("../../data/passages_small.json")

        indexer = SmallIndex(data)
        index_name = "small"
        
        # create inverted index with tokens from passages as keys and data for each token as value
        index = indexer.create_index(index_name, stopwords, stemmer, tokenised_pass=tokenised_passages, result=True)

Trying to load index from disk...
Successfully loaded index to memory!
Trying to load index from disk...
Successfully loaded index to memory!


## Full-Ranking
Given a user query, full-ranking aims to quickly and roughly rank all passages and return a ranked list of passages.

### 4.1
a) Create a Fullranker class with scoring methods <br>
b) Create search engine instance and assign a fullranker to it <br>
c) Assign a name to the search engine

### 4.2
d) Get full ranker training scores on the tokenised training queries <br>
e) Get full ranker validation scores on the tokenised validation queries <br>
f) Get full ranker test scores on the tokenised test queries

Search engine properties are made and you can choose which of the 3 datasets you would like to run for scoring, after which you run the next cells.

In [None]:
# create a full ranker
franker = Fullranker(index)

# create the search engine and assign properties
search_engine = SearchEngine(data=data)
search_engine.full_ranker = franker
search_engine.name = "bm25_temp_results"

SCORE_TRAINING = False
SCORE_VALIDATION = True
SCORE_TEST = False

# parameters for the full rankers, alpha=0 for non smoothed

parameters = {
'alpha': 0.7,
'b':0.6,
'k1':0.7,
'l':pass_len_index
}

# save functions in dict for selecting later
full_rankers = {'tf':search_engine.full_ranker.tf_score,
                'tfidf':search_engine.full_ranker.tf_idf_score,
                'bm25':search_engine.full_ranker.BM25_score,
                'ql':search_engine.full_ranker.query_likelihood
                }

chosen_ranker = 'bm25'
ranker_function = full_rankers[chosen_ranker]

Following cell computes the scores on the datasets chosen above

In [None]:
if SCORE_TRAINING:
    # For each query, calculte scores of all passages on the training set.
    training_scores = ranker_function(search_engine.data.tokenised_queries_training, **parameters)

    # rank the calclulated scores from largest to smallest.
    training_scores = search_engine.sort_scores(training_scores)

    # write results to text file
    search_engine.write_full_rank_results(training_scores, data_name="training")

if SCORE_VALIDATION:
    # For each query, calculte scores of all passages on the validation set.
    val_scores = ranker_function(search_engine.data.tokenised_queries_validation, **parameters)

    # rank the calclulated scores from largest to smallest.
    val_scores = search_engine.sort_scores(val_scores)

    # write results to text file
    search_engine.write_full_rank_results(val_scores, data_name="validation")     

if SCORE_TEST:
    # For each query, calculte scores of all passages on the test set.
    test_scores = ranker_function(search_engine.data.tokenised_queries_test, **parameters)

    # rank the calclulated scores from largest to smallest.
    test_scores = search_engine.sort_scores(test_scores)

    # write results to text file
    search_engine.write_full_rank_results(test_scores, data_name="test")  


## Vector Creation

Creates the basic feature vector with BM25 as main score (though the chosen full ranker results can be changed if desired using the 'path_keywords' list of tokens) and the other models as additional features. Also contains features such as query term count and passage term count

In [2]:
# obtain result path based on keywords
def get_results(search_tokens, large=False):
    results = []
    search_path = '../../pipeline/output/'
    
    # check all full ranker output folders
    for root, dir, files in os.walk(search_path):
        for file in files:
            results.append(str(os.path.join(root, file)))

    # loop through found filenames
    for path in results:

            # check if all keywords are present in path name
            if all(tokens in path for tokens in search_tokens):

                # if using results from fullranker on the large passages dataset, return those
                if large == True:
                    if 'large' in path:
                        return path
                else:
                    if 'large' not in path:
                        return path

# get feature vector for reranking
def get_features(dataset='training', path_keywords=['bm25', 'training']):
    path = get_results(path_keywords)
    print(path)

    # insert labels and queries for each dataset in the dict
    params = {
        'training':[data.labels_training, data.tokenised_queries_training],
        'validation':[data.labels_validation, data.tokenised_queries_validation],
        'test':[None, data.tokenised_queries_test]
    }

    # init VectorCreator object to get feature vectors
    create_vec = VectorCreator(index, pass_len_index, params[dataset][0], params[dataset][1], path)

    # get the features 
    features = create_vec.get_vectors()
    return features

**Load or create feature vectors**

In [170]:
# load locally saved feature vectors for training
if os.path.isfile("features_training.pickle"):
    with open("features_training.pickle", 'rb') as f:
        features_training = pickle.load(f)
else:
    features_training = get_features('training', ['bm25', 'training'])
    with open("features_training.pickle", 'wb') as f:
        pickle.dump(features_training, f)

# load locally saved feature vectors for validation
if os.path.isfile("features_validation.pickle"):
    with open("features_validation.pickle", 'rb') as f:
        features_validation = pickle.load(f)
else:
    features_validation = get_features('validation', ['bm25', 'validation'])
    with open("features_validation.pickle", 'wb') as f:
        pickle.dump(features_validation, f)

# load locally saved feature vectors for test
if os.path.isfile("features_test.pickle"):
    with open("features_test.pickle", 'rb') as f:
        features_test = pickle.load(f)
else:
    features_test = get_features('test', ['bm25', 'test'])
    with open("features_test.pickle", 'wb') as f:
        pickle.dump(features_test, f)


Extend features with LSA and AWE

In [42]:
# load word2vec model
awe_model = gensim.downloader.load("glove-twitter-50")

In [92]:
# enable or disable additional features
USE_LSA_FEATURES = True
USE_AWE_FEATURES = True




if USE_AWE_FEATURES == True:
    print("get awe matrix for validation")
    qids = list(features_validation.keys())
    pids = [list(features_validation[q_id].keys()) for q_id in qids]
    distances_awe_validation = get_awe_similarities(qids, data.tokenised_queries_validation, pids, data.tokenised_passages, awe_model)

    print("get awe matrix for test")
    qids = list(features_test.keys())
    pids = [list(features_test[q_id].keys()) for q_id in qids]
    distances_awe_test = get_awe_similarities(qids, data.tokenised_queries_test, pids, data.tokenised_passages, awe_model)

    print("get awe matrix for training")
    qids = list(features_training.keys())
    pids = [list(features_training[q_id].keys()) for q_id in qids]
    distances_awe_training = get_awe_similarities(qids, data.tokenised_queries_training, pids, data.tokenised_passages, awe_model)





get awe matrix for validation


100%|██████████| 200/200 [00:03<00:00, 52.55it/s]


get awe matrix for test


100%|██████████| 200/200 [00:03<00:00, 51.71it/s]


get awe matrix for training


100%|██████████| 7983/7983 [02:33<00:00, 52.01it/s]


In [None]:
USE_LSA_FEATURES = True
features_training = extend_features(features_training, data.queries_training, data.passages, awe_scores=distances_awe_training, awe=USE_AWE_FEATURES, lsa=USE_LSA_FEATURES)
features_validation = extend_features(features_validation, data.queries_validation, data.passages, awe_scores=distances_awe_validation, awe=USE_AWE_FEATURES, lsa=USE_LSA_FEATURES)
features_test = extend_features(features_test, data.queries_test, data.passages, awe_scores=distances_awe_test, awe=USE_AWE_FEATURES, lsa=USE_LSA_FEATURES)

## Reranking

Create an object that holds all the hyperparameters for RankNet and set the seeds

In [15]:
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=30)
parser.add_argument("--lr", type=float, default=0.001)
parser.add_argument("--input_size", type=int, default=7)
parser.add_argument("--hidden_size1", type=int, default=256)
parser.add_argument("--hidden_size2", type=int, default=512)
parser.add_argument("--output_size", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=512)
parser.add_argument("--random_seed", type=int, default=0)
args = parser.parse_known_args()[0]


np.random.seed(args.random_seed)
torch.manual_seed(args.random_seed)
torch.cuda.manual_seed_all(args.random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [176]:
importlib.reload(xgboost_ranker)
importlib.reload(reranker)
importlib.reload(ranknet)

# init reranker object
rranker = Reranker(features_training, features_validation, 'm4-reranker-tuning', ranknet_args=args, features_test=features_test)

In [177]:
# run rerankers and get the result file paths for evaluation
xgboost_results_path = rranker.xgboost_reranker(test=False, awe=True, lsa=True)
ranknet_results_path = rranker.ranknet_reranker(test=False, awe=True, lsa=True)

Finished transforming dict to df, fitting...
Fit successful, predicting...
predicting...
Predict successful, saving results...
Produce file reranker_output/test/bm25_awe_and_lsa_vector_xgboost_test_result.text


## Evaluation

In [143]:


def run_evaluation(df, model_name, path="../output/re_ranking_validation_result.text", debug=False, tuning_results=False):
    print(f"Name: {model_name}, Path: {path}")
    
    # create score storage (for every query a list of passages)
    scores = defaultdict(list)
    
    # open the results from the model path
    with codecs.open(path, "r", "utf-8") as file:
        # store every query result in the scores dictionary
        for line in file.readlines():
            content = line.split('\t')
            scores[content[0]].append(content[1])
    
    # Flag for debugging
    if debug:
        print(f"Number of passages: {len(list(scores.items())[0][1])}")
        print(f"Number of queries: {len(scores)}")
    
    # run MRR evaluation
    mrr = evals.MRR(queries=queries_validation, labels=labels_validation)
    mrr_res = mrr.evaluate(scores, k=100)
    

    # run DCG evaluation
    dcg = evals.DCG(queries=queries_validation, labels=labels_validation)
    dcg_res = dcg.evaluate(scores, k=100)
    

    # run nDCG evaluation
    ndcg = evals.nDCG(queries=queries_validation, labels=labels_validation)
    ndcg_res = ndcg.evaluate(scores, k=20)
    
    
    # run nDCG evaluation
    precision = evals.Precision(queries=queries_validation, labels=labels_validation)
    precision_res = precision.evaluate(scores, k=20)
    
    
    # run MAP evaluation (mean for all queries in total)
    map = evals.MAP(queries=queries_validation, labels=labels_validation)
    # print(map.labels)
    map_res = map.evaluate(scores)
    

    if debug:
        print("Full-ranking:")
        print("............")
        print('MRR@{}: {:.4f}'.format(100, mrr_res))
        print('DCG@{}: {:.4f}'.format(100, dcg_res))
        print('nDCG@{}: {:.4f}'.format(20, ndcg_res))
        print('Precision@{}: {:.4f}'.format(20, precision_res))
        print('MAP: {:.4f}'.format(map_res))
        print("............")
        print("\n")
    
    if tuning_results is False:
    # add model_name, MRR, DCG, NDCG, MAP
        df.loc[len(df.index)] = [model_name, mrr_res, dcg_res, ndcg_res, map_res, precision_res]
    else:
        return model_name, mrr_res, dcg_res, ndcg_res, map_res, precision_res

# initialize dataframe
df = pd.DataFrame(columns=['model_name', 'MRR', 'DCG', 'NDCG@20', "MAP", "Precision@20"])



run_evaluation(df, model_name=f"XGBoost BM25 AWE+LSA vector", path=xgboost_results_path)

run_evaluation(df, model_name=f"RankNet BM25 AWE+LSA vector", path=ranknet_results_path)


# Results from old runs are shown here
run_evaluation(df, model_name=f"XGBoost BM25 AWE+LSA  vector", path='reranker_output/validation/bm25_awe_and_lsa_vector_xgboost_validation_result.text')

run_evaluation(df, model_name=f"RankNet BM25 AWE+LSA  vector", path='reranker_output/validation/bm25_awe_and_lsa_vector_ranknet_validation_result.text')

run_evaluation(df, model_name=f"XGBoost BM25 AWE vector", path='reranker_output/validation/bm25_awe_vector_xgboost_validation_result.text')

run_evaluation(df, model_name=f"RankNet BM25 AWE vector", path='reranker_output/validation/bm25_awe_vector_ranknet_validation_result.text')

run_evaluation(df, model_name=f"XGBoost-BM25 basic", path=f"reranker_output/validation/bm25_basic_vector_xgboost_validation_result.text")

run_evaluation(df, model_name=f"Ranknet-BM25 basic", path=f"reranker_output/validation/bm25_basic_vector_ranknet_validation_result.text")

run_evaluation(df, model_name=f"BM25", path=f"../../milestones/milestone_2/output/small_passage_results/validation_results/M2_tok-low-stop-stem_bm25_full_ranking_validation_result.text")

display(df)

Name: XGBoost BM25 AWE+LSA vector, Path: reranker_output/validation/bm25_awe_and_lsa_vector_xgboost_validation_result.text
Name: RankNet BM25 AWE+LSA vector, Path: reranker_output/validation/bm25_awe_and_lsa_vector_ranknet_validation_result.text
Name: XGBoost BM25 AWE+LSA  vector, Path: reranker_output/validation/bm25_awe_and_lsa_vector_xgboost_validation_result.text
Name: RankNet BM25 AWE+LSA  vector, Path: reranker_output/validation/bm25_awe_and_lsa_vector_ranknet_validation_result.text
Name: XGBoost BM25 AWE vector, Path: reranker_output/validation/bm25_awe_vector_xgboost_validation_result.text
Name: RankNet BM25 AWE vector, Path: reranker_output/validation/bm25_awe_vector_ranknet_validation_result.text
Name: XGBoost-BM25 basic, Path: reranker_output/validation/bm25_basic_vector_xgboost_validation_result.text
Name: Ranknet-BM25 basic, Path: reranker_output/validation/bm25_basic_vector_ranknet_validation_result.text
Name: BM25, Path: ../../milestones/milestone_2/output/small_passage_

Unnamed: 0,model_name,MRR,DCG,NDCG@20,MAP,Precision@20
0,XGBoost BM25 AWE+LSA vector,0.313787,2.272981,0.306167,0.251015,0.108051
1,RankNet BM25 AWE+LSA vector,0.324395,2.288053,0.315676,0.262394,0.110051
2,XGBoost BM25 AWE+LSA vector,0.313787,2.272981,0.306167,0.251015,0.108051
3,RankNet BM25 AWE+LSA vector,0.324395,2.288053,0.315676,0.262394,0.110051
4,XGBoost BM25 AWE vector,0.301692,2.27462,0.299996,0.242378,0.109051
5,RankNet BM25 AWE vector,0.314573,2.290427,0.311641,0.252779,0.110551
6,XGBoost-BM25 basic,0.298728,2.274026,0.298606,0.239628,0.110051
7,Ranknet-BM25 basic,0.318111,2.288484,0.312633,0.256371,0.110301
8,BM25,0.316362,2.282817,0.3135,0.257047,0.111051
