<a href="https://colab.research.google.com/github/hammadkhann/Effective-Dense-Retrieval/blob/main/Neural_Reranking(RD3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PyTerrier Setup**

In [None]:
!pip -q install python-terrier
import pyterrier as pt
pt.init()

PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


## **Importing useful libraries**



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle
from google.colab import drive
from pyterrier.measures import *

# Disable pandas' row/column truncation so result tables are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## **Connect Google Drive**


In [None]:
# Mount Google Drive; later cells read run files and model checkpoints from
# /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Preparing testing dataset**

## **DPH(Sparse Retrieval)**

## TREC-2019

In [None]:
# First-stage retriever over the pre-built stemmed MS MARCO passage index:
# returns up to 100 results per query and attaches docno/text metadata.
br = pt.BatchRetrieve.from_dataset(
    'msmarco_passage',
    'terrier_stemmed_text',
    num_results=100,
    metadata=['docno', 'text'],
    verbose=True,
)

16:52:42.460 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.


In [None]:
# Keep only the TREC-2019 test topics that appear in the relevance judgements.
topics = pt.get_dataset('msmarco_passage').get_topics('test-2019')
topics = pt.get_dataset('msmarco_passage').get_qrels('test-2019')[["qid"]].drop_duplicates().merge(topics, on=['qid'])

# Retrieve in batches of 10 queries and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; pd.concat keeps each batch's rows and index as-is.
dph_batches = list(br.transform_gen(topics, batch_size=10))
dph_2019 = pd.concat(dph_batches) if dph_batches else pd.DataFrame()
print("Number of topics: ", len(topics))

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/3 [00:00<?, ?q/s]

Number of topics:  43


In [None]:
# (rows, columns) of the DPH results collected for TREC-2019.
dph_2019.shape

(4205, 7)

## TREC-2020

In [None]:
# Keep only the TREC-2020 test topics that appear in the relevance judgements.
topics = pt.get_dataset('msmarco_passage').get_topics('test-2020')
topics = pt.get_dataset('msmarco_passage').get_qrels('test-2020')[["qid"]].drop_duplicates().merge(topics, on=['qid'])

# Retrieve in batches of 10 queries and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; pd.concat keeps each batch's rows and index as-is.
dph_batches = list(br.transform_gen(topics, batch_size=10))
dph_2020 = pd.concat(dph_batches) if dph_batches else pd.DataFrame()

print("Number of topics: ", len(topics))

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/10 [00:00<?, ?q/s]

BR(DPH):   0%|          | 0/4 [00:00<?, ?q/s]

Number of topics:  54


In [None]:
# (rows, columns) of the DPH results collected for TREC-2020.
dph_2020.shape

(5329, 7)

## **ColBERT(Dense Retrieval)**

## TREC-2019

In [None]:
# Open the pre-built stemmed MS MARCO passage index and print its statistics.
index_ref = pt.get_dataset("msmarco_passage").get_index("terrier_stemmed_text")
index = pt.IndexFactory.of(index_ref)
print(index.getCollectionStatistics())

16:52:53.864 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.
Number of documents: 8841823
Number of terms: 1170682
Number of postings: 215238456
Number of fields: 1
Number of tokens: 288759529
Field names: [text]
Positions:   false



In [None]:
# Load the pre-computed ColBERT end-to-end run for TREC-2019 from Drive.
colbert_2019 = pt.io.read_results("/content/drive/MyDrive/E2E.2019.res")

# Fetch the passage text for every retrieved docno from the index metadata.
# The comprehension replaces a `for index, row in ...iterrows()` loop whose
# loop variable overwrote the Terrier `index` object with a row label.
# NOTE(review): the docno is passed as the integer metadata id - confirm that
# docnos coincide with internal docids for this index.
meta = index.getMetaIndex()
colbert_2019['text'] = [meta.getItem('text', int(docno)) for docno in colbert_2019['docno']]

# Attach the query text for each qid.
topics_2019 = pt.get_dataset('msmarco_passage').get_topics('test-2019')
colbert_2019 = pd.merge(colbert_2019, topics_2019, on='qid', how='left')

# Keep only the topics that are present in the DPH run (the judged topics).
colbert_2019 = colbert_2019[colbert_2019['qid'].isin(dph_2019['qid'].unique())]
# Keep the 100 highest-scoring passages for each remaining topic.
colbert_2019 = colbert_2019.groupby('qid').apply(lambda x : x.sort_values(by = 'score', ascending = False).head(100).reset_index(drop = True))
# Discard the index produced by groupby/apply in favour of a flat one.
colbert_2019.reset_index(drop=True, inplace=True)
colbert_2019.shape

(4300, 7)

## TREC-2020

In [None]:
# Load the pre-computed ColBERT end-to-end run for TREC-2020 from Drive.
colbert_2020 = pt.io.read_results("/content/drive/MyDrive/E2E.2020.res")
# Re-open the Terrier index so this cell does not rely on `index` having
# survived earlier cells untouched.
index = pt.IndexFactory.of(pt.get_dataset("msmarco_passage").get_index("terrier_stemmed_text"))

# Fetch the passage text for every retrieved docno from the index metadata.
# The comprehension replaces a `for index, row in ...iterrows()` loop whose
# loop variable overwrote the Terrier `index` object with a row label.
# NOTE(review): the docno is passed as the integer metadata id - confirm that
# docnos coincide with internal docids for this index.
meta = index.getMetaIndex()
colbert_2020['text'] = [meta.getItem('text', int(docno)) for docno in colbert_2020['docno']]

# Attach the query text for each qid.
topics_2020 = pt.get_dataset('msmarco_passage').get_topics('test-2020')
colbert_2020 = pd.merge(colbert_2020, topics_2020, on='qid', how='left')

# Keep only the topics that are present in the DPH run (the judged topics).
colbert_2020 = colbert_2020[colbert_2020['qid'].isin(dph_2020['qid'].unique())]
# Keep the 100 highest-scoring passages for each remaining topic.
colbert_2020 = colbert_2020.groupby('qid').apply(lambda x : x.sort_values(by = 'score', ascending = False).head(100).reset_index(drop = True))
# Discard the index produced by groupby/apply in favour of a flat one.
colbert_2020.reset_index(drop=True, inplace=True)
colbert_2020.shape

16:54:01.754 [main] WARN org.terrier.structures.BaseCompressingMetaIndex - Structure meta reading data file directly from disk (SLOW) - try index.meta.data-source=fileinmem in the index properties file. 1.9 GiB of memory would be required.


(5400, 7)

# **PyTerrier-ColBERT Setup**

In [None]:
# Install the pyterrier_colbert package straight from its GitHub repository
# (unpinned: whatever the default branch currently contains).
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_colbert.git

Collecting git+https://github.com/terrierteam/pyterrier_colbert.git
  Cloning https://github.com/terrierteam/pyterrier_colbert.git to /tmp/pip-req-build-93qr82fo
  Running command git clone -q https://github.com/terrierteam/pyterrier_colbert.git /tmp/pip-req-build-93qr82fo
Collecting ColBERT@ git+https://github.com/cmacdonald/ColBERT.git@v0.2#egg=ColBERT
  Cloning https://github.com/cmacdonald/ColBERT.git (to revision v0.2) to /tmp/pip-install-nk8socw2/colbert_889cbced46f64343916962deb9fca93b
  Running command git clone -q https://github.com/cmacdonald/ColBERT.git /tmp/pip-install-nk8socw2/colbert_889cbced46f64343916962deb9fca93b
  Running command git checkout -b v0.2 --track origin/v0.2
  Switched to a new branch 'v0.2'
  Branch 'v0.2' set up to track remote branch 'v0.2' from 'origin'.
Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 5.0 MB/s 
[?25hCollecting ujson
  Downloading ujson-4.3.0-cp

In [None]:
from pyterrier_colbert.ranking import ColBERTFactory

## Updating the pyterrier-ColBERT load methods to support the latest version of the transformers library

In [None]:
# Register the git-lfs apt repository.
# NOTE(review): this pipes a remotely fetched script into a root shell.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git lfs install
# Clone the Condenser model repository; `_load_model` later loads it from
# /content/condenser/.
!git clone https://huggingface.co/Luyu/condenser

Detected operating system as Ubuntu/bionic.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 43 not upgraded.
Need to get 6,526 kB of archives.
After this operation, 14.7 MB of additional disk space will be used.
Get:1 https://packagecloud.io/github/git-lfs/ubuntu bionic/main amd64 git-lfs amd64 3.0.2 [6,526 kB]
Fetched 6,526 kB in 1s (10.3 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl

In [None]:
import os
import ujson
import torch
import random

from collections import defaultdict, OrderedDict

from colbert.parameters import DEVICE
import colbert.utils.utils
from colbert.utils.utils import print_message


def _load_checkpoint(path, model, optimizer=None, do_print=True):
    if do_print:
        print_message("#> Loading checkpoint", path)

    checkpoint = torch.load(path, map_location='cpu')

    state_dict = checkpoint['model_state_dict']
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        name = k
        if k[:7] == 'module.':
            name = k[7:]
        new_state_dict[name] = v

    checkpoint['model_state_dict'] = new_state_dict

    model.load_state_dict(checkpoint['model_state_dict'], strict=False)

    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'], strict=False)

    if do_print:
        print_message("#> checkpoint['epoch'] =", checkpoint['epoch'])
        print_message("#> checkpoint['batch'] =", checkpoint['batch'])

    return checkpoint

# Monkey-patch: replace the loader in colbert.utils.utils with the lenient
# `_load_checkpoint` defined in this notebook.
colbert.utils.utils.load_checkpoint = _load_checkpoint
# After the assignment above the attribute *is* `_load_checkpoint`, so its
# __globals__ is the namespace this cell runs in. This therefore binds the bare
# name `load_checkpoint` here, which `_load_model` calls.
colbert.utils.utils.load_checkpoint.__globals__['load_checkpoint'] = _load_checkpoint
from colbert.modeling.colbert import ColBERT

def _load_model(args, do_print=True):
    """Build a ColBERT model from the local Condenser weights and load a checkpoint into it.

    Args:
        args: Object exposing ``query_maxlen``, ``doc_maxlen``, ``dim``,
            ``similarity``, ``mask_punctuation`` and ``checkpoint``.
        do_print: Forwarded to the logging helper and to ``load_checkpoint``.

    Returns:
        ``(model, checkpoint)``: the model in eval mode on ``DEVICE`` and the
        dict returned by ``load_checkpoint``.
    """
    # Initialise from the cloned Condenser directory, then move to DEVICE.
    model = ColBERT.from_pretrained(
        '/content/condenser/',
        query_maxlen=args.query_maxlen,
        doc_maxlen=args.doc_maxlen,
        dim=args.dim,
        similarity_metric=args.similarity,
        mask_punctuation=args.mask_punctuation,
    ).to(DEVICE)

    print_message("#> Loading model checkpoint.", condition=do_print)

    # Overlay the fine-tuned weights stored in the checkpoint file.
    checkpoint = load_checkpoint(args.checkpoint, model, do_print=do_print)

    model.eval()
    return model, checkpoint

In [None]:
import colbert.evaluation.load_model
import pyterrier_colbert.ranking
# Monkey-patch both places that hold a reference to the model loader so that
# they use the notebook's `_load_model` (which starts from the Condenser weights).
colbert.evaluation.load_model.load_model = _load_model
pyterrier_colbert.ranking.load_model = _load_model

# **Evaluation on TREC 2019**

## **Loading model checkpoints**

### Baseline

In [None]:
# Baseline ColBERT checkpoint. The factory gets no index root/name (both None),
# so it is only used here for its text scorer.
colbert_checkpoint="/content/drive/MyDrive/colbert-baseline-50000.dnn" 
colbert_factory = ColBERTFactory(colbert_checkpoint, None, None)
# Re-rank the pre-computed ColBERT end-to-end candidates for TREC-2019.
colbert_pipeline_ann = pt.transformer.SourceTransformer(colbert_2019) >>  colbert_factory.text_scorer()
# Re-rank the DPH candidates for TREC-2019.
colbert_pipeline = pt.transformer.SourceTransformer(dph_2019) >>  colbert_factory.text_scorer()

NameError: ignored

### RD3: Multi-Task Learning (MTL)   

In [None]:
# Multi-task-learning checkpoint. The factory gets no index root/name (both
# None), so it is only used here for its text scorer.
mtl_checkpoint="/content/drive/MyDrive/mtl_v2_cls+tok_losses-50000.dnn" 
mtl_factory = ColBERTFactory(mtl_checkpoint, None, None)
# Re-rank the pre-computed ColBERT end-to-end candidates for TREC-2019.
mtl_pipeline_ann = pt.transformer.SourceTransformer(colbert_2019) >>  mtl_factory.text_scorer()
# Re-rank the DPH candidates for TREC-2019.
mtl_pipeline = pt.transformer.SourceTransformer(dph_2019) >>  mtl_factory.text_scorer()

  warn("No index_root and index_name specified - no index ranking possible")
  warn("Faiss not installed. You cannot do retrieval")
Some weights of the model checkpoint at /content/condenser/ were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'bert.embeddings.position_ids']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT

[Dec 04, 16:59:14] #> Loading model checkpoint.
[Dec 04, 16:59:14] #> Loading checkpoint /content/drive/MyDrive/mtl_v2_cls+tok_losses-50000.dnn


## **Experiment**

### DPH(Sparse Retrieval) Re-ranking

In [None]:
# Compare the baseline and MTL re-rankers on the DPH candidates (TREC-2019),
# evaluating only the topics that have relevance judgements.
msmarco_passage = pt.get_dataset('msmarco_passage')
pt.Experiment(
    [colbert_pipeline, mtl_pipeline],
    msmarco_passage.get_topics('test-2019'),
    msmarco_passage.get_qrels('test-2019'),
    names=['DPH >> ColBERT', 'DPH >> ColBERT-mtl-objective'],
    eval_metrics=[nDCG@10, AP(rel=2)@100, RR(rel=2)@10],
    filter_by_qrels=True,
)

Unnamed: 0,name,nDCG@10,AP(rel=2)@100,RR(rel=2)@10
0,DPH >> ColBERT,0.655238,0.337195,0.834884
1,DPH >> ColBERT-increase-cls-similarity,0.669187,0.340263,0.865725
2,DPH >> ColBERT-add-token-overlap,0.666822,0.340682,0.810299
3,DPH >> ColCondenser,0.662346,0.345593,0.824612
4,DPH >> ColTaCL,0.660369,0.334005,0.85155
5,DPH >> ColBERT-mtl-objective,0.640291,0.335387,0.868605


### ColBERT(Dense Retrieval) Re-ranking

In [None]:
# Compare the baseline and MTL re-rankers on the ColBERT end-to-end candidates
# (TREC-2019), evaluating only the topics that have relevance judgements.
msmarco_passage = pt.get_dataset('msmarco_passage')
pt.Experiment(
    [colbert_pipeline_ann, mtl_pipeline_ann],
    msmarco_passage.get_topics('test-2019'),
    msmarco_passage.get_qrels('test-2019'),
    names=['ANN-ColBERT >> ColBERT', 'ANN-ColBERT >> ColBERT-mtl-objective'],
    eval_metrics=[nDCG@10, AP(rel=2)@100, RR(rel=2)@10],
    filter_by_qrels=True,
)

Unnamed: 0,name,nDCG@10,AP(rel=2)@100,RR(rel=2)@10
0,ANN-ColBERT >> ColBERT,0.691909,0.391189,0.858527
1,ANN-ColBERT >> ColBERT-increase-cls-similarity,0.705811,0.400619,0.889535
2,ANN-ColBERT >> ColBERT-add-token-overlap,0.671373,0.383359,0.825581
3,ANN-ColBERT >> ColCondenser,0.680013,0.397421,0.837209
4,ANN-ColBERT >> ColTaCL,0.683721,0.379544,0.854651
5,ANN-ColBERT >> ColBERT-mtl-objective,0.640436,0.354623,0.862403


# **Evaluation on TREC 2020**

## **Loading model checkpoints**

### Baseline 

In [None]:
# Baseline re-ranker over the TREC-2020 candidate sets, reusing the factory
# built for the 2019 evaluation.
colbert_pipeline_ann_2020 = pt.transformer.SourceTransformer(colbert_2020) >>  colbert_factory.text_scorer()
colbert_pipeline_2020 = pt.transformer.SourceTransformer(dph_2020) >>  colbert_factory.text_scorer()

### RD3: Multi-Task Learning (MTL)   

In [None]:
# MTL re-ranker over the TREC-2020 candidate sets, reusing the factory built
# for the 2019 evaluation.
mtl_pipeline_ann_2020 = pt.transformer.SourceTransformer(colbert_2020) >>  mtl_factory.text_scorer()
mtl_pipeline_2020 = pt.transformer.SourceTransformer(dph_2020) >>  mtl_factory.text_scorer()

In [None]:
# Stand-alone check of the MTL re-ranker on the DPH candidates (TREC-2020).
# The run name now matches the pipeline being evaluated; it was previously
# labelled 'DPH >> ColBERT', which misattributed the scores to the baseline.
pt.Experiment(
    [mtl_pipeline_2020],
    pt.get_dataset('msmarco_passage').get_topics('test-2020'),
    pt.get_dataset('msmarco_passage').get_qrels('test-2020'),
    filter_by_qrels=True,
    names=['DPH >> ColBERT-mtl-objective'],
    eval_metrics= [nDCG@10, AP(rel=2)@100, RR(rel=2)@10]
)

## **Experiment**

### DPH(Sparse Retrieval) Re-ranking

In [None]:
# Compare the baseline and MTL re-rankers on the DPH candidates (TREC-2020),
# evaluating only the topics that have relevance judgements.
msmarco_passage = pt.get_dataset('msmarco_passage')
pt.Experiment(
    [colbert_pipeline_2020, mtl_pipeline_2020],
    msmarco_passage.get_topics('test-2020'),
    msmarco_passage.get_qrels('test-2020'),
    names=['DPH >> ColBERT', 'DPH >> ColBERT-mtl-objective'],
    eval_metrics=[nDCG@10, AP(rel=2)@100, RR(rel=2)@10],
    filter_by_qrels=True,
)

Unnamed: 0,name,nDCG@10,AP(rel=2)@100,RR(rel=2)@10
0,DPH >> ColBERT,0.657733,0.395494,0.835979
1,DPH >> ColBERT-increase-cls-similarity,0.668525,0.402908,0.853395
2,DPH >> ColBERT-add-token-overlap,0.655973,0.401742,0.833848
3,DPH >> ColCondenser,0.658674,0.393606,0.820216
4,DPH >> ColTaCL,0.663475,0.402527,0.822531
5,DPH >> ColBERT-mtl-objective,0.659804,0.382391,0.847737


### ColBERT(Dense Retrieval) Re-ranking

In [None]:
# Compare the baseline and MTL re-rankers on the ColBERT end-to-end candidates
# (TREC-2020), evaluating only the topics that have relevance judgements.
msmarco_passage = pt.get_dataset('msmarco_passage')
pt.Experiment(
    [colbert_pipeline_ann_2020, mtl_pipeline_ann_2020],
    msmarco_passage.get_topics('test-2020'),
    msmarco_passage.get_qrels('test-2020'),
    names=['ANN-ColBERT >> ColBERT', 'ANN-ColBERT >> ColBERT-mtl-objective'],
    eval_metrics=[nDCG@10, AP(rel=2)@100, RR(rel=2)@10],
    filter_by_qrels=True,
)

Unnamed: 0,name,nDCG@10,AP(rel=2)@100,RR(rel=2)@10
0,ANN-ColBERT >> ColBERT,0.693783,0.446686,0.842953
1,ANN-ColBERT >> ColBERT-increase-cls-similarity,0.702668,0.455801,0.856481
2,ANN-ColBERT >> ColBERT-add-token-overlap,0.671862,0.439425,0.803417
3,ANN-ColBERT >> ColCondenser,0.659519,0.434365,0.798148
4,ANN-ColBERT >> ColTaCL,0.669823,0.448219,0.83642
5,ANN-ColBERT >> ColBERT-mtl-objective,0.631162,0.39631,0.833848
