### **Install packages and download the 2020 Deep Learning Track data**

In [1]:
# System package installed ahead of faiss-gpu.
!apt install libomp-dev
# Install (or upgrade to) the latest faiss-gpu wheel.
!pip install faiss-gpu --upgrade
# Stream the passage collection archive and unpack it into the working directory
# (later cells read ./collection.tsv).
!wget -c https://www.dropbox.com/s/m1n2wf80l1lb9j1/collection.tar.gz?dl=1 -O - | tar -xz
# TREC 2020 Deep Learning Track topics (queries) from the Anserini repository.
!wget https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/topics.dl20.txt
# TREC 2020 Deep Learning Track passage relevance judgments (qrels) from the Anserini repository.
!wget https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.dl20-passage.txt
# pygaggle installed from a fork; presumably the version modified to also
# return the pairwise scores -- TODO confirm.
!pip install git+https://github.com/pedrogengo/pygaggle.git
# Project repository; the input run file is read from its results/ folder.
!git clone https://github.com/leobavila/ia376e_projeto_final.git

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp5
Suggested packages:
  libomp-doc
The following NEW packages will be installed:
  libomp-dev libomp5
0 upgraded, 2 newly installed, 0 to remove and 37 not upgraded.
Need to get 239 kB of archives.
After this operation, 804 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp5 amd64 5.0.1-1 [234 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libomp-dev amd64 5.0.1-1 [5,088 B]
Fetched 239 kB in 1s (228 kB/s)
Selecting previously unselected package libomp5:amd64.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../libomp5_5.0.1-1_amd64.deb ...
Unpacking libomp5:amd64 (5.0.1-1) ...
Selecting previously unselected package libomp-dev.
Preparing to unpack .../libomp-dev_5.0.1-1_amd64.deb ...
Unpacking libomp-dev (5.0.1-

### **Mount Google Drive**

In [2]:
# Mount Google Drive at /content/drive so that result files can be written to it.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### **Import packages**

In [3]:
import argparse
import collections
import torch
import json
import pandas as pd
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import DuoT5
from transformers import T5ForConditionalGeneration
from tqdm import tqdm
from typing import List
import time

2021-12-08 21:01:23 [INFO] loader: Loading faiss with AVX2 support.
2021-12-08 21:01:23 [INFO] loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2021-12-08 21:01:23 [INFO] loader: Loading faiss.
2021-12-08 21:01:23 [INFO] loader: Successfully loaded faiss.


### **Reranking**

Code to rerank the monoT5 output using a duoT5 model. The pygaggle code that runs it was modified so that we can also obtain the pairwise_scores during the process.

The original pygaggle code uses a sym_sum aggregation function to calculate the pointwise_scores.

The pairwise_scores are important because they let us quickly test many different aggregation functions.

In [4]:
# Setting parameters
# num_rerank: how many of the top-ranked candidates per query are passed to the reranker.
# run_to_execute: which half of the queries to process ("part1" = first half,
# "part2" = second half); it is also embedded in the output file names.
num_rerank = 300
run_to_execute = "part1"
#run_to_execute = "part2"

# Dataset path
# collection_tsv: tab-separated passage collection (doc id, passage text).
# topics: tab-separated queries (query id, query text).
# input_run: first-stage run file whose candidates are going to be reranked.
collection_tsv = './collection.tsv'
topics = './topics.dl20.txt'
input_run = './ia376e_projeto_final/results/base.dl20.p.dTq.rm3.mono.trec'

# Output path
# Relative to the current working directory; assumes Google Drive is reachable
# through ./drive (i.e. the notebook runs from /content) -- TODO confirm.
output_run = f'./drive/My Drive/Colab Notebooks/base.dl20.p.dTq.rm3.duo.{num_rerank}_{run_to_execute}.trec'

# Getting device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
def load_run(path):
    """Load a run file into a mapping of query id -> ranked candidate doc ids.

    Every non-blank line must hold six whitespace-separated fields, of which
    the first (query id), third (doc id) and fourth (rank) are used. Blank or
    whitespace-only lines (e.g. a trailing empty line) are skipped.

    Args:
        path: Path of the run file to read.

    Returns:
        A ``collections.OrderedDict`` keyed by query id, in the order in which
        the queries first appear in the file. Each value is the list of doc
        ids of that query sorted by ascending rank.

    Raises:
        ValueError: If a non-blank line does not have exactly six fields or
            its rank field is not an integer.
    """
    # The order of the queries in the file is preserved on purpose, so the
    # result can be paired with other artifacts produced in the same order.
    print('Loading run...')
    run = collections.OrderedDict()
    with open(path) as f:
        for line in tqdm(f):
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse (unpacking it would raise).
                continue
            query_id, _, doc_title, rank, _, _ = fields
            run.setdefault(query_id, []).append((doc_title, int(rank)))

    # Sort candidate docs by rank and keep only the doc ids.
    print('Sorting candidate docs by rank...')
    sorted_run = collections.OrderedDict()
    for query_id, doc_titles_ranks in tqdm(run.items()):
        doc_titles_ranks.sort(key=lambda pair: pair[1])
        sorted_run[query_id] = [doc_title for doc_title, _ in doc_titles_ranks]

    return sorted_run

In [6]:
# Fetch the pretrained duoT5 checkpoint, move it to the selected device and
# switch it to evaluation mode before wrapping it in the pygaggle reranker.
model = T5ForConditionalGeneration.from_pretrained('castorini/duot5-base-msmarco')
model = model.to(device)
model = model.eval()
reranker = DuoT5(model=model)
print(f'Running on {device}')

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Running on cuda


In [7]:
# Load the queries into a dictionary: query id -> query text.
# The file is read as UTF-8 explicitly, so the result does not depend on the
# locale of the machine running the notebook.
queries = {}
with open(topics, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            continue
        # Split on the first tab only, so a tab inside the text cannot break
        # the two-value unpacking. A line without any tab still raises ValueError.
        query_id, query_text = line.split('\t', 1)
        queries[query_id] = query_text

In [8]:
# Load the collection into a dictionary: doc id -> passage text.
# The file is read as UTF-8 explicitly, so the result does not depend on the
# locale of the machine running the notebook.
collection = {}
with open(collection_tsv, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line) instead of crashing.
            continue
        # Split on the first tab only, so a tab inside the passage cannot break
        # the two-value unpacking. A line without any tab still raises ValueError.
        doc_id, doc_text = line.split('\t', 1)
        collection[doc_id] = doc_text

In [9]:
# Read the monoT5 run: for every query id, the candidate doc ids ordered by rank.
run = load_run(input_run)

54000it [00:00, 793082.33it/s]
100%|██████████| 54/54 [00:00<00:00, 6734.03it/s]

Loading run...
Sorting candidate docs by rank...





In [10]:
# Split the run, in file order, into two OrderedDicts: the first len(run)//2
# queries go to run1 and the remaining ones to run2 (27 and 27 for 54 queries).
half = len(run) // 2
run1 = collections.OrderedDict()
run2 = collections.OrderedDict()

for position, (query_id, doc_ids) in enumerate(run.items()):
    target = run1 if position < half else run2
    target[query_id] = doc_ids

In [11]:
# Pick the half of the queries to process in this session (Colab Pro running
# time limits do not allow all of them at once). Any value other than "part1"
# or "part2" leaves `run` as it is.
run = {"part1": run1, "part2": run2}.get(run_to_execute, run)
print(run_to_execute)

part1


In [12]:
# Rerank the top `num_rerank` candidates of every selected query with duoT5.
# The aggregated scores are written to `output_run` in TREC run format, and the
# pairwise scores of each query are kept in `query_scores`.
query_scores = {}
with open(output_run, 'w') as fout:
    for query_id, candidate_ids in tqdm(run.items(), total=len(run)):
        query = Query(queries[query_id])

        # Only the best-ranked candidates are handed to the reranker.
        texts = [
            Text(collection[doc_id], {'docid': doc_id}, 0)
            for doc_id in candidate_ids[:num_rerank]
        ]

        agg_scores, pairwise_scores = reranker.rescore(query, texts)
        query_scores[query_id] = pairwise_scores

        # Highest aggregated score first; ranks in the output start at 1.
        reranked = list(agg_scores)
        reranked.sort(key=lambda scored: scored.score, reverse=True)
        for rank, doc in enumerate(reranked, start=1):
            fout.write(f'{query_id} Q0 {doc.metadata["docid"]} {rank} {doc.score} duo\n')
print('Done!')

100%|██████████| 27/27 [04:39<00:00, 10.36s/it]

Done!





In [13]:
# Create the JSON file containing the pairwise_scores of every processed query.
# The file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle behind.
pairwise_scores_path = f'./drive/My Drive/Colab Notebooks/output_duot5_pairwise_scores_{num_rerank}_{run_to_execute}.json'
with open(pairwise_scores_path, 'w') as scores_file:
    json.dump(query_scores, scores_file)

In [14]:
# The results calculated using the sym_sum aggregation function are in TREC run format,
# one line per (query, document) pair:
# query_id, Q0, doc_id, rank, score, run tag ("duo")
!head -n 10 "./drive/My Drive/Colab Notebooks/base.dl20.p.dTq.rm3.duo.{num_rerank}_{run_to_execute}.trec"

23849 Q0 2647769 1 57.23760701843639 duo
23849 Q0 8010561 2 56.05483990375433 duo
23849 Q0 1944730 3 49.411186351208016 duo
23849 Q0 5554704 4 46.262890528465505 duo
23849 Q0 188190 5 42.54471337364521 duo
23849 Q0 7119957 6 39.256772236425604 duo
23849 Q0 3878669 7 38.94966362317791 duo
23849 Q0 8059826 8 37.859609535487834 duo
23849 Q0 2017213 9 35.78464242769405 duo
23849 Q0 6667419 10 33.69924053696013 duo
