In [None]:
!pip install torch==1.13.0+cu116 -f https://download.pytorch.org/whl/torch_stable.html
!pip install pytorch-pretrained-bert
!pip install livelossplot
!pip install nvidia-ml-py3
!pip install unidecode
!pip install pandas
!pip install ir_datasets

!pip install ipywidgets==7.* --user
!pip install widgetsnbextension jupyter_contrib_nbextensions --user
!jupyter contrib nbextension install --user
!jupyter nbextension enable --py widgetsnbextension

In [1]:
import logging
import os
from os.path import exists

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification

from util.bert import *
from util.evaluation import *

In [2]:
# --- Directories (all paths below are built as `<dir> + <filename>`) ---
model_dir = 'model/'
msmarco_dir = 'data/msmarco_doc/'
output_dir = 'data/output/'

# --- MS MARCO document ranking files, located in `msmarco_dir` ---
collection_filename = 'docs-dev.tsv'
queries_filename = 'msmarco-docdev-queries.tsv'
top100s_filename = 'msmarco-docdev-top100'
qrels_filename = 'msmarco-docdev-qrels.tsv'

# --- Query id lists for the validation and test splits ---
query_ids_validation_filename = 'query-ids-dev-validation.tsv'
query_ids_test_filename = 'query-ids-dev-test.tsv'

# Only candidates whose initial rank is at most this value are kept for re-ranking
top_n = 100

# Set up

## Download files

In [None]:
import urllib.request
import gzip
import shutil

# Download and unpack the MS MARCO files listed in `file_names`.
# The cell is safe to re-run: files that are already present are skipped.

# Reuse the configured data directory instead of repeating the literal path,
# and create it up front because urlretrieve does not create parent directories.
folder = msmarco_dir
os.makedirs(folder, exist_ok=True)

file_names = [queries_filename, top100s_filename, qrels_filename]

for file_name in file_names:
    file_path = folder + file_name
    file_name_gz = file_name + '.gz'
    file_path_gz = folder + file_name_gz

    # Nothing to do once the extracted file exists (avoids re-downloading an
    # archive that was deleted after extraction).
    if exists(file_path):
        continue

    if not exists(file_path_gz):
        print('Downloading ' + file_name_gz + ' ...')
        url = 'https://msmarco.blob.core.windows.net/msmarcoranking/' + file_name_gz
        # Download to a temporary name and rename on success, so an interrupted
        # download is never mistaken for a complete archive on the next run.
        file_path_gz_part = file_path_gz + '.part'
        urllib.request.urlretrieve(url, file_path_gz_part)
        os.replace(file_path_gz_part, file_path_gz)

    print('Extracting ' + file_name_gz + ' ...')
    # Same temporary-name approach for the extraction step.
    file_path_part = file_path + '.part'
    with gzip.open(file_path_gz, 'rb') as file_gz, open(file_path_part, 'wb') as file:
        shutil.copyfileobj(file_gz, file)
    os.replace(file_path_part, file_path)

## Generate document collection
**Note:** this cell must be executed in Google Colab.

In [54]:
import ir_datasets

# Build the reduced document collection: the bodies of every document that
# appears in the top-100 list of a validation or test query.

# The dataset is loaded only to obtain a document store for look-ups by doc id
dataset = ir_datasets.load("msmarco-document/trec-dl-2019")
doc_store = dataset.docs_store()

# Load the validation and test query ids, and the top 100s
query_ids_validation = pd.read_csv(
    msmarco_dir + query_ids_validation_filename,
    delimiter=' ', encoding='utf-8', header=None,
    names=['query_id']
)
query_ids_test = pd.read_csv(
    msmarco_dir + query_ids_test_filename,
    delimiter=' ', encoding='utf-8', header=None,
    names=['query_id']
)
top100s = pd.read_csv(
    msmarco_dir + top100s_filename,
    delimiter=' ', encoding='utf-8', header=None,
    names=['query_id', 'Q0', 'doc_id', 'initial_rank', 'score', 'run']
)[['query_id', 'doc_id']]

# Select the top 100s of the queries that are present in the test or validation dataset
top100s_filtered = pd.concat([query_ids_validation, query_ids_test]).merge(top100s, how='left', on=['query_id'])

# The left merge yields a NaN doc_id for any query without top-100 entries;
# drop those so that only real document ids reach the doc store look-up below.
doc_ids = np.unique(top100s_filtered['doc_id'].dropna().tolist())

# Create a new dataframe for the (sorted, de-duplicated) documents in the selected top 100s
docs = pd.DataFrame(list(doc_ids), columns=['doc_id'])

# Fetch the body of every document
docs['body'] = docs['doc_id'].apply(lambda doc_id: doc_store.get(doc_id).body)

# Save the document collection as a header-less, tab-separated (doc_id, body) file
docs.to_csv(msmarco_dir + collection_filename, sep="\t", header=False, index=False)

## Convert model to PyTorch

In [None]:
!pip install tensorflow
!python util/bert_convert_tensorflow_to_pytorch.py --tf_checkpoint_path=./model/BERT_Base_trained_on_MSMARCO/model.ckpt-100000 --bert_config_file=./model/BERT_Base_trained_on_MSMARCO/bert_config.json --pytorch_dump_path=./model/BERT_Base_trained_on_MSMARCO/pytorch.bin

# Experiment

In [4]:
def tokenize_split_length(tokenizer, query, body):
    """Split `body` for `query` via `split_doc` with its default settings.

    Thin adapter that exposes the `(tokenizer, query, body)` call signature
    shared by every entry of `split_methods`. Presumably the default is a
    length-based split -- confirm against `split_doc` in `util.bert`.

    Returns whatever `split_doc(query, body, tokenizer)` returns.
    """
    passages = split_doc(query, body, tokenizer)
    return passages

def tokenize_split_period(tokenizer, query, body):
    """Split `body` for `query` via `split_doc` with `at_period=True`.

    Thin adapter that exposes the `(tokenizer, query, body)` call signature
    shared by every entry of `split_methods`. Presumably `at_period` makes
    the split respect sentence boundaries -- confirm against `split_doc`
    in `util.bert`.

    Returns whatever `split_doc(query, body, tokenizer, at_period=True)` returns.
    """
    passages = split_doc(query, body, tokenizer, at_period=True)
    return passages

# Registry of document splitting strategies: the key is used in the output
# file names, the value is called as `fn(tokenizer, query, body)`.
split_methods = dict(
    length=tokenize_split_length,
    period=tokenize_split_period,
)

# Number of query-id chunks; each chunk is written to its own run file
n_chunks = 10

# Which split to process: True -> validation query ids, False -> test query ids
validation = True

## Load data

In [3]:
# Document collection: header-less, tab-separated (doc_id, body) rows
docs = pd.read_csv(
    msmarco_dir + collection_filename,
    names=['doc_id', 'body'],
    header=None,
    delimiter='\t',
    encoding='utf-8',
)

# Queries: header-less, tab-separated (query_id, query) rows
queries = pd.read_csv(
    msmarco_dir + queries_filename,
    names=['query_id', 'query'],
    header=None,
    delimiter='\t',
    encoding='utf-8',
)

# Initial ranking: header-less, space-separated run file with six columns,
# of which only the query id, document id and initial rank are kept
top100s = pd.read_csv(
    msmarco_dir + top100s_filename,
    names=['query_id', 'Q0', 'doc_id', 'initial_rank', 'score', 'run'],
    header=None,
    delimiter=' ',
    encoding='utf-8',
)
top100s = top100s[['query_id', 'doc_id', 'initial_rank']]

## Preprocess

In [None]:
# Registers `progress_apply` on pandas objects
tqdm.pandas()

tqdm.write('General preprocessing...')

# Query IDs of the selected split (validation or test)
query_ids_filename = query_ids_validation_filename if validation else query_ids_test_filename
data = pd.read_csv(msmarco_dir + query_ids_filename, delimiter=' ', encoding='utf-8', header=None)
data.columns = ['query_id']

# Queries, top 100s (restricted to the first `top_n` initial ranks)
data = data.merge(top100s[top100s['initial_rank'] <= top_n], how='left', on=['query_id'])
data = data.merge(queries, how='left', on=['query_id'])
# The left merges leave NaN where no query text was found; NaN is a float and
# has no `.lower()`, so treat missing text as an empty string.
data['query'] = data['query'].fillna('').progress_apply(lambda x: remove_non_alphanumeric(x.lower()))

# Docs
data = data.merge(docs, how='left', on=['doc_id'])
# Same guard for bodies: an empty body is read back from the TSV as NaN, and
# unmatched doc ids also produce NaN.
data['body'] = data['body'].fillna('').progress_apply(lambda x: remove_non_alphanumeric(x.lower(), keep_periods=True))

## Model

In [None]:
# Weights file produced by the "Convert model to PyTorch" step
checkpoint_path = model_dir + 'BERT_Base_trained_on_MSMARCO/pytorch.bin'

# Start from the bert-base-uncased architecture with 2 output labels, then
# overwrite its parameters with the converted checkpoint
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', 2)
state_dict = torch.load(checkpoint_path)
model.load_state_dict(state_dict)

# Inference only: switch to eval mode and move the model to the GPU
model.eval()
model.to('cuda')

## Tokenize + run

In [None]:
# For every splitting method and every chunk of query ids: split the documents
# into model inputs, run the model, and store one (query_id, doc_id, score) row
# per input in a run file. Chunks whose run file already exists are skipped,
# so an interrupted run can be resumed by re-executing the cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
logging.getLogger('pytorch_pretrained_bert').setLevel(logging.ERROR)

# Create the output directory up front so the expensive work below cannot be
# lost to a failing write at the very end.
os.makedirs(output_dir, exist_ok=True)

query_ids = list(np.unique(data['query_id'].tolist()))
query_id_chunks = list(split_chunks(query_ids, n_chunks))

# Tokenization
for method, tokenize in split_methods.items():
    for i, query_id_chunk in enumerate(query_id_chunks):
        file = output_dir + 'run.bert.{}.{}.tsv-{}-of-{}'.format(method, 'val' if validation else 'test', i + 1, n_chunks)
        if exists(file):
            continue

        tqdm.write('Tokenization method \'{}\', chunk {}/{}'.format(method, i + 1, n_chunks))

        data_chunk = data[data['query_id'].isin(query_id_chunk)].copy()

        tqdm.write('Tokenizing ...')
        data_chunk['input'] = data_chunk.progress_apply(lambda row: tokenize(tokenizer, row['query'], row['body']), axis=1)
        # One row per produced input instead of one row per document
        data_chunk = data_chunk.explode('input')

        tqdm.write('Running ...')
        # Inference only: make sure no autograd graph is recorded for the
        # outputs (harmless if run_bert already disables gradients itself).
        with torch.no_grad():
            run_bert(model, data_chunk)

        tqdm.write('Scoring ...')
        # Assumes run_bert stored a tensor in the 'output' column; element [0][1] is used as the score
        data_chunk['score'] = data_chunk.progress_apply(lambda row: row['output'].data[0][1].item(), axis=1)

        # Write to a temporary name and rename on success: the `exists(file)`
        # check above must never see a partially written run file.
        file_tmp = file + '.tmp'
        data_chunk[['query_id', 'doc_id', 'score']].to_csv(file_tmp, sep="\t", header=False, index=False)
        os.replace(file_tmp, file)

## Evaluate

In [None]:
# Collects one column per (splitting method, aggregation) pair plus the metric names
evaluation = pd.DataFrame()

for method in split_methods:
    # Read back the per-chunk run files of this method, in chunk order
    chunk_frames = [
        pd.read_csv(
            output_dir + 'run.bert.{}.{}.tsv-{}-of-{}'.format(method, 'val' if validation else 'test', chunk_no, n_chunks),
            delimiter='\t',
            encoding='utf-8',
            header=None,
            names=['query_id', 'doc_id', 'score'],
        )
        for chunk_no in range(1, n_chunks + 1)
    ]

    # Merge them into a single space-separated file that is the input of the aggregation
    input_path = output_dir + '{}.tsv'.format(method)
    df = pd.concat(chunk_frames, ignore_index=True)
    df.to_csv(input_path, sep=' ', header=False, index=False)

    for agg in ('sum', 'max', 'first'):
        output_path = output_dir + 'BERT-{}-{}-ranking.txt'.format(method, agg)
        # Normalization is requested for the 'sum' aggregation only
        use_normalization = (agg == 'sum')
        aggregate_results(input_path, output_path, agg, normalize=use_normalization)

        # Score the aggregated ranking against the relevance judgements
        results = evaluate(msmarco_dir + qrels_filename, output_path)
        evaluation['metric'] = results.keys()
        evaluation['{}-{}'.format(method, agg)] = results.values()

print(evaluation)