In [None]:
!git clone https://github.com/NThakur20/DeepCT.git
!pip install beir

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably done so the trained DeepCT checkpoint can be read
# from Drive — confirm, since checkpoint_dir below is a relative path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# BEFORE RUNNING THIS, OPEN DeepCT/deepct/modeling.py AND CHANGE LINE 339 TO: assignment_map[name] = name_to_variable[name]
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

from DeepCT.deepct import run_deepct                            # git clone https://github.com/NThakur20/DeepCT.git

from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.generation.models import QGenModel
from tqdm.autonotebook import trange

import pathlib, os, json
import logging
import requests
import random

In [4]:
# Send INFO-level (and above) log records through BEIR's LoggingHandler,
# prefixing every message with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [None]:
# Name of the BEIR dataset and the local folder the archive is unpacked into.
dataset = "nfcorpus"
out_dir = "datasets"

# Download and unzip the dataset archive; data_path is the location returned
# by the helper and is reused later to locate corpus.jsonl.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, out_dir)

# Load the corpus, queries and relevance judgements of the test split.
corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")

In [None]:
# Download and unzip the BERT-base (uncased) release into ./models.
# bert_base_dir is used later to locate vocab.txt and bert_config.json.
# Note: out_dir is re-bound here (it held "datasets" in the dataset cell).
out_dir = "models"
base_model_url = "https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip"
bert_base_dir = util.download_and_unzip(base_model_url, out_dir)

In [7]:
# Directory that serves two purposes in the prediction cell: it is where the
# DeepCT checkpoint (model.ckpt-*) is looked up, and it is the output directory
# that receives deepct.jsonl.
checkpoint_dir = "DeepCT_Trained"

In [None]:
# The notebook kernel is launched with a "-f <connection file>" argument.
# Registering a dummy string flag named 'f' keeps the flag parser used by
# run_deepct from rejecting it. The membership check makes the cell safe to
# re-run: defining the same flag twice raises a duplicate-flag error.
if 'f' not in run_deepct.FLAGS:
    run_deepct.flags.DEFINE_string('f', '', 'kernel')

In [None]:
def _latest_checkpoint_prefix(model_dir, prefix="model.ckpt-"):
    """Return the path prefix of the highest-numbered checkpoint in ``model_dir``.

    Scans ``model_dir`` for files named ``<prefix><step>.index`` and returns
    ``<model_dir>/<prefix><step>`` for the largest integer ``step``. This
    replaces the need to hand-edit the checkpoint number.

    Args:
        model_dir: Directory that contains the checkpoint files.
        prefix: File-name prefix that precedes the step number.

    Returns:
        The checkpoint path prefix (without the ``.index`` extension).

    Raises:
        FileNotFoundError: If ``model_dir`` does not exist or holds no
            ``<prefix><step>.index`` file.
    """
    suffix = ".index"
    steps = [
        int(name[len(prefix):-len(suffix)])
        for name in os.listdir(model_dir)
        if name.startswith(prefix)
        and name.endswith(suffix)
        and name[len(prefix):-len(suffix)].isdecimal()
    ]
    if not steps:
        raise FileNotFoundError(
            "No '{}<step>{}' file found in '{}'. Place the trained DeepCT "
            "checkpoint files there first.".format(prefix, suffix, model_dir)
        )
    return os.path.join(model_dir, prefix + str(max(steps)))


# Skip the (expensive) prediction run when the output file already exists.
if not os.path.isfile(os.path.join(checkpoint_dir, "deepct.jsonl")):
    ################################
    #### Command-Line Arguments ####
    ################################
    run_deepct.FLAGS.task_name = "beir"                                              # Separate BEIR task defined in DeepCT. Check out run_deepct.
    run_deepct.FLAGS.do_train = False                                                # We only want to use the code for inference.
    run_deepct.FLAGS.do_eval = False                                                 # No evaluation.
    run_deepct.FLAGS.do_predict = True                                               # True, as the DeepCT model is used for prediction only.
    run_deepct.FLAGS.data_dir = os.path.join(data_path, "corpus.jsonl")              # Path to the original corpus data, in BEIR format.
    run_deepct.FLAGS.vocab_file = os.path.join(bert_base_dir, "vocab.txt")           # bert-base-uncased model vocabulary.
    run_deepct.FLAGS.bert_config_file = os.path.join(bert_base_dir, "bert_config.json")  # bert-base-uncased config file.
    run_deepct.FLAGS.init_checkpoint = _latest_checkpoint_prefix(checkpoint_dir)     # DeepCT checkpoint; the highest step found in checkpoint_dir is used.
    run_deepct.FLAGS.max_seq_length = 350                                            # Max sequence length used for consideration. (Max: 512)
    run_deepct.FLAGS.train_batch_size = 128                                          # Inference batch size: larger needs more memory but is faster.
    run_deepct.FLAGS.output_dir = checkpoint_dir                                     # Output directory; will contain deepct.jsonl (output file) and predict.tf_record.
    run_deepct.FLAGS.output_file = "deepct.jsonl"                                    # Output file storing the final DeepCT-produced corpus.
    run_deepct.FLAGS.m = 100                                                         # Scaling parameter for DeepCT weights: must be > 0, 100 recommended.
    run_deepct.FLAGS.smoothing = "sqrt"                                              # Use sqrt to smooth weights. The DeepCT paper uses None.
    run_deepct.FLAGS.keep_all_terms = True                                           # Do not allow DeepCT to delete terms.

    # Run the DeepCT model on corpus.jsonl.
    run_deepct.main()

This notebook covers the steps of the DeepCT evaluation from HuggingFace up to the part where Docker is used (https://github.com/beir-cellar/beir/blob/main/examples/retrieval/evaluation/sparse/evaluate_deepct.py). The rest of the evaluation cannot be done in a notebook; it has to be done on a machine where Docker can be installed. The deepct.jsonl file created with this notebook can then be used for the evaluation with Docker.

If Docker runs out of Java heap space, add `-e JAVA_TOOL_OPTIONS="-Xmx12800m"` to the `docker run` command.