In [None]:
!pip install sentence-transformers

In [None]:

"""
This example shows how to train a Cross-Encoder on the SQuAD dataset.

The query and the passage are passed simultaneously to a Transformer network. The network then returns
a score between 0 and 1 indicating how relevant the passage is for the given query.

The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with ElasticSearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You sort the results then according to the output of the CrossEncoder.

This gives a significant boost compared to out-of-the-box ElasticSearch / BM25 ranking.

Running this script:
python train_cross-encoder.py
"""
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers import InputExample
from pathlib import Path
import torch
import logging
from datetime import datetime
import gzip
import pandas as pd
import os
import tarfile
from tqdm import tqdm
import json
import shutil
import csv

# Send INFO-level log records through sentence-transformers' LoggingHandler,
# prefixing every message with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Mount Google Drive at /gdrive (Colab-only import; this cell will not run outside Colab).
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

# Project folder on the mounted drive; the data and results directories are derived from it.
root = '/gdrive/MyDrive/Project 2/retrieve-rerank'

Mounted at /gdrive


In [None]:
# Pretrained checkpoint to fine-tune (alternative tried earlier: 'distilroberta-base').
model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

# Training hyper-parameters.
train_batch_size = 32
num_epochs = 1

# Timestamped local output folder for this run.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
safe_model_name = model_name.replace("/", "-")
model_save_path = 'output/training_squad_cross-encoder-' + safe_model_name + '-' + run_timestamp

# Persistent folders on the mounted drive.
result_dir = Path(root, 'results')
data_dir = Path(root, 'data')

# The task is binary relevance: given [query, passage], label 0 = irrelevant, 1 = relevant.
# Intended positive-to-negative ratio: for 1 positive sample (label 1), 4 negative samples (label 0).
# Negatives come from (qid, pos_id, neg_id) triples that are loaded in a later cell.
pos_neg_ration = 4

# Upper bound on the number of training samples.
# NOTE(review): stored as a float and not referenced by any visible cell - confirm it is still needed.
max_train_samples = 2e7

# num_labels=1 makes the cross-encoder emit a single continuous relevance score per pair.
model = CrossEncoder(model_name, num_labels=1, max_length=512)

# Local scratch folder that will hold the SQuAD files.
data_folder = 'squad-data'
os.makedirs(data_folder, exist_ok=True)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### loading corpus and questions

In [None]:
def load_dataset(path, url):
  """Load a SQuAD-style JSON file into flat lookup tables.

  If ``path`` does not exist yet, the file is first downloaded from ``url``.

  Args:
    path: Local path of the JSON file (download target when missing).
    url: Location to download the JSON file from when ``path`` is missing.

  Returns:
    A tuple ``(contexts, questions, golden_context_ids)``:
      * ``contexts``: ``str(context_id) -> paragraph text``
      * ``questions``: ``str(question_id) -> question text``
      * ``golden_context_ids``: ``str(question_id) -> context_id`` of the
        paragraph the question belongs to. Note that this value is an ``int``
        while the keys of ``contexts`` are strings.
    Ids are running counters assigned in file order, starting at 0.
  """
  contexts = dict()
  questions = dict()
  golden_context_ids = dict()

  if not os.path.exists(path):
    # Log the real source: this helper is used for the train and the dev split alike.
    logging.info("Download {} ...".format(url))
    util.http_get(url, path)

  with open(path, 'r', encoding='utf8') as json_file:
    json_object = json.load(json_file)

  context_id = 0
  question_id = 0

  for article in json_object['data']:
    for paragraph in article['paragraphs']:
      contexts[str(context_id)] = paragraph['context']

      for qa in paragraph['qas']:
        questions[str(question_id)] = qa['question']
        golden_context_ids[str(question_id)] = context_id
        question_id += 1

      context_id += 1

  return contexts, questions, golden_context_ids

# Download (when missing) and parse the SQuAD v1.1 train split.
train_set_path = os.path.join(data_folder, 'train-set')
train_set_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'
train_contexts, train_questions, train_golden_context_ids = load_dataset(train_set_path, train_set_url)

# Same for the dev split.
dev_set_path = os.path.join(data_folder, 'dev-set')
dev_set_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'
dev_contexts, dev_questions, dev_golden_context_ids = load_dataset(dev_set_path, dev_set_url)

  0%|          | 0.00/8.12M [00:00<?, ?B/s]

  0%|          | 0.00/1.05M [00:00<?, ?B/s]

In [None]:
# Preview only the first few contexts; echoing the whole dict floods the notebook output.
{context_id: dev_contexts[context_id] for context_id in list(dev_contexts)[:3]}

{'0': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 '1': 'The Panthers finished the regular season with a 15–1 record, and quarterback Cam Newton was named the NFL Most Valuable Player (MVP). They defeated the Arizona Cardinals 49–15 in the NFC Championship Game an

### Creating training & dev data (Without injection of BM25 scores)

In [None]:
### Build the training & dev samples (without BM25 injection)
train_samples = []
dev_samples = {}

# The first 200 distinct queries (in shuffled order) are held out for evaluation during training.
# Each held-out query keeps its positive passage(s) and at most 200 negative passages.
num_dev_queries = 200
num_max_dev_negatives = 200

train_filepath = os.path.join(data_folder, 'train_qidpidtriples.csv')

# Copy the triples file from the mounted drive into the local scratch folder on first use.
if not os.path.exists(train_filepath):
    logging.info("Copy into "+os.path.basename(train_filepath))
    shutil.copy(f'{data_dir}/train_qidpidtriples.csv', train_filepath)

train_df = pd.read_csv(train_filepath)

# Shuffle with a fixed seed so the held-out dev queries are the same on every run.
train_eval_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Pick the dev queries. Iterating the columns directly avoids a per-row label lookup.
for qid, pos_id, neg_id in zip(train_eval_df['qid'], train_eval_df['pos_id'], train_eval_df['neg_id']):
  qid, pos_id, neg_id = str(qid), str(pos_id), str(neg_id)

  if qid not in dev_samples and len(dev_samples) < num_dev_queries:
    dev_samples[qid] = {'query': train_questions[qid], 'positive': set(), 'negative': set()}

  if qid in dev_samples:
    dev_samples[qid]['positive'].add(train_contexts[pos_id])

    if len(dev_samples[qid]['negative']) < num_max_dev_negatives:
      dev_samples[qid]['negative'].add(train_contexts[neg_id])


# Build the training samples from every triple whose query was not held out.
count = 0

for qid, pos_id, neg_id in zip(train_df['qid'], train_df['pos_id'], train_df['neg_id']):
  qid, pos_id, neg_id = str(qid), str(pos_id), str(neg_id)

  # Held-out queries must not leak into the training set.
  if qid in dev_samples:
    continue

  question = train_questions[qid]

  # One positive sample followed by `pos_neg_ration` negative samples, matching the
  # documented 1:4 positive-to-negative ratio (the branches were previously swapped,
  # which produced 4 positives per negative).
  if (count % (pos_neg_ration+1)) == 0:
    passage = train_contexts[pos_id]
    label = 1
  else:
    passage = train_contexts[neg_id]
    label = 0

  train_samples.append(InputExample(texts=[question, passage], label=label))
  count += 1

# Evaluator used by the training cell. It is imported here, not at the top of the notebook,
# because the BM25-injection variant rebinds the same name to a modified class.
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

In [None]:
# Number of training examples currently held in train_samples.
len(train_samples)

87399

### Creating training & dev data (Injecting BM25 scores)

Copy the training set from google drive to the temporary Colab drive and preprocess the data.

In [None]:
train_filepath = os.path.join(data_folder, 'trainset_bm25score.csv')

# Copy the BM25-scored triples from the mounted drive into the local scratch folder on first use.
if not os.path.exists(train_filepath):
    logging.info("Copy into "+os.path.basename(train_filepath))
    shutil.copy(f'{data_dir}/trainset_bm25score.csv', train_filepath)

train_df = pd.read_csv(train_filepath)


# Rescale the BM25 scores from [global_min_bm25, global_max_bm25] to [0, 100].
# NOTE(review): the bounds are hard-coded rather than measured from the data, so a raw
# score above global_max_bm25 maps above 100 - confirm they cover the actual score range.
global_min_bm25 = 0
global_max_bm25 = 50
bm25_range = global_max_bm25 - global_min_bm25

for score_column in ('pos_score', 'neg_score'):
    train_df[score_column] = ((train_df[score_column] - global_min_bm25) / bm25_range) * 100

# Shuffle with a fixed seed so the held-out dev queries are the same on every run.
train_eval_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Inspect the BM25-scored triples frame.
train_df

In [None]:
### Build the training & dev samples with the BM25 score injected into the query text
train_samples = []
dev_samples = {}

# The first 200 distinct queries (in shuffled order) are held out for evaluation during training.
# Each held-out query keeps its positive passage(s) and at most 200 negative passages.
num_dev_queries = 200
num_max_dev_negatives = 200


def _inject_bm25(score, question):
  """Prefix the question with the (integer) BM25 score of the passage it is paired with."""
  return "{} [SEP] {}".format(score, question)


# Pick the dev queries. 'query' holds one score-prefixed query string per stored passage,
# appended in the same order in which the passages are appended.
# NOTE(review): the expected layout is dictated by CERerankingEvaluator_bm25cat, which is
# downloaded below and not visible here - confirm how it pairs 'query' entries with passages.
for qid, pos_id, neg_id, pos_score, neg_score in zip(
    train_eval_df['qid'], train_eval_df['pos_id'], train_eval_df['neg_id'],
    train_eval_df['pos_score'], train_eval_df['neg_score']):
  qid, pos_id, neg_id = str(qid), str(pos_id), str(neg_id)
  pos_score, neg_score = int(pos_score), int(neg_score)

  if qid not in dev_samples and len(dev_samples) < num_dev_queries:
    dev_samples[qid] = {'query': list(), 'positive': list(), 'negative': list()}

  if qid in dev_samples:
    dev_samples[qid]['positive'].append(train_contexts[pos_id])
    dev_samples[qid]['query'].append(_inject_bm25(pos_score, train_questions[qid]))

    if len(dev_samples[qid]['negative']) < num_max_dev_negatives:
      dev_samples[qid]['negative'].append(train_contexts[neg_id])
      dev_samples[qid]['query'].append(_inject_bm25(neg_score, train_questions[qid]))


# Build the training samples. Ids AND scores are read from the same row of train_df;
# previously the scores were taken from the shuffled train_eval_df, which attached
# another row's BM25 score to every training pair.
for qid, pos_id, neg_id, pos_score, neg_score in zip(
    train_df['qid'], train_df['pos_id'], train_df['neg_id'],
    train_df['pos_score'], train_df['neg_score']):
  qid, pos_id, neg_id = str(qid), str(pos_id), str(neg_id)
  pos_score, neg_score = int(pos_score), int(neg_score)

  # Held-out queries must not leak into the training set.
  if qid in dev_samples:
    continue

  question = train_questions[qid]
  pos_context = train_contexts[pos_id]
  neg_context = train_contexts[neg_id]

  train_samples.append(InputExample(texts=[_inject_bm25(pos_score, question), pos_context], label=1))
  train_samples.append(InputExample(texts=[_inject_bm25(neg_score, question), neg_context], label=0))

# Download CERerankingEvaluator_bm25cat, a modification of SBERT's CERerankingEvaluator class.
# SECURITY NOTE(review): this fetches Python code from a mutable branch URL and imports it
# unreviewed; consider vendoring the file or pinning the URL to a specific commit.
util.http_get('https://github.com/arian-askari/ms-marco-MiniLM-L-12-v3/raw/main/train/CERerankingEvaluator_bm25cat.py', os.path.join('./', 'CERerankingEvaluator_bm25cat.py'))
from CERerankingEvaluator_bm25cat import CERerankingEvaluator

  0%|          | 0.00/1.36k [00:00<?, ?B/s]

## Training

Train the model and save it to Google Drive.

In [None]:
# Wrap the training samples in a shuffling DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Evaluator that is run on the held-out dev queries every `evaluation_steps` training steps.
# NOTE(review): CERerankingEvaluator is a re-ranking evaluator, not a classification/F1
# evaluator as an earlier comment claimed - confirm the reported metric in its documentation.
evaluator = CERerankingEvaluator(dev_samples, name='train-eval')

# Configure the training
warmup_steps = 300
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=500,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          use_amp=True)

# Save the latest model as a single pickled file (the copy step below expects a file).
# Make sure the parent folder exists: nothing above guarantees that 'output/' was created.
latest_model_path = model_save_path+'-latest'
os.makedirs(os.path.dirname(latest_model_path), exist_ok=True)
torch.save(model, latest_model_path)

# Copy the model to the permanent drive under the first unused version number.
# NOTE(review): the 'noinjection' name is used even when the BM25-injection cells were run.
result_dir.mkdir(parents=True, exist_ok=True)

num_trial = 0
model_dir = result_dir / f'cross-encoder-noinjection-v_{num_trial}'

# exists() rather than is_file(), so an existing directory of that name is skipped as well.
while model_dir.exists():
    num_trial += 1
    model_dir = result_dir / f'cross-encoder-noinjection-v_{num_trial}'

shutil.copy(latest_model_path, model_dir)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2732 [00:00<?, ?it/s]

PosixPath('/gdrive/MyDrive/Project 2/retrieve-rerank/results/cross-encoder-noinjection-v_4')

## Testing

In [None]:
# Number of candidate passages the bi-encoder hands to the cross-encoder per query.
num_doc_retrieve = 32

# Retrieval stage: bi-encoder, with long passages truncated to 256 tokens.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256

# Re-ranking stage: the cross-encoder held in `model`.
cross_encoder = model

# Embed every dev passage once. Row i corresponds to dev_contexts[str(i)], because the
# context ids were assigned as a running counter in insertion order.
dev_passages = list(dev_contexts.values())
dev_context_embeds = bi_encoder.encode(dev_passages, show_progress_bar=True, convert_to_tensor=True)

Batches:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
def search(query, k):
  """Retrieve candidate passages with the bi-encoder and re-rank them with the cross-encoder.

  Args:
    query: The question text.
    k: Number of re-ranked context ids to return. At most ``num_doc_retrieve``
      candidates are retrieved, so no more than that many ids can be returned.

  Returns:
    List of context ids (the ``corpus_id`` values of the hits) sorted by
    descending cross-encoder score, truncated to ``k`` entries.
  """

  ### Retrieve ###
  # Encode the query with the bi-encoder and find potentially relevant passages.
  question_embed = bi_encoder.encode(query, convert_to_tensor=True)
  # Follow the device of the corpus embeddings instead of hard-coding CUDA,
  # so the search also works on CPU-only runtimes.
  question_embed = question_embed.to(dev_context_embeds.device)
  hits = util.semantic_search(question_embed, dev_context_embeds, top_k=num_doc_retrieve)
  hits = hits[0] # Get the hits for the first (and only) query

  ### Re-Ranking ###
  # Score all retrieved passages with the cross_encoder
  cross_input = [[query, dev_contexts[str(hit['corpus_id'])]] for hit in hits]
  cross_scores = cross_encoder.predict(cross_input)

  # Attach the cross-encoder score to each hit and sort by it
  for hit, cross_score in zip(hits, cross_scores):
    hit['cross-score'] = cross_score

  reranked = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)

  return [hit['corpus_id'] for hit in reranked[:k]]

Compute the top-k retrieval accuracy for k=5 and k=20.

In [None]:
def find_topk_acc(top_k):
  """Compute the top-k retrieval accuracy over all dev questions.

  A question counts as a success when the id of its golden context is among
  the ``top_k`` context ids returned by ``search``.

  Args:
    top_k: Number of re-ranked contexts to consider per question.

  Returns:
    Percentage (0-100) of questions retrieved successfully; 0.0 when there are
    no dev questions.
  """
  correct = 0

  for question_id in tqdm(dev_questions.keys(), mininterval = 3, desc ="Evaluating..."):
    question = dev_questions[question_id]
    golden_context_id = dev_golden_context_ids[question_id]
    top_k_context_ids = search(question, top_k)

    if golden_context_id in top_k_context_ids:
      correct += 1

  total = len(dev_questions)
  # Guard against an empty question set instead of raising ZeroDivisionError.
  topk_acc = (correct/total) * 100 if total else 0.0

  print(f"Successful retrievals: {correct}/{total}")

  return topk_acc

# Report the accuracy for each cut-off; every call re-runs the full retrieve + re-rank pass.
for k in (5, 20):
  print(f'Top-k retrieval accuracy with k={k}: {find_topk_acc(k)}')

Evaluating...: 100%|██████████| 10570/10570 [29:02<00:00,  6.07it/s]


Successful retrievals: 10025/10570
Top-k retrieval accuracy with k=5: 94.84389782403028


Evaluating...: 100%|██████████| 10570/10570 [28:55<00:00,  6.09it/s]

Successful retrievals: 10134/10570
Top-k retrieval accuracy with k=20: 95.87511825922422





[Optional] Run the following code to see how well our retrieve-rerank method does given a query and k value.

In [None]:
# Dev question to inspect and the number of re-ranked contexts to show.
question_id = '766'
top_k = 5

question = dev_questions[question_id]
# The golden context id of this question is available as dev_golden_context_ids[question_id].
top_k_context_id = search(question, top_k)


print(f"Query: {question}")
print(f"Top-k context id: {top_k_context_id}")
print("Top-k context:")

# `rank` instead of `id`: a module-level `id` would shadow the builtin for the rest of the session.
for rank, context_id in enumerate(top_k_context_id):
  print(f'{rank}. {dev_contexts[str(context_id)]}')

Query: On what yard line did Carolina begin with 4:51 left in the game?
Top-k context id: [51, 49, 47, 48, 46]
Top-k context:
0. With 4:51 left in regulation, Carolina got the ball on their own 24-yard line with a chance to mount a game-winning drive, and soon faced 3rd-and-9. On the next play, Miller stripped the ball away from Newton, and after several players dove for it, it took a long bounce backwards and was recovered by Ward, who returned it five yards to the Panthers 4-yard line. Although several players dove into the pile to attempt to recover it, Newton did not and his lack of aggression later earned him heavy criticism. Meanwhile, Denver's offense was kept out of the end zone for three plays, but a holding penalty on cornerback Josh Norman gave the Broncos a new set of downs. Then Anderson scored on a 2-yard touchdown run and Manning completed a pass to Bennie Fowler for a 2-point conversion, giving Denver a 24–10 lead with 3:08 left and essentially putting the game away. Ca