# 1. Environment Setup

In [None]:
!pip install rank_bm25



In [None]:
import os
from pathlib import Path
import json
import csv
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize
import torch
from tqdm import tqdm


nltk.download('punkt')

dataset_dir = Path('squad')
if not dataset_dir.is_dir():
  !mkdir squad
  !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O squad/train-v1.1.json
  !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O squad/dev-v1.1.json

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


--2023-12-02 06:33:10--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [application/json]
Saving to: ‘squad/train-v1.1.json’


2023-12-02 06:33:11 (161 MB/s) - ‘squad/train-v1.1.json’ saved [30288272/30288272]

--2023-12-02 06:33:11--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘squad/dev-v1.1.json’


2023-12-02 06:33:12 (88.1 MB/s) - ‘squad/dev-v1.1.json’ saved [4854279/4854279]



In [None]:
# Colab only: mount Google Drive at /gdrive.
from google.colab import drive
drive.mount('/gdrive')

# Folder on the mounted drive; the create_new_dataset call further down
# writes its CSV under this path.
root = '/gdrive/MyDrive/Project 2/retrieve-rerank'

Mounted at /gdrive


# 2. Data Processing

In [None]:
def convert(inputfile, outputfile):
    """Flatten a SQuAD v1.1 JSON file into a one-row-per-question CSV.

    Every row holds ``id, question, title, context_id, context``.
    ``context_id`` is the position of the question's paragraph in the
    returned list, so the list can be indexed with the ids stored in the CSV.

    Args:
        inputfile: Path of a SQuAD-format JSON file
            (``data`` -> ``paragraphs`` -> ``qas``).
        outputfile: Path of the CSV file to write (overwritten if present).

    Returns:
        list: Paragraph texts in ``context_id`` order. Consecutive paragraphs
        with identical text share one id, and paragraphs that have no
        questions are not included. Empty input yields an empty list and a
        header-only CSV.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default;
    # the CSV below is written as UTF-8 as well.
    with open(inputfile, 'r', encoding='utf-8') as json_file:
        json_object = json.load(json_file)

    header = ['id', 'question', 'title', 'context_id', 'context']
    contexts = []

    with open(outputfile, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)

        for article in json_object['data']:
            for paragraph in article['paragraphs']:
                current_context = paragraph['context']
                for qa in paragraph['qas']:
                    # A paragraph is registered when its first question is
                    # seen; the remaining questions reuse the same id.
                    if not contexts or contexts[-1] != current_context:
                        contexts.append(current_context)
                    context_id = len(contexts) - 1

                    writer.writerow([
                        qa['id'],
                        qa['question'],
                        article['title'],
                        context_id,
                        current_context,
                    ])
    return contexts

# Flatten each SQuAD split into a CSV. convert() also returns that split's
# paragraph texts, ordered so that list position == context_id in its CSV.
# Ids are numbered per split: train and dev both start again at 0.
train_contexts = convert(f'{dataset_dir}/train-v1.1.json', f'{dataset_dir}/train.csv')
dev_contexts = convert(f'{dataset_dir}/dev-v1.1.json', f'{dataset_dir}/dev.csv')

# One row per question with columns id, question, title, context_id, context.
train_df = pd.read_csv(f'{dataset_dir}/train.csv')
dev_df = pd.read_csv(f'{dataset_dir}/dev.csv')

# 3. Applying BM25 on SQuAD

Tokenize the training contexts (one entry per unique paragraph, in `context_id` order) and initialize a BM25 index over the tokenized corpus.

In [None]:
# Index the training paragraphs only: position i in the corpus is the
# paragraph that train_df labels with context_id == i.
# NOTE: dev_df's context_id values refer to dev_contexts instead, so they do
# not correspond to positions in this index.

tokenized_corpus = [word_tokenize(doc) for doc in train_contexts]
bm25 = BM25Okapi(tokenized_corpus)

Find the top-k retrieval accuracy using the dev set.

In [None]:
def find_topk_acc(df, k, retriever=None):
  """
    Compute the top-k retrieval accuracy of a BM25 index.

    Inputs:
      df: dataframe with a "question" column and a "context_id" column holding
          the corpus position of each question's gold context
      k: number of highest-scoring contexts that count as retrieved
      retriever: BM25 index whose corpus positions match df["context_id"];
          defaults to the notebook-level `bm25` (built from the train contexts)
    Outputs: top-k accuracy in percent (0.0 for an empty dataframe)
  """
  scorer = bm25 if retriever is None else retriever
  success_num = 0
  pairs = zip(df["question"], df["context_id"])
  for query, gold_context_id in tqdm(pairs, total=len(df), mininterval = 3, desc ="Evaluating..."):
    tokenized_query = word_tokenize(query)
    # Get relevance score for each context
    context_scores = torch.tensor(scorer.get_scores(tokenized_query))
    # Corpus positions of the k best contexts; torch.topk raises when k is
    # larger than the number of scores, so clamp it to the corpus size.
    _, top_context_id = torch.topk(context_scores, k=min(k, context_scores.numel()))

    if int(gold_context_id) in top_context_id.tolist():
      success_num += 1

  # An empty dataframe would otherwise divide by zero.
  topk_acc = (success_num/len(df)) * 100 if len(df) else 0.0

  print(f"Successful retrievals: {success_num}/{len(df)}")
  return topk_acc

# dev_df's context_id values are positions in dev_contexts, so the dev
# questions have to be scored against an index built from the dev paragraphs.
# Scoring them against the train index would compare unrelated ids.
dev_bm25 = BM25Okapi([word_tokenize(doc) for doc in dev_contexts])

print(f'Top-k retrieval accuracy of BM25 with k=5: {find_topk_acc(dev_df, 5, retriever=dev_bm25)}')
print(f'Top-k retrieval accuracy of BM25 with k=20: {find_topk_acc(dev_df, 20, retriever=dev_bm25)}')


Evaluating...: 100%|██████████| 10570/10570 [01:58<00:00, 89.35it/s]


Successful retrievals: 9192/10570
Top-k retrieval accuracy of BM25 with k=5: 86.96310312204352


Evaluating...: 100%|██████████| 10570/10570 [01:55<00:00, 91.37it/s]

Successful retrievals: 9946/10570
Top-k retrieval accuracy of BM25 with k=20: 94.0964995269631





Create another dataset with `qid, pos_id, neg_id, pos_score, neg_score` rows, where `pos_id` is the gold context and `neg_id` is a hard negative taken from the BM25 top-k.

In [None]:
def _pick_hard_negative(ranked_ids, gold_id):
  """Pick a hard negative from ranked_ids (best first); None if there is no non-gold candidate."""
  if gold_id in ranked_ids:
    gold_rank = ranked_ids.index(gold_id)
    # Prefer the context ranked right below the gold one; when the gold
    # context is ranked last, fall back to the one right above it.
    neighbour = gold_rank + 1 if gold_rank + 1 < len(ranked_ids) else gold_rank - 1
    return ranked_ids[neighbour] if neighbour >= 0 else None
  # Gold context was not retrieved: use the lowest-ranked retrieved context.
  return ranked_ids[-1] if ranked_ids else None


def create_new_dataset(df, outputfile, k=5, retriever=None):
  """
    Write a CSV of (query, gold context, hard negative) triples with BM25 scores.

    Inputs:
      df: dataframe with a "question" column and a "context_id" column holding
          the corpus position of each question's gold context
      outputfile: path of the CSV to write (overwritten if present)
      k: number of top-scoring contexts the negative is chosen from (>= 2)
      retriever: BM25 index whose corpus positions match df["context_id"];
          defaults to the notebook-level `bm25`
    Output file columns: qid (the dataframe index label), pos_id, neg_id,
      pos_score, neg_score
    Raises: ValueError if k < 2 or the corpus offers no context other than
      the gold one
  """
  if k < 2:
    raise ValueError("k must be at least 2 so that a negative context exists")

  scorer = bm25 if retriever is None else retriever
  header = ['qid', 'pos_id', 'neg_id', 'pos_score', 'neg_score']

  with open(outputfile, 'w', encoding="utf-8", newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    rows = zip(df.index, df["question"], df["context_id"])
    for row_id, query, gold_context_id in tqdm(rows, total=len(df), mininterval = 3, desc ="Evaluating..."):
      gold_context_id = int(gold_context_id)

      tokenized_query = word_tokenize(query)
      # Get relevance score for each context
      context_scores = torch.tensor(scorer.get_scores(tokenized_query))
      # Corpus positions of the k best contexts (k clamped to the corpus size)
      _, top_context_id = torch.topk(context_scores, k=min(k, context_scores.numel()))

      # 'Hard negative': a context that BM25 ranks highly for the query but
      # that is not the gold one. It is chosen afresh for every row, so a
      # negative picked for an earlier query can never leak into this one.
      neg_id = _pick_hard_negative(top_context_id.tolist(), gold_context_id)
      if neg_id is None:
        raise ValueError("corpus has no context other than the gold one")

      gold_score = context_scores[gold_context_id].item()
      neg_score = context_scores[neg_id].item()

      writer.writerow([row_id, gold_context_id, neg_id, gold_score, neg_score])

In [None]:
# TODO: create_new_dataset needs to be rewritten before it can also produce
# the two qid/pid triple files below.
# create_new_dataset(train_df, f'{root}/train_qidpidtriples.csv')
# create_new_dataset(dev_df, f'{root}/dev_qidpidtriples.csv')

# Score every training question against the BM25 index; the progress bar
# logged for this cell shows a run time of roughly 1h50.
create_new_dataset(train_df, f'{root}/trainset_bm25score.csv')
# Left disabled: the notebook-level bm25 index is built from train_contexts,
# while dev_df's context ids refer to dev_contexts.
# create_new_dataset(dev_df, f'{root}/devset_bm25score.csv')

Evaluating...: 100%|█████████▉| 87555/87599 [1:50:11<00:03, 13.75it/s]

[Optional] Run the following code to see how well BM25 does given a query and `k` value.

In [None]:
def retrieve_context(query, k):
  """
    Retrieve the k training contexts that BM25 scores highest for a query.

    Inputs:
      query: question text
      k: number of contexts to return (clamped to the corpus size)
    Outputs: (top_context_id, topk_context) - a tensor of corpus positions,
      best first, and the matching context texts in the same order.
      The top scores are printed as a side effect.
  """
  tokenized_query = word_tokenize(query)

  context_scores = torch.tensor(bm25.get_scores(tokenized_query))

  # torch.topk raises when k exceeds the number of scores.
  top_scores, top_context_id = torch.topk(context_scores, k=min(k, context_scores.numel()))

  # Look the texts up through the ids that were just ranked. bm25.get_top_n
  # would score the whole corpus a second time and sort independently, so its
  # texts could be ordered differently from top_context_id when scores tie.
  topk_context = [train_contexts[context_id] for context_id in top_context_id.tolist()]

  print(f'scores {top_scores.tolist()}')

  return top_context_id, topk_context

# Row of train_df used for the demonstration.
example_row = 2
query = train_df["question"][example_row]
golden_context_id = train_df["context_id"][example_row]
top_context_id, topk_context = retrieve_context(query, 20)

print(f"Query: {query}")
print(f"Golden context id: {golden_context_id}")
print(f"Top-k context id: {top_context_id}")
print(f"Top-k context:")

# Print the rank together with the real context id. A bare running counter
# starting at 0 reads like a context id and can be mistaken for the golden one.
for rank, (context_id, context) in enumerate(zip(top_context_id.tolist(), topk_context), start=1):
  print(f'{rank}. [context {context_id}] {context}')

scores [42.95682587765693, 36.78653003146273, 36.24635265423394, 34.970439667867275, 34.47066230049125, 34.4376577115112, 34.09289941867836, 33.9207635887153, 33.824768193267175, 33.74218653127046, 32.83243883810128, 32.732157104570014, 32.32543634463234, 32.04143470016326, 31.394329920331778, 31.370335423218275, 31.149947097604645, 31.062438959243575, 30.293788552416146, 30.080866952815366]
Query: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Golden context id: 0
Top-k context id: tensor([38, 52, 47,  6,  9, 46, 50,  8, 23, 19, 21, 22, 31, 49, 48, 14,  7, 39,
        15, 28])
Top-k context:
0. The University of Notre Dame du Lac (or simply Notre Dame /ˌnoʊtərˈdeɪm/ NOH-tər-DAYM) is a Catholic research university located adjacent to South Bend, Indiana, in the United States. In French, Notre Dame du Lac means "Our Lady of the Lake" and refers to the university's patron saint, the Virgin Mary. The main campus covers 1,250 acres in a suburban setting and it