In [1]:
import os
# Expose only GPU 0 to this process. This is set in the first cell, ahead of
# the cell that imports torch, so the restriction is in place before any CUDA
# work happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
import numpy as np
import torch
import faiss
from datasets import load_dataset
from tqdm.auto import tqdm

# Prefer the GPU when one is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from config import QAConfig
from data_utils import SquadDataProcessor 
from retriever import TextEncoder

In [4]:
# Project-wide settings object; later cells read cfg.DATASET_NAME and
# cfg.NUM_PROC from it.
cfg = QAConfig()
# Dataset pre-processing helper, constructed from the same configuration.
data_processor = SquadDataProcessor(cfg)

## Load data

In [5]:
# Pull only the first 5% of the training split to keep the experiment small.
raw_train_dataset = load_dataset(
    cfg.DATASET_NAME,
    split="train[:5%]",
    num_proc=cfg.NUM_PROC,
)

# Run the project's pre-processing over the raw split.
# NOTE(review): train_data is not referenced by any of the cells shown in this
# notebook - the embeddings are built from raw_train_dataset instead.
train_data = data_processor.process_data(
    raw_train_dataset,
    data_type="train",
)

Processing train data


In [6]:
# Name of the pretrained checkpoint handed to the encoder wrapper.
MODEL_NAME = "distilbert-base-uncased"
# Project-local wrapper around the model; on construction it logs that the
# model parameters are being frozen (see the cell output).
text_encoder = TextEncoder(MODEL_NAME)

freezing distilbert-base-uncased parameters


## Create question embeddings for similarity search

In [7]:
# Name of the column that will hold one embedding vector per question
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question_batch(batch):
    """Encode a batch of questions and return them as a new dataset column.

    `batch` is the column-wise mapping that `Dataset.map(..., batched=True)`
    passes in; only its 'question' entry is read. The encoder output is
    detached, moved to the CPU and converted to a NumPy array before being
    returned (NumPy arrays are required for HF Datasets).
    """
    question_vectors = text_encoder.get_embeddings(batch['question'])
    return {EMBEDDING_COLUMN: question_vectors.detach().cpu().numpy()}


embeddings_dataset = raw_train_dataset.map(_embed_question_batch, batched=True)

Map:   0%|          | 0/6516 [00:00<?, ? examples/s]

## Initialize FAISS Index

In [8]:
# Build a FAISS index over the embedding column so that nearest-neighbour
# lookups can be run against it. No metric or index factory is passed, so the
# library default is used (NOTE(review): presumably a flat L2 index - confirm
# against the installed `datasets` version).
# The call is the last expression of the cell, so the returned Dataset is
# echoed as the cell output.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

  0%|          | 0/7 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'question_embedding'],
    num_rows: 6516
})

## Sample search

In [9]:
# Embed a single query question with the same encoder that produced the
# indexed question embeddings.
question = 'When did Beyonce start becoming popular?'

# get_embeddings is given a one-element list, so the result has one row.
query_tensor = text_encoder.get_embeddings([question])
input_quest_embedding = query_tensor.detach().cpu().numpy()

# Display the shape as a quick check: one row per query.
input_quest_embedding.shape

(1, 768)

In [10]:
# Number of neighbours to retrieve for the query
TOP_K = 5

# Look up the TOP_K indexed questions closest to the query embedding. The
# result unpacks into the per-hit scores and a column-wise dict of the
# matching rows.
nearest = embeddings_dataset.get_nearest_examples(
    EMBEDDING_COLUMN,
    input_quest_embedding,
    k=TOP_K,
)
scores, samples = nearest

In [11]:
# Print the hits in rank order. Judging by the recorded output (the identical
# question scores ~0 and the values grow down the list), the score behaves
# like a distance: smaller means closer.
# The context passage is deliberately left out to keep the output short.
hit_questions = samples["question"]
hit_answers = samples["answers"]
for rank, (score, hit_question, hit_answer) in enumerate(
    zip(scores, hit_questions, hit_answers), start=1
):
    print(f'Top {rank}\tScore: {score}')
    print(f'Question: {hit_question}')
    print(f'Answer: {hit_answer}')
    print()

Top 1	Score: 1.287664141980116e-10
Question: When did Beyonce start becoming popular?
Answer: {'text': ['in the late 1990s'], 'answer_start': [269]}

Top 2	Score: 2.613532066345215
Question: When did Beyoncé rise to fame?
Answer: {'text': ['late 1990s'], 'answer_start': [276]}

Top 3	Score: 4.859475135803223
Question: When did Beyoncé release Formation?
Answer: {'text': ['February 6, 2016'], 'answer_start': [3]}

Top 4	Score: 5.054233074188232
Question: In which decade did Beyonce become famous?
Answer: {'text': ['late 1990s'], 'answer_start': [276]}

Top 5	Score: 5.170374393463135
Question: When did Beyonce begin her deals with name brands?
Answer: {'text': ['since the age of 18'], 'answer_start': [433]}

