# dense retrieval

작성일자: 210119\
작성자: 조진욱\
목표: 
1. 2-1 에서 Sparse 임베딩 모델(TFIDF) 대신 Dense 임베딩 모델(DPR) 을 사용해보자\
2. FAISS, datasets 패키지를 이용해서 retrieval 과정을 만들어보자
순서: 
2-1과 동일

다만 2-1 의 retrieval 결과와 성능을 비교할 예정

비고:
1. DPR, faiss, datasets 패키지를 그대로 사용한 예시라서 매우 쉽게 짜여 있음.
교육을 위해서는 어느 정도 직접 구현해 볼 수 있도록 만들어 둬야 함
2. 

TO DO:
1. batch 과정이 어려울 것 같으면 처음엔 batch size 1 로 진행

In [1]:
import json
import pandas as pd
from tqdm.notebook import tqdm
import os

In [2]:
import faiss

In [3]:
from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast

In [4]:
def embed(documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast) -> dict:
    """Compute the DPR embeddings of a batch of document passages.

    Args:
        documents: Batch holding "title" and "text" entries; both are handed to
            the tokenizer together (presumably parallel lists of strings, as
            produced by ``Dataset.map(batched=True)`` -- TODO confirm).
        ctx_encoder: DPR context encoder used to produce the embeddings.
        ctx_tokenizer: Tokenizer matching ``ctx_encoder``.

    Returns:
        A dict with a single key, "embeddings", holding the encoder's
        ``pooler_output`` as a numpy array on the CPU.
    """
    # Pad to the longest item of the batch and truncate over-long inputs so the
    # whole batch becomes one rectangular tensor of token ids.
    input_ids = ctx_tokenizer(
        documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt"
    )["input_ids"]
    # Inference only: disabling autograd avoids building a graph that would be
    # thrown away immediately and keeps memory use down on large corpora.
    with torch.no_grad():
        embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output
    return {"embeddings": embeddings.cpu().numpy()}

In [5]:
import torch
from functools import partial
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hugging Face model identifiers used for passage encoding and for RAG retrieval.
dpr_ctx_encoder_model_name = "facebook/dpr-ctx_encoder-multiset-base"
rag_model_name = "facebook/rag-sequence-nq"

# Number of passages embedded per forward pass.
batch_size = 16

# Input / output locations, relative to the notebook's working directory.
root_data_dir = "./data"
data_dir = "./data/squad_odqa"
output_dir = "./data/dense"

## dpr 용 csv 만들기

In [35]:
# Flatten the SQuAD-format dev file into one row per (context, question) pair
# and write the result as a tab-separated file under ``output_dir``.
with open(f'{root_data_dir}/squad/dev.json', 'r', encoding='utf-8') as reader:
    input_data = json.load(reader)['data']

# When True, the text of the first gold answer is stored in the 'answer'
# column; otherwise the column is left as None.
is_training = True
row_list = []

count = 0      # running question index over the whole file
ctx_count = 0  # running paragraph (context) index over the whole file
for doc_id, entry in enumerate(tqdm(input_data)):
    title = entry["title"]
    for paragraph in entry["paragraphs"]:
        context_text = paragraph["context"]

        for qa in paragraph["qas"]:
            question_text = qa["question"]
            answer_text = None

            # Unanswerable questions, and questions whose answer list is empty
            # or missing, keep answer=None instead of raising.
            gold_answers = qa.get("answers") or []
            if is_training and not qa.get("is_impossible", False) and gold_answers:
                answer_text = gold_answers[0]["text"]

            row_list.append(
                {
                    'title': title,
                    'text': context_text,
                    'question': question_text,
                    'answer': answer_text,
                    'title_id': doc_id,
                    'ctx_id': ctx_count,
                    'question_id': count,
                }
            )
            count += 1
        ctx_count += 1

print(len(row_list))
df = pd.DataFrame(row_list)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(f"{output_dir}/train_tcqaidx.csv", sep='\t', index=False)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


38708


## Step1 datasets 패키지 이용해서 데이터 로드 및 데이터셋 구축
https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

In [9]:
from datasets import Features, Sequence, Value, load_dataset
from typing import List, Optional

In [10]:
# Load a tab-separated passage file as the "train" split, naming its two
# columns "title" and "text" (presumably because the file has no header row --
# confirm).
# NOTE(review): no visible cell writes train_tc.csv; the cell above writes
# train_tcqaidx.csv with seven columns and a header. Confirm where this file
# is produced.
dataset = load_dataset(
    "csv", data_files=[f"{output_dir}/train_tc.csv"], split="train", delimiter="\t", column_names=["title", "text"]
)

Using custom data configuration default


Downloading and preparing dataset csv/default-1a69b9128733c8f4 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-1a69b9128733c8f4/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1a69b9128733c8f4/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2. Subsequent calls will reuse this data.


In [11]:
# Check which class load_dataset returned.
type(dataset)

datasets.arrow_dataset.Dataset

In [12]:
def split_text(text: str, n=100, character=" ") -> List[str]:
    """Cut ``text`` into passages of at most ``n`` ``character``-separated pieces.

    The text is split on ``character``, regrouped ``n`` pieces at a time, and
    each group is re-joined with ``character`` and stripped of surrounding
    whitespace.
    """
    pieces = text.split(character)
    passages = []
    for start in range(0, len(pieces), n):
        window = pieces[start : start + n]
        passages.append(character.join(window).strip())
    return passages

def split_documents(documents: dict) -> dict:
    """Expand a batch of documents into a batch of passages.

    Each non-None text is cut with ``split_text``; every resulting passage is
    paired with its document's title (an empty string when the title is None).
    Documents whose text is None contribute nothing.
    """
    passage_titles = []
    passage_texts = []
    for doc_title, doc_text in zip(documents["title"], documents["text"]):
        if doc_text is None:
            continue
        safe_title = "" if doc_title is None else doc_title
        chunks = split_text(doc_text)
        passage_titles.extend(safe_title for _ in chunks)
        passage_texts.extend(chunks)
    return {"title": passage_titles, "text": passage_texts}

# Apply split_documents to the dataset in batches, using 4 worker processes,
# and keep the passage-level result under the same name.
dataset = dataset.map(split_documents, batched=True, num_proc=4)







In [13]:
# Load the pretrained DPR context encoder, then move it to the selected device.
ctx_encoder = DPRContextEncoder.from_pretrained(dpr_ctx_encoder_model_name)
ctx_encoder = ctx_encoder.to(device=device)

# Tokenizer that matches the context encoder checkpoint.
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(dpr_ctx_encoder_model_name)

# Schema of the dataset once embeddings are added. Declaring the embeddings as
# float32 is optional; it saves space compared with float64.
new_features = Features(
    {
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32")),
    }
)

In [14]:
# Bind the encoder and tokenizer so the mapped function only needs the batch.
embed_batch = partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer)

# Embed the passages batch_size rows at a time and store the result using the
# float32 schema declared in new_features.
dataset = dataset.map(embed_batch, batched=True, batch_size=batch_size, features=new_features)

HBox(children=(FloatProgress(value=0.0, max=850.0), HTML(value='')))




In [15]:
# Persist the embedded passages under output_dir.
passages_path = os.path.join(output_dir, "my_knowledge_dataset")
dataset.save_to_disk(passages_path)

## Step2 index the dataset

In [16]:
# === index config ===
dim = 768 # Dimension of the embedding vectors stored in the HNSW Faiss index.
num = 128 # Number of bi-directional links created for every new element during HNSW index construction.

# Build an HNSW index that scores by inner product and attach it to the
# "embeddings" column of the dataset.
index = faiss.IndexHNSWFlat(dim, num, faiss.METRIC_INNER_PRODUCT)
dataset.add_faiss_index("embeddings", custom_index=index)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




Dataset({
    features: ['text', 'title', 'embeddings'],
    num_rows: 13598
})

In [17]:
# Save the Faiss index attached to the "embeddings" column to a file under output_dir.
index_path = os.path.join(output_dir, "my_knowledge_dataset_hnsw_index.faiss")
dataset.get_index("embeddings").save(index_path)
# To reload the index later:
# dataset.load_faiss_index("embeddings", index_path)

## Step3 retrieve

In [18]:
from transformers import RagRetriever, RagTokenizer, RagSequenceForGeneration

In [19]:
# Create a RAG retriever from the pretrained RAG checkpoint, backed by the
# custom indexed dataset built above instead of a predefined index.
retriever = RagRetriever.from_pretrained(
    rag_model_name, index_name="custom", indexed_dataset=dataset
)

In [22]:
# `tokenizer` and `model` are used from here on but are not created in any
# visible cell, so they are built here from the same RAG checkpoint, with the
# model wired to the custom retriever (later cells call `model.retriever`).
tokenizer = RagTokenizer.from_pretrained(rag_model_name)
model = RagSequenceForGeneration.from_pretrained(rag_model_name, retriever=retriever)

# Alternative example question:
# question = "Where did Greece culture begin?"
question = "What do neuroanatomists study?" # 10761 doc expected, 55th title
# Token ids of the question, as a PyTorch tensor, from the question-encoder tokenizer.
input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

In [23]:
# Run the model's question encoder on the token ids and keep the first element
# of its output as the question representation.
question_hidden_states = model.question_encoder(input_ids)[0]

In [24]:
# The retriever takes the question vectors as a float32 numpy array on the CPU.
query_vectors = question_hidden_states.cpu().detach().to(torch.float32).numpy()

# Retrieve 2 documents for the question and get them back as PyTorch tensors.
retriever_outputs = model.retriever(
    input_ids,
    query_vectors,
    prefix=model.generator.config.prefix,
    n_docs=2,
    return_tensors="pt",
)

# set to correct device: match the device and dtype of input_ids
context_input_ids = retriever_outputs["context_input_ids"].to(input_ids)

In [25]:
# Inspect the shape of the retrieved context token ids.
context_input_ids.shape

torch.Size([2, 300])

In [26]:
# Call the retriever directly, asking for 3 documents. The question vectors
# are passed as a float32 numpy array on the CPU.
query_vectors = question_hidden_states.cpu().detach().to(torch.float32).numpy()
retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(query_vectors, n_docs=3)

In [27]:
# Inspect the shape of the embeddings returned by retriever.retrieve.
retrieved_doc_embeds.shape

(1, 3, 768)

In [28]:
# Display the ids returned by retriever.retrieve.
doc_ids

array([[3609, 3611, 4742]])

In [29]:
# List the fields available in the first returned document dict.
doc_dicts[0].keys()

dict_keys(['embeddings', 'text', 'title'])

In [30]:
# Show the "title" field of the first returned document dict.
doc_dicts[0]['title']

['Brain', 'Brain', 'Immunology']

In [31]:
# Show the "text" field of the first returned document dict.
doc_dicts[0]['text']

['The oldest method of studying the brain is anatomical, and until the middle of the 20th century, much of the progress in neuroscience came from the development of better cell stains and better microscopes. Neuroanatomists study the large-scale structure of the brain as well as the microscopic structure of neurons and their components, especially synapses. Among other tools, they employ a plethora of stains that reveal neural structure, chemistry, and connectivity. In recent years, the development of immunostaining techniques has allowed investigation of neurons that express specific sets of genes. Also, functional neuroanatomy uses medical imaging techniques to correlate variations',
 'Neurophysiologists study the chemical, pharmacological, and electrical properties of the brain: their primary tools are drugs and recording devices. Thousands of experimentally developed drugs affect the nervous system, some in highly specific ways. Recordings of brain activity can be made using electr

In [None]:
# 55	2684	10758~10761 10760~10763
# 55	2685	10762~10765 10764~10767
# 71	3484	14230~14234 14232~14236

In [37]:
# Print every instance attribute of the Faiss index object, one per line.
index_attrs = vars(index)
for attr in index_attrs:
    val = index_attrs[attr]
    print(f"{attr} = {val}")

this = <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7f3d10336750>


In [40]:
# NOTE: this cell originally held only the incomplete expression `model.`,
# which is a SyntaxError (see the recorded output below). It is kept as a
# comment so the rest of the file can be parsed.
# model.

SyntaxError: invalid syntax (<ipython-input-40-41c238f641ce>, line 1)

In [6]:
def chunk_tensor(t, chunk_size):
    """Split ``t`` along its first axis into consecutive slices.

    Every slice holds ``chunk_size`` items except possibly the last, which
    holds whatever remains. Works for anything that supports ``len`` and
    slicing.
    """
    total = len(t)
    starts = range(0, total, chunk_size)
    return [t[lo : lo + chunk_size] for lo in starts]

In [9]:
# Small 3x2 float32 tensor of ones, used as example input.
ex = torch.ones((3, 2), dtype=torch.float32)

In [10]:
# Split the example tensor into chunks of at most 2 rows.
chunk_tensor(ex, 2)

[tensor([[1., 1.],
         [1., 1.]]),
 tensor([[1., 1.]])]