<a href="https://colab.research.google.com/github/juliatessler/1s2023-unicamp-dl-for-search-systems/blob/main/7-dense-passage-retriever/7_dense_passage_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages for this notebook (-q keeps pip quiet).
# NOTE(review): versions are unpinned, and not every package listed here is
# imported in the cells visible in this notebook — confirm which are needed.
!pip install transformers -q
!pip install ftfy -q
!pip install sentencepiece -q
!pip install evaluate -q
!pip install pyserini -q
!pip install faiss-cpu -q
!pip install trectools -q
!pip install jsonlines -q
!pip install hnswlib -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Mount Google Drive so that files written under work_dir persist beyond the
# Colab session; force_remount re-mounts even when the drive is already mounted.
from google.colab import drive

drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [None]:
# Root folder on the mounted Drive; per-epoch checkpoints and the HNSW index are saved below it
work_dir = '/content/gdrive/MyDrive/Unicamp/DL_applied_to_IR'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Apr 19 23:48:14 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [None]:
import pandas as pd
import numpy as np
import ftfy
import jsonlines

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    BatchEncoding,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup, 
    AdamW
)

from tqdm.auto import tqdm
from evaluate import load
from collections import defaultdict

import os
import csv
import pickle
import time
import hnswlib

In [None]:
# Hugging Face checkpoint used for the tokenizer and for both encoders
model_name = 'microsoft/MiniLM-L12-H384-uncased'

# Maximum number of tokens kept per text (longer inputs are truncated)
max_length = 256 
# Number of examples per DataLoader batch
batch_size = 32
# Number of passes over the training set
epochs = 20
# Learning rate handed to both optimizers
lr = 2e-5

## Dataset retrieval and preparation


In [None]:
# Download the tiny MS MARCO training triples (TSV) into the current directory
!wget https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv

--2023-04-19 23:48:22--  https://storage.googleapis.com/unicamp-dl/ia368dd_2023s1/msmarco/msmarco_triples.train.tiny.tsv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.4.128, 142.251.10.128, 142.251.12.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.4.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8076179 (7.7M) [text/tab-separated-values]
Saving to: ‘msmarco_triples.train.tiny.tsv’


2023-04-19 23:48:24 (7.31 MB/s) - ‘msmarco_triples.train.tiny.tsv’ saved [8076179/8076179]



In [None]:
# The TSV has no header row; each line holds a query, a positive passage
# and a negative passage, in that order.
triple_columns = ['query', 'pos_doc', 'neg_doc']
df = pd.read_csv('msmarco_triples.train.tiny.tsv', sep='\t', header=None, names=triple_columns)

df.head()

Unnamed: 0,query,pos_doc,neg_doc
0,is a little caffeine ok during pregnancy,We donât know a lot about the effects of caf...,It is generally safe for pregnant women to eat...
1,what fruit is native to australia,Passiflora herbertiana. A rare passion fruit n...,"The kola nut is the fruit of the kola tree, a ..."
2,how large is the canadian military,The Canadian Armed Forces. 1 The first large-...,The Canadian Physician Health Institute (CPHI)...
3,types of fruit trees,Cherry. Cherry trees are found throughout the ...,"The kola nut is the fruit of the kola tree, a ..."
4,how many calories a day are lost breastfeeding,"Not only is breastfeeding better for the baby,...","However, you still need some niacin each day; ..."


In [None]:
# Repair mojibake in the text columns and keep only the (query, positive passage)
# pairs; the negative passage is not used because training relies on in-batch negatives.
# Building a new frame (instead of dropping 'neg_doc' in place) keeps this cell
# idempotent: re-running it no longer fails because the column is already gone.
df = df.assign(
    query=df['query'].apply(ftfy.fix_text),
    pos_doc=df['pos_doc'].apply(ftfy.fix_text),
)[['query', 'pos_doc']]

df.head()

Unnamed: 0,query,pos_doc
0,is a little caffeine ok during pregnancy,We don't know a lot about the effects of caffe...
1,what fruit is native to australia,Passiflora herbertiana. A rare passion fruit n...
2,how large is the canadian military,The Canadian Armed Forces. 1 The first large-...
3,types of fruit trees,Cherry. Cherry trees are found throughout the ...
4,how many calories a day are lost breastfeeding,"Not only is breastfeeding better for the baby,..."


In [None]:
# Hold out 10% of the pairs for evaluation; the fixed random_state keeps the split reproducible
train_df, eval_df = train_test_split(df, test_size = 0.1, random_state = 42)

In [None]:
# Remove this cell after initial pipeline debug
# train_df = train_df.sample(1000)
# eval_df = eval_df.sample(1000)

Peguei uma ideia do Leandro Carísio: salvar os itens tokenizados num dict para não ter que tokenizar novamente a cada época.

In [None]:
class DPRDataset(Dataset):
    """Dataset over a list of texts that tokenizes lazily and memoizes the result.

    Each text is tokenized the first time its index is requested; later
    requests for the same index return the stored encoding, so texts are not
    re-tokenized on every epoch.

    Args:
        tokenizer: callable tokenizer applied to each text.
        textos: indexable collection of raw texts.
        max_seq_length: truncation length forwarded to the tokenizer.
    """

    def __init__(self, tokenizer, textos, max_seq_length = max_length):
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.textos = textos
        # Maps str(index) -> tokenizer output for that index
        self.cache = {}

    def __len__(self):
        return len(self.textos)

    def __getitem__(self, idx):
        # str(idx) is used as the key so that unhashable indices such as
        # slices can be cached as well as plain integers.
        key = str(idx)
        # Tokenize only on a cache miss. Passing the tokenizer call as the
        # default of dict.get() would evaluate it on every access, because
        # Python evaluates call arguments before the call, defeating the cache.
        if key not in self.cache:
            self.cache[key] = self.tokenizer(self.textos[idx],
                                             padding=True,
                                             truncation=True,
                                             max_length=self.max_seq_length
                                             )
        return self.cache[key]

In [None]:
# Tokenizer for the checkpoint in model_name; shared by every dataset and by collate_fn
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Separa os conjuntos de treinamento e validação
queries_train = train_df['query'].tolist()
docs_train = train_df['pos_doc'].tolist()
queries_eval = eval_df['query'].tolist()
docs_eval = eval_df['pos_doc'].tolist()

In [None]:
# Training datasets (queries and passages are kept in two parallel datasets)
dataset_queries_train = DPRDataset(tokenizer, queries_train)
dataset_docs_train = DPRDataset(tokenizer, docs_train)

# Validation datasets
dataset_queries_eval = DPRDataset(tokenizer, queries_eval)
dataset_docs_eval = DPRDataset(tokenizer, docs_eval)

In [None]:
def collate_fn(batch):
    """Pad a list of tokenized examples to a common length and return them as PyTorch tensors.

    Uses the notebook-level `tokenizer` for padding; the padded result is
    wrapped in a `BatchEncoding`.
    """
    padded = tokenizer.pad(batch, return_tensors='pt')
    return BatchEncoding(padded)

In [None]:
# All loaders share the same settings. Shuffling stays off so that the i-th
# query batch and the i-th document batch hold matching pairs when the two
# loaders are iterated in parallel.
loader_options = dict(batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader_queries_train = DataLoader(dataset_queries_train, **loader_options)
dataloader_docs_train = DataLoader(dataset_docs_train, **loader_options)

dataloader_queries_val = DataLoader(dataset_queries_eval, **loader_options)
dataloader_docs_val = DataLoader(dataset_docs_eval, **loader_options)

## Model settings

In [None]:
# Two separate encoder instances loaded from the same checkpoint:
# one embeds passages, the other embeds queries.
model_doc = AutoModel.from_pretrained(model_name).to(device)
model_query = AutoModel.from_pretrained(model_name).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

In [None]:
# Per-example losses at or below this value count as zero when compute_loss picks its averaging denominator
EPSILON=1e-8

In [None]:
def compute_loss(doc_outputs, query_outputs):
    """In-batch-negatives contrastive loss over [CLS] embeddings.

    Args:
        doc_outputs: passage-encoder output exposing `last_hidden_state`.
        query_outputs: query-encoder output exposing `last_hidden_state`.
            Assumes the i-th passage and the i-th query of the batch form a
            positive pair; every other query in the batch acts as a negative.

    Returns:
        Scalar tensor: the per-passage losses summed and divided by the number
        of losses greater than EPSILON (plain mean when none exceeds it).
    """
    # First token ([CLS]) embedding of each sequence
    doc_cls = doc_outputs.last_hidden_state[:, 0, :]
    query_cls = query_outputs.last_hidden_state[:, 0, :]

    # scores[i, j] = dot product of passage i with query j
    scores = torch.mm(doc_cls, query_cls.t())
    # Score of each passage with its own (positive) query, shaped to broadcast over rows
    positive_scores = scores.diag().unsqueeze(1)

    # -log softmax of the positive pair, per passage. logsumexp is used instead
    # of log(sum(exp(.))) because exp() overflows to inf in float32 once a
    # negative outscores the positive by more than ~88.
    loss = torch.logsumexp(scores - positive_scores, dim=1)

    # Average only over examples that still contribute a non-negligible loss
    non_zeroed_losses = (loss > EPSILON).float().sum()

    if non_zeroed_losses > 0.0:
        final_loss = torch.sum(loss) / non_zeroed_losses
    else:
        final_loss = torch.mean(loss)

    return final_loss

## Train models

In [None]:
# Training loop
optimizer_query = AdamW(model_query.parameters(), lr=lr)
optimizer_doc = AdamW(model_doc.parameters(), lr=lr)

num_training_steps = epochs * len(dataloader_queries_train)
num_warmup_steps = int(num_training_steps * 0.1)

scheduler_query = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer_query, num_warmup_steps, num_training_steps)   
scheduler_doc = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer_doc, num_warmup_steps, num_training_steps)   

current_training_step = 0

for epoch in tqdm(range(epochs), desc='Epochs'):
    model_query.train()
    model_doc.train()
    
    train_losses = []
    for batch_query, batch_docs in tqdm(list(zip(dataloader_queries_train, dataloader_docs_train)), mininterval=0.5, desc='Train', disable=False):
        current_training_step += 1
        
        optimizer_query.zero_grad()
        optimizer_doc.zero_grad()
        
        doc_outputs = model_doc(**batch_docs.to(device))
        query_outputs = model_query(**batch_query.to(device))

        loss = compute_loss(doc_outputs, query_outputs)
        loss.backward()
        
        optimizer_query.step()
        optimizer_doc.step()

        scheduler_query.step()
        scheduler_doc.step()

        train_losses.append(loss.detach().cpu().numpy())
    
    model_query.save_pretrained(f'{work_dir}/homework7/{epoch}/query/')
    model_doc.save_pretrained(f'{work_dir}/homework7/{epoch}/doc/')

    model_query.eval()
    model_doc.eval()
    
    print("Epoch: {}, Training loss: {:0.4f}".format(epoch + 1, np.mean(train_losses)))



Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 1, Training loss: 2.3656


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 2, Training loss: 0.3922


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 3, Training loss: 0.1560


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 4, Training loss: 0.0960


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 5, Training loss: 0.0594


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 6, Training loss: 0.0504


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 7, Training loss: 0.0284


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 8, Training loss: 0.0316


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 9, Training loss: 0.0195


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 10, Training loss: 0.0183


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 11, Training loss: 0.0159


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 12, Training loss: 0.0129


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 13, Training loss: 0.0101


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 14, Training loss: 0.0122


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 15, Training loss: 0.0074


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 16, Training loss: 0.0062


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 17, Training loss: 0.0064


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 18, Training loss: 0.0046


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 19, Training loss: 0.0042


Train:   0%|          | 0/310 [00:00<?, ?it/s]

Epoch: 20, Training loss: 0.0046


## TREC-COVID

In [None]:
# Fetch the BEIR TREC-COVID archive into ./collections and unpack it there
# (unzip -o overwrites files left by a previous run).
!wget https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip -P collections # type: ignore
!unzip -o collections/trec-covid.zip -d ./collections # type: ignore

--2023-04-20 00:07:03--  https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73876720 (70M) [application/zip]
Saving to: ‘collections/trec-covid.zip’


2023-04-20 00:07:11 (12.2 MB/s) - ‘collections/trec-covid.zip’ saved [73876720/73876720]

Archive:  collections/trec-covid.zip
   creating: ./collections/trec-covid/
   creating: ./collections/trec-covid/qrels/
  inflating: ./collections/trec-covid/qrels/test.tsv  
  inflating: ./collections/trec-covid/corpus.jsonl  
  inflating: ./collections/trec-covid/queries.jsonl  


In [None]:
# with open('./collections/trec-covid/qrels/test.tsv', 'r') as fin:
#   data = fin.read().splitlines(True)
# with open('./collections/trec-covid/qrels/test_corrigido.tsv', 'w') as fout:
#   for line in data[1:]:
#     row = line.split()
#     fout.write(f'{row[0]}\t0\t{row[1]}\t{row[2]}\n')

### Vectorizing TREC-COVID

In [None]:
# Read the TREC-COVID queries, keeping ids and texts in two parallel lists
query_ids = []
query_texts = []

with jsonlines.open("./collections/trec-covid/queries.jsonl") as reader:
  # The loop variable is named `record` (and ids are appended directly) so the
  # builtin `id` is not overwritten for the rest of the notebook session.
  for record in reader:
    query_ids.append(record["_id"])
    query_texts.append(record["text"])

dataset_queries_test = DPRDataset(tokenizer, query_texts)
# shuffle=False keeps the encoded rows in the same order as query_ids
dataloader_queries_test = DataLoader(dataset_queries_test,
                                     batch_size=batch_size, shuffle=False,
                                     collate_fn=collate_fn)

In [None]:
# Read the TREC-COVID corpus, keeping ids and texts in two parallel lists
passage_ids = []
passage_texts = []

with jsonlines.open("./collections/trec-covid/corpus.jsonl") as reader:
  # The loop variable is named `record` (and ids are appended directly) so the
  # builtin `id` is not overwritten for the rest of the notebook session.
  for record in reader:
    passage_ids.append(record["_id"])
    # Title and body are embedded together as one passage text
    passage_texts.append(record["title"] + ' ' + record["text"])

dataset_passages_test = DPRDataset(tokenizer, passage_texts)
# shuffle=False keeps the encoded rows in the same order as passage_ids
dataloader_passages_test = DataLoader(dataset_passages_test,
                                      batch_size=batch_size, shuffle=False,
                                      collate_fn=collate_fn)

#### Queries

In [None]:
# Embed every test query with the query encoder (inference only, no gradients)
model_query.eval()

# Per-batch [CLS] embeddings are collected and concatenated once at the end;
# concatenating inside the loop would re-copy the growing matrix on every batch.
query_chunks = []

with torch.no_grad():
  for query_batch in tqdm(dataloader_queries_test, mininterval=0.5, desc='Test',
                          disable=False):
    query_outputs = model_query(**query_batch.to(device))
    query_cls = query_outputs.last_hidden_state[:, 0, :]
    query_chunks.append(query_cls)

# One row per query; stays None when the loader yielded no batches
queries_matrix = torch.cat(query_chunks, dim=0) if query_chunks else None

Test:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Sanity check: expect (number of queries, embedding size)
queries_matrix.shape

torch.Size([50, 384])

#### Documents

In [None]:
# Embed every corpus passage with the passage encoder (inference only, no gradients)
model_doc.eval()

# Per-batch [CLS] embeddings are collected and concatenated once at the end;
# concatenating inside the loop would re-copy the growing matrix on every one
# of the thousands of batches (quadratic copying).
passage_chunks = []

with torch.no_grad():
  for doc_batch in tqdm(dataloader_passages_test, mininterval=0.5, desc='Test',
                        disable=False):
    doc_outputs = model_doc(**doc_batch.to(device))
    doc_cls = doc_outputs.last_hidden_state[:, 0, :]
    passage_chunks.append(doc_cls)

# One row per passage; stays None when the loader yielded no batches
passages_matrix = torch.cat(passage_chunks, dim=0) if passage_chunks else None

Test:   0%|          | 0/5355 [00:00<?, ?it/s]

#### Retrieve top docs

In [None]:
# Dot-product score of every query (rows) against every passage (columns)
similarity = queries_matrix @ passages_matrix.t()

In [None]:
# Number of passages retrieved per query
k = 1000

In [None]:
# Build the exact-search run as a dict of parallel columns
# (query, q0, docid, rank, score, system), k best passages per query.
run = defaultdict(list)

# Never request more hits than there are passages; every column below is
# sized with this count so the lists always stay aligned.
top_k = min(k, similarity.shape[1])

# topk returns the best scores per row already sorted in descending order,
# without sorting the whole corpus for each query.
top_scores, top_indices = torch.topk(similarity, top_k, dim=1)
# A single transfer to Python lists instead of one .item() call per hit
top_scores = top_scores.tolist()
top_indices = top_indices.tolist()

for i in range(similarity.shape[0]):
  query_id = query_ids[i]
  doc_ids = [passage_ids[passage_idx] for passage_idx in top_indices[i]]
  run["query"] += [query_id] * top_k
  run["docid"] += doc_ids
  run["score"] += top_scores[i]
  run["q0"] += ["q0"] * top_k
  run["rank"] += list(range(1, top_k + 1))
  run["system"] += ["dense_ret"] * top_k

## Evaluation

In [None]:
# Load the relevance judgements (the file's own header line is skipped and
# replaced by explicit column names), add the constant "q0" column and
# convert the frame to a dict of column lists.
qrel_df = pd.read_csv(
    "./collections/trec-covid/qrels/test.tsv",
    sep="\t",
    header=None,
    skiprows=1,
    names=["query", "docid", "rel"],
)
qrel = qrel_df.assign(q0="q0").to_dict(orient="list")

In [None]:
def eval_ndcg10(run):
  """Score a run against the notebook-level `qrel` and return its NDCG@10.

  Args:
    run: dict of parallel column lists describing the retrieved documents.

  Returns:
    The 'NDCG@10' entry of the results computed by the "trec_eval" metric.
  """
  metric = load("trec_eval")
  scores = metric.compute(predictions=[run], references=[qrel])
  ndcg_at_10 = scores['NDCG@10']
  return ndcg_at_10

In [None]:
# NDCG@10 of the exact (brute-force) dot-product search
eval_ndcg10(run)

0.24587433032676137

Using `hnswlib`: https://github.com/nmslib/hnswlib

Got this last bit from Monique

In [None]:
# Embedding dimensionality, read from the encoder config; used to size the HNSW index
embedding_size = model_query.config.hidden_size
embedding_size

384

In [None]:
#Defining our hnswlib index
index_path = f"{work_dir}/hnswlib.index"
index = hnswlib.Index(space = 'ip', dim = embedding_size)

if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
else:
    ### Create the HNSWLIB index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements = passages_matrix.shape[0])

    corpus_embeddings = passages_matrix.cpu().numpy()

    # Then we train the index to find a suitable clustering
    index.add_items(corpus_embeddings, list(range(len(corpus_embeddings))))

    print("Saving index to:", index_path)
    index.save_index(index_path)

# Controlling the recall by setting ef:
index.set_ef(k+1)  # ef should always be > top_k_hits

Start creating HNSWLIB index
Saving index to: /content/gdrive/MyDrive/Unicamp/DL_applied_to_IR/hnswlib.index


In [None]:
# Build the approximate-search run, using the same column layout as the exact run
run_hnswlib_knn = defaultdict(list)

for query_pos in range(queries_matrix.shape[0]):
  query_vector = queries_matrix[query_pos].cpu().numpy()

  # Ask the HNSW index for the k nearest passages of this single query
  labels, dists = index.knn_query(query_vector, k=k)

  # Only one query was sent, so the hits are in row 0. The index was created
  # with space='ip', for which hnswlib reports 1 - inner product as the
  # distance; 1 - distance therefore gives back the dot-product score.
  hits = []
  for label, dist in zip(labels[0], dists[0]):
    hits.append({'corpus_id': passage_ids[label], 'score': 1-dist})
  hits.sort(key=lambda hit: hit['score'], reverse=True)

  current_query_id = query_ids[query_pos]
  run_hnswlib_knn["query"].extend([current_query_id] * k)
  run_hnswlib_knn["docid"].extend(hit["corpus_id"] for hit in hits)
  run_hnswlib_knn["score"].extend(hit["score"] for hit in hits)
  run_hnswlib_knn["q0"].extend(["q0"] * k)
  run_hnswlib_knn["rank"].extend(range(1, k+1))
  run_hnswlib_knn["system"].extend(["dense_ret"] * k)

In [None]:
# NDCG@10 of the approximate HNSW search, to compare with the exact-search score
eval_ndcg10(run_hnswlib_knn)

0.24587433032676137