In [1]:
!pip install transformers -q
!pip install faiss-gpu -q
!pip install -q ftfy
!pip install pyserini -q

In [2]:
import os
# Work from the course folder (presumably the Colab Google Drive mount — confirm).
# Every relative path used later (data/, runs/, encodeds/, fine_tuned/, pyserini/)
# is resolved against this directory.
os.chdir('/content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula6')


import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

import json
import pickle

import gc

import faiss

In [3]:
import torch

from transformers import AutoTokenizer,  AutoModel

In [4]:
if not os.path.exists("pyserini"):
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

    !pip install pyserini -q
    !git clone --recurse-submodules https://github.com/castorini/pyserini.git
    !cd pyserini
    !cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
    !cd tools/eval/ndeval && make && cd ../../..
else:
    !chmod +x pyserini/tools/eval/trec_eval.9.0.4/trec_eval

    print("Pyserini already installed...")

chmod: cannot access 'pyserini/tools/eval/trec_eval.9.0.4/trec_eval': No such file or directory
Pyserini already installed...


In [5]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Hugging Face identifier of the base MiniLM checkpoint.
MODEL_NAME = 'microsoft/MiniLM-L12-H384-uncased'

In [6]:
%%markdown
# Download and prepare validation dataset (trec-covid)

# Download and prepare validation dataset (trec-covid)


In [7]:
url_trec_covid = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip'
if not Path('./data/trec-covid.zip').is_file():
  !wget {url_trec_covid} -P data/ # type: ignore
  !unzip -o collections/trec-covid.zip -d ./data # type: ignore

# Rewrite the qrels shipped in trec-covid.zip into the expected layout:
# "<query id>\t0\t<doc id>\t<relevance>", dropping the header row.
with open('./data/trec-covid/qrels/test.tsv', 'r') as qrels_in:
  qrels_rows = qrels_in.read().splitlines(True)

with open('./data/trec-covid/qrels/test_corrigido.tsv', 'w') as qrels_out:
  # The first row is skipped; the remaining rows are whitespace-separated.
  for qrels_row in qrels_rows[1:]:
    fields = qrels_row.split()
    query_id, doc_id, relevance = fields[0], fields[1], fields[2]
    qrels_out.write(f'{query_id}\t0\t{doc_id}\t{relevance}\n')

In [8]:
def load_queries():
  """Load the TREC-COVID queries as a list of ``(id, text)`` tuples.

  On the first call the queries are parsed from
  ``./data/trec-covid/queries.jsonl`` and cached as
  ``./data/trec-covid/queries.csv``; later calls read the CSV cache.

  Both paths return the same value: ids and texts are always ``str`` and every
  entry is a tuple. The cache is read with ``dtype=str`` and
  ``keep_default_na=False`` so pandas does not turn an id such as ``"02"`` into
  the integer ``2`` or a text such as ``"NA"`` into ``NaN``.

  Returns:
    list[tuple[str, str]]: one ``(query id, query text)`` pair per query, in
    file order.
  """
  cache_path = Path("./data/trec-covid/queries.csv")
  if not cache_path.is_file():
    topics = []
    with open("./data/trec-covid/queries.jsonl", "r") as fin:
      for line in fin:
        query = json.loads(line)
        topics.append((str(query["_id"]), query["text"]))
    topics_df = pd.DataFrame(topics, columns=["id", "text"])
    topics_df.to_csv(cache_path, index=False)
  else:
    cached = pd.read_csv(cache_path, dtype=str, keep_default_na=False)
    topics = list(zip(cached["id"], cached["text"]))
  return topics

def load_corpus():
  """Read ``./data/trec-covid/corpus.jsonl`` into a dict.

  Each line is a JSON object; the result maps its ``_id`` to the string
  ``"<title>. <text>"``. If an id occurs more than once, the last line wins.
  """
  with open("./data/trec-covid/corpus.jsonl", "r") as fin:
    records = (json.loads(raw_line) for raw_line in fin)
    return {
        record["_id"]: f"{record['title']}. {record['text']}"
        for record in records
    }

In [9]:
# corpus: dict mapping document id -> "<title>. <text>".
corpus = load_corpus()
# topics: sequence of (query id, query text) pairs.
topics = load_queries()

In [10]:
%%markdown
# Geração de embeddings

# Geração de embeddings


In [11]:
def generate_embeddings(model,tokenizer, sentence,device):
  """Encode one or more texts and return the embedding of each first token.

  Args:
    model: callable accepting the tokenizer output as keyword arguments and
      returning an object with a ``last_hidden_state`` tensor.
    tokenizer: callable turning ``sentence`` into a mapping of tensors.
    sentence: a text or a list of texts. Texts longer than 512 tokens are
      truncated; a batch is padded to its longest member so that texts of
      different lengths can be stacked into one tensor.
    device: torch device the inputs are moved to before the forward pass.

  Returns:
    numpy array with one row per input text, taken from position 0 of
    ``last_hidden_state``.
  """
  # padding=True is a no-op for a single text and is required for a batch.
  encoded_input = tokenizer(sentence, padding=True, truncation=True, max_length=512, return_tensors='pt')
  encoded_input = {key: value.to(device) for key, value in encoded_input.items()}
  # Inference only: skip gradient tracking to save memory and time.
  with torch.no_grad():
    model_output = model(**encoded_input)
  # Move the result back to the CPU so it can be converted to numpy.
  embeddings = model_output.last_hidden_state[:, 0, :].cpu().numpy()
  return embeddings

In [12]:
%%markdown
## Gerando encoding para queries

## Gerando encoding para queries


In [13]:
def generate_encode(model_name,topics,corpus,tokenizer_name="microsoft/MiniLM-L12-H384-uncased",device=device,
                    checkpoint_dir=os.path.join("fine_tuned/checkpoints_seiti_loss", "checkpoint_epoch_2_loss_0.2808033227920532")):
  """Encode every query and every document with the fine-tuned encoders.

  Args:
    model_name: only used to name the pickle file (its last ``/`` component).
    topics: sequence of ``(query id, query text)`` pairs.
    corpus: dict mapping document id -> document text.
    tokenizer_name: Hugging Face identifier of the tokenizer shared by both
      encoders.
    device: torch device both encoders run on.
    checkpoint_dir: folder holding the ``_passages`` and ``_topics``
      sub-folders of the fine-tuned checkpoint.

  Returns:
    Tuple ``(query ids, query matrix, document ids, document matrix)``. Each
    matrix has one row per id, in the same order as the id list.

  Side effects:
    Writes ``./encodeds/<model>_data_index_checkpoints_seiti_loss.pickle``
    containing the per-id embeddings of queries and documents.

  Raises:
    ValueError: if ``topics`` or ``corpus`` is empty.
  """
  if len(topics) == 0 or len(corpus) == 0:
    raise ValueError("generate_encode needs at least one topic and one corpus document")

  # Create the output folder up front, so a missing directory is caught before
  # the encoding pass instead of at the final pickle dump.
  os.makedirs("./encodeds", exist_ok=True)

  # Load the tokenizer and the two encoders (passages and queries).
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
  passages_model = AutoModel.from_pretrained(os.path.join(checkpoint_dir, "_passages")).to(device)
  queries_model = AutoModel.from_pretrained(os.path.join(checkpoint_dir, "_topics")).to(device)

  queries_encoded = {}
  queries_encoded_ids = []
  queries_encoded_list = []
  for query_id, query_text in tqdm(topics):
    encoded_text = generate_embeddings(queries_model, tokenizer, [query_text], device)
    queries_encoded_list.append(encoded_text)
    queries_encoded_ids.append(query_id)
    queries_encoded[query_id] = encoded_text

  corpus_encoded = {}
  corpus_encoded_ids = []
  corpus_encoded_list = []
  for doc_id in tqdm(corpus):
    encoded_text = generate_embeddings(passages_model, tokenizer, [corpus[doc_id]], device)
    corpus_encoded_list.append(encoded_text)
    corpus_encoded_ids.append(doc_id)
    corpus_encoded[doc_id] = encoded_text

  save_model_name = model_name.split("/")[-1]
  with open(f"./encodeds/{save_model_name}_data_index_checkpoints_seiti_loss.pickle", 'wb') as outputFile:
    pickle.dump({'queries_encoded': queries_encoded,
                 'corpus_encoded': corpus_encoded}, outputFile, pickle.HIGHEST_PROTOCOL)

  # Each per-text embedding is a 1-row matrix; stack them along axis 0.
  return (queries_encoded_ids,
          np.concatenate(queries_encoded_list, axis=0),
          corpus_encoded_ids,
          np.concatenate(corpus_encoded_list, axis=0)
          )

In [14]:
MODEL_NAME = 'microsoft/MiniLM-L12-H384-uncased'

# Encode all queries and all documents; each matrix has one row per id.
(
    queries_encoded_ids,
    queries_encoded_list,
    corpus_encoded_ids,
    corpus_encoded_list,
) = generate_encode(MODEL_NAME, topics, corpus, device=device)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/171332 [00:00<?, ?it/s]

In [15]:
%%markdown
## Create a search index

## Create a search index


In [92]:
# faiss works on C-contiguous float32 matrices; coerce once up front.
corpus_matrix = np.ascontiguousarray(corpus_encoded_list, dtype=np.float32)
query_matrix = np.ascontiguousarray(queries_encoded_list, dtype=np.float32)

# Exact inner-product index. The dimension is taken from the embeddings so the
# cell keeps working if the encoder's hidden size changes.
corpus_index = faiss.IndexFlatIP(corpus_matrix.shape[1])
corpus_index.add(corpus_matrix)

# Retrieve the 1000 best documents per query (the evaluation reports recall at 1000).
score_search_results, index_search_results = corpus_index.search(query_matrix, 1000)

In [93]:
def trec_run(save_path, 
             score_search_results,index_search_results, 
             queries_encoded_ids, queries_encoded_list,
             corpus_encoded_ids):
  """Write search results to ``save_path`` as a tab-separated TREC run file.

  Args:
    save_path: output file path (overwritten).
    score_search_results: 2-D numpy array, one row of scores per query.
    index_search_results: 2-D numpy array of the same shape; each entry is
      the position of the hit in ``corpus_encoded_ids``. Negative entries
      (used by faiss to pad a row that has fewer hits than requested) are
      skipped.
    queries_encoded_ids: query id for each row of the result arrays.
    queries_encoded_list: unused; kept so existing callers keep working.
    corpus_encoded_ids: document id for each position in the index.

  Each output line is
  ``<query id>\\tQ0\\t<doc id>\\t<rank>\\t<score>\\tcheckpoints_seiti_loss``.
  Within a query, documents are written by descending score, each document at
  most once, with ranks numbered consecutively from 1.
  """
  # Convert once; an array allows indexing with numpy integer positions.
  corpus_encoded_ids = np.asarray(corpus_encoded_ids)

  with open(save_path, 'w') as outputFile:
    for query_index, query_scores in enumerate(score_search_results):
      document_descending_order = np.argsort(query_scores)[::-1]
      # Reorder scores and hit positions once per query, not once per line.
      ordered_scores = query_scores[document_descending_order]
      ordered_document_indexes = index_search_results[query_index][document_descending_order]
      query_id = queries_encoded_ids[query_index]

      included_docs = set()
      rank = 0
      for document_index, score in zip(ordered_document_indexes, ordered_scores):
        # A negative position is padding, not a document; indexing with it
        # would silently select an id from the end of the corpus.
        if document_index < 0 or document_index in included_docs:
          continue
        included_docs.add(document_index)
        rank += 1
        outputFile.write(f"{query_id}\tQ0\t{corpus_encoded_ids[document_index]}\t{rank}\t{score}\tcheckpoints_seiti_loss\n")

In [94]:
# Dump the retrieval results as a run file for the evaluation step.
trec_run(
    save_path='runs/checkpoints_seiti_loss.tsv',
    score_search_results=score_search_results,
    index_search_results=index_search_results,
    queries_encoded_ids=queries_encoded_ids,
    queries_encoded_list=queries_encoded_list,
    corpus_encoded_ids=corpus_encoded_ids,
)

In [95]:
# Evaluate the run file against the converted qrels with pyserini's trec_eval
# wrapper, requesting recall.1000, map, ndcg_cut.10 and recip_rank.
!python -m pyserini.eval.trec_eval -c -mrecall.1000 -mmap -mndcg_cut.10 -mrecip_rank.100 data/trec-covid/qrels/test_corrigido.tsv runs/checkpoints_seiti_loss.tsv

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-mrecall.1000', '-mmap', '-mndcg_cut.10', '-mrecip_rank.100', 'data/trec-covid/qrels/test_corrigido.tsv', 'runs/checkpoints_seiti_loss.tsv']
Results:
map                   	all	0.0007
recip_rank            	all	0.0288
recall_1000           	all	0.0221
ndcg_cut_10           	all	0.0084
