In [94]:
!pip install pyserini -q
!pip install evaluate -q
!pip install faiss-gpu -q
!pip install trectools -q
!pip install jsonlines -q
!pip install datasets -q
!pip install sentence-transformers -q

In [95]:
import os
# Work from the course folder on Google Drive so that relative paths such as
# ./data/ resolve there.
# NOTE(review): no Drive mount call is visible in this notebook; presumably the
# drive is mounted before this cell runs.
os.chdir('/content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula9_10')

In [96]:
from pathlib import Path
import json
import jsonlines

from collections import OrderedDict
import pickle

import numpy as np
import pandas as pd

from tqdm import tqdm
np.random.seed(42)

In [97]:
from pyserini.search.lucene import LuceneSearcher
from evaluate import load
from collections import defaultdict

from datasets import load_dataset

In [98]:
from time import time
from sentence_transformers import CrossEncoder

In [99]:
import torch
# Use the GPU when PyTorch can see one, otherwise the CPU; the bare name on the
# last line displays the choice as the cell output.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [100]:

# Root folder (relative to the working directory) for datasets, indexes and runs.
path_data = "./data/"

# TREC-COVID data (BEIR distribution)
url_trec_covid = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip'

In [None]:
#!wget https://huggingface.co/datasets/BeIR/trec-covid-qrels/raw/main/test.tsv

--2023-05-11 00:48:32--  https://huggingface.co/datasets/BeIR/trec-covid-qrels/raw/main/test.tsv
Resolving huggingface.co (huggingface.co)... 13.227.219.41, 13.227.219.105, 13.227.219.125, ...
Connecting to huggingface.co (huggingface.co)|13.227.219.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 980831 (958K) [text/plain]
Saving to: ‘test.tsv.3’


2023-05-11 00:48:32 (2.54 MB/s) - ‘test.tsv.3’ saved [980831/980831]



In [101]:

# Download and unpack the TREC-COVID archive unless the zip is already present.
# NOTE(review): the shell lines hard-code ./data/ instead of using path_data, so
# the check and the download only agree while path_data is "./data/".
if not Path(f'{path_data}trec-covid.zip').is_file():
  !wget {url_trec_covid} -P ./data/ # type: ignore
  !unzip -o data/trec-covid.zip -d ./data/ # type: ignore

# Load the doc2query-expanded corpus into `expanded_corpus_df` with columns
# `id` and `text`. The CSV under path_data is a cache of the JSONL export and is
# rebuilt from the JSONL whenever it is missing.
expanded_corpus_csv = Path(f'{path_data}trec-covid/expanded_corpus/expanded_corpus.csv')

if not expanded_corpus_csv.is_file():
  expanded_corpus_ids = []
  expanded_corpus_contents = []
  # Stream the JSONL one record at a time instead of holding every raw line in memory.
  with open('/content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula5/TREC_COVID_2020/expanded_corpus/expanded_corpus.jsonl', 'r') as json_file:
    for json_str in tqdm(json_file):
      result = json.loads(json_str)
      expanded_corpus_ids.append(result['id'])
      expanded_corpus_contents.append(result['contents'])

  expanded_corpus_df = pd.DataFrame({'id': expanded_corpus_ids,
                                     'contents': expanded_corpus_contents})
  # The cache directory is not guaranteed to exist on a fresh checkout.
  expanded_corpus_csv.parent.mkdir(parents=True, exist_ok=True)
  # index=False keeps the positional index out of the file (no 'Unnamed: 0' on reload).
  expanded_corpus_df.to_csv(expanded_corpus_csv, index=False)
else:
  expanded_corpus_df = pd.read_csv(expanded_corpus_csv)

# Give both branches the same schema: later cells look documents up via the
# 'text' column. errors='ignore' accepts caches written with or without the
# index column.
expanded_corpus_df = expanded_corpus_df.rename(columns={'contents': 'text'})
expanded_corpus_df = expanded_corpus_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Convert the qrels shipped in trec-covid.zip to the expected layout:
# the header row is dropped and a literal "0" column is inserted after the query id.
with open(f'{path_data}trec-covid/qrels/test.tsv', 'r') as fin:
  linhas_qrels = fin.read().splitlines(True)

with open(f'{path_data}trec-covid/qrels/test_corrigido.tsv', 'w') as fout:
  fout.writelines(
      f'{campos[0]}\t0\t{campos[1]}\t{campos[2]}\n'
      for campos in map(str.split, linhas_qrels[1:])
  )

def carrega_corpus_trec_covid():
  """Read the TREC-COVID corpus from ``{path_data}trec-covid/corpus.jsonl``.

  Each line of the file is parsed as a JSON object with the keys ``_id``,
  ``title`` and ``text``. A progress message is printed every 10000 lines.

  Returns:
    A pair ``(pairs, corpus_df)``. ``pairs`` is a list of
    ``(doc_id, "<title> <text>")`` tuples in file order; ``corpus_df`` is a
    DataFrame with the columns ``id``, ``title`` and ``text`` for the same
    documents.
  """
  pares = []
  colunas = {"id": [], "title": [], "text": []}

  with open(f'{path_data}trec-covid/corpus.jsonl') as arquivo:
    for n, linha in enumerate(arquivo):
      registro = json.loads(linha)
      doc_id, titulo, corpo = registro['_id'], registro['title'], registro['text']

      colunas["id"].append(doc_id)
      colunas["title"].append(titulo)
      colunas["text"].append(corpo)
      pares.append((doc_id, f"{titulo} {corpo}"))

      if n % 10000 == 0:
        print(f'Processado {n} documentos')

  return pares, pd.DataFrame(colunas)

def carrega_queries_trec_covid():
  """Read the TREC-COVID queries and mirror them to a TSV topics file.

  Reads ``{path_data}trec-covid/queries.jsonl`` (one JSON object per line with
  the keys ``_id`` and ``text``) and writes ``{path_data}trec-covid/queries.tsv``
  with one ``<_id>\\t<text>`` line per query.

  Returns:
    A list of ``{'id': ..., 'texto': ...}`` dicts, in file order.
  """
  origem = f'{path_data}trec-covid/queries.jsonl'
  destino = f'{path_data}trec-covid/queries.tsv'

  consultas = []
  with open(destino, 'w') as f_out, open(origem) as f_in:
    for registro in map(json.loads, f_in):
      # Only a light renaming: _id -> id and text -> texto.
      consultas.append({'id': registro['_id'], 'texto': registro['text']})
      f_out.write(f"{registro['_id']}\t{registro['text']}\n")
  return consultas


In [102]:
%%time
# Load queries and corpus into memory, then dump the (id, "title text") pairs
# as a single JSON file under corpus_original/.
queries_trec_covid = carrega_queries_trec_covid()
corpus_trec_covid, df_corpus_trec_covid = carrega_corpus_trec_covid()

corpus_original_dir = f'{path_data}trec-covid/corpus_original'
os.makedirs(corpus_original_dir, exist_ok=True)
with open(f'{corpus_original_dir}/corpus_original.json', 'w') as f:
    json.dump(corpus_trec_covid, f)

Processado 0 documentos
Processado 10000 documentos
Processado 20000 documentos
Processado 30000 documentos
Processado 40000 documentos
Processado 50000 documentos
Processado 60000 documentos
Processado 70000 documentos
Processado 80000 documentos
Processado 90000 documentos
Processado 100000 documentos
Processado 110000 documentos
Processado 120000 documentos
Processado 130000 documentos
Processado 140000 documentos
Processado 150000 documentos
Processado 160000 documentos
Processado 170000 documentos
CPU times: user 3.25 s, sys: 622 ms, total: 3.87 s
Wall time: 5.36 s


In [103]:
# Relevance judgements as a dict of column lists (query, q0, docid, rel), the
# shape passed as `references` to the trec_eval metric in eval_ndcg10.
# Prefer the file at the original location; when it is absent, fall back to the
# copy extracted from trec-covid.zip, so the cell also runs on a fresh download.
qrel_path = Path(f"{path_data}trec-covid/test.tsv")
if not qrel_path.is_file():
  qrel_path = Path(f"{path_data}trec-covid/qrels/test.tsv")

qrel = (
    pd.read_csv(qrel_path,
                sep="\t",
                header=None,
                skiprows=1,  # the first row is a header
                names=["query", "docid", "rel"])
    .assign(q0="q0")
    [["query", "q0", "docid", "rel"]]
    .to_dict(orient="list")
)

In [104]:
# Parallel lists of query ids and query texts (reused by the retrieval cells),
# plus the same data as a two-column DataFrame.
# Comprehensions are used so that no loop variable named `id` is left behind at
# notebook scope shadowing the builtin.
query_ids = [item["id"] for item in queries_trec_covid]
query_texts = [item["texto"] for item in queries_trec_covid]

df_queries_trec_covid = pd.DataFrame({
    "query_ids": query_ids,
    "query_texts": query_texts,
})

In [None]:
def eval_ndcg10(run):
  """Score a run against the notebook-level `qrel` and return its NDCG@10.

  Args:
    run: the run to evaluate; it is passed unchanged as the single prediction
      to the `trec_eval` metric.

  Returns:
    The value stored under 'NDCG@10' in the metric result.
  """
  metric = load("trec_eval")
  return metric.compute(predictions=[run], references=[qrel])['NDCG@10']

def run_bm25(searcher,test_ids, test_corpus, top_k = 1000):
  """Retrieve up to `top_k` hits per query and collect them as a run.

  Args:
    searcher: object with a `search(query, k=...)` method returning hits that
      expose `.raw` (a JSON string containing an `_id` key) and `.score`.
    test_ids: iterable of query ids, aligned with `test_corpus`.
    test_corpus: iterable of query texts.
    top_k: maximum number of hits requested per query.

  Returns:
    A defaultdict of equally long lists under the keys `query`, `q0`, `docid`,
    `rank`, `score` and `system`.
  """
  run = defaultdict(list)

  for query_id, query in tqdm(zip(test_ids, test_corpus)):
    bm25_hits = searcher.search(query, k = top_k)
    # A query can return fewer than top_k hits; size every column by the number
    # of hits actually returned so the columns never get out of step.
    n_hits = len(bm25_hits)
    run["query"] += [query_id] * n_hits
    run["q0"] += ["q0"] * n_hits
    run["docid"] += [json.loads(hit.raw)['_id'] for hit in bm25_hits]
    run["rank"] += list(range(1, n_hits + 1))
    # Keep the retrieval score: a constant score ties every document and leaves
    # the order up to whatever tie-breaking the evaluator applies.
    run["score"] += [hit.score for hit in bm25_hits]
    run["system"] += ['bm25'] * n_hits

  return run

In [None]:
# BM25 baseline on the prebuilt TREC-COVID index; the last line displays NDCG@10.
searcher = LuceneSearcher.from_prebuilt_index("beir-v1.0.0-trec-covid.flat")

_run_bm25 = run_bm25(searcher, test_ids=query_ids, test_corpus=query_texts)
baseline_bm25_ndcg = eval_ndcg10(run=_run_bm25)
baseline_bm25_ndcg

Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.tar.gz...


lucene-index.beir-v1.0.0-trec-covid.flat.20221116.505594.tar.gz: 216MB [00:08, 26.6MB/s]                           
50it [00:09,  5.46it/s]


Downloading builder script:   0%|          | 0.00/5.51k [00:00<?, ?B/s]

0.5946917010118077

In [None]:
%%markdown
# Doc2query

# Doc2query


In [None]:
# Build a Lucene index over the doc2query-expanded JSON collection.
# The index directory name ("doc2query_expasion_index") is misspelled, but later
# cells open the index by this exact path, so it must stay in sync with them.
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula5/TREC_COVID_2020/expanded_corpus/ \
  --index data/indexes/doc2query_expasion_index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

In [None]:
# Retrieve 1000 BM25 hits per query from the doc2query index and save the run.
# NOTE: path_data already ends with "/", so `{path_data}/runs` expands to
# "./data//runs"; the double slash still resolves to the ./data/runs directory.
!python -m pyserini.search.lucene \
  --index data/indexes/doc2query_expasion_index \
  --topics {path_data}trec-covid/queries.tsv \
  --output {path_data}/runs/run.trec-covid.bm25tuned-doc2query_expansion.txt \
  --hits 1000 \
  --bm25

Running ./data/trec-covid/queries.tsv topics, saving to ./data//runs/run.trec-covid.bm25tuned-doc2query_expansion.txt...
100% 50/50 [00:05<00:00,  8.38it/s]


In [None]:
# Load the whitespace-separated run file as a dict of column lists, the shape
# passed to eval_ndcg10.
# The separator is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
run = (
    pd.read_csv(f"{path_data}runs/run.trec-covid.bm25tuned-doc2query_expansion.txt",
                sep=r"\s+",
                names=["query", "q0", "docid", "rank", "score", "system"])
    .to_dict(orient="list")
)

In [None]:
# NDCG@10 of the BM25 run over the doc2query-expanded index.
doc2queryndcg = eval_ndcg10(run)
doc2queryndcg

0.648239216238283

In [None]:
 # Open the locally built doc2query-expanded index for interactive queries.
 searcher = LuceneSearcher('data/indexes/doc2query_expasion_index')

In [None]:
%%time
# Sanity-check query against the expanded index (timed by the cell magic above).
hits = searcher.search('How much does the COVID-19 test cost')

CPU times: user 34.4 ms, sys: 2.68 ms, total: 37.1 ms
Wall time: 54.9 ms


In [None]:
# Print the first 10 hits (or fewer when the search returned less than 10, which
# a fixed range(0, 10) index loop would turn into an IndexError).
for rank, hit in zip(range(1, 11), hits):
    print(f'{rank:2} {hit.docid:15} {hit.score:.5f}')

 1 yanexvt1        9.89990
 2 3rblzyry        9.23580
 3 3o6aupqz        9.14730
 4 8809ay2a        9.14730
 5 komiury1        9.10830
 6 pcyfx7t4        8.98350
 7 tib0g9ok        8.70680
 8 ngsstnpr        8.48470
 9 7dbbph2i        8.24620
10 9dttamtf        8.22900


In [None]:
# Show the expanded text of the top-ranked document from the query above.
expanded_corpus_df.loc[expanded_corpus_df['id'] == 'yanexvt1', 'text'].values

array(['how much does covid cost. Treatment of coronavirus disease 2019 in Shandong, China: a cost and affordability analysis. BACKGROUND: Coronavirus disease 2019 (COVID-19) is now a global public threat. Given the pandemic of COVID-19, the economic impact of COVID-19 is essential to add value to the policy-making process. We retrospectively conducted a cost and affordability analysis to determine the medical costs of COVID-19 patients in China, and also assess the factors affecting their costs. METHODS: This analysis was retrospectively conducted in Shandong Provincial Chest Hospital between 24 January and 16 March 2020. The total direct medical expenditures were analyzed by cost factors. We also assessed affordability by comparing the simulated out-of-pocket expenditure of COVID-19 cases relative to the per capita disposable income. Differences between groups were tested by student t test and Mann-Whitney test when appropriate. A multiple logistic regression model was built to deter

In [None]:
%%markdown
## Reranking with Cross Encoder

## Reranking with Cross Encoder


In [None]:
# Cross-encoder used for second-stage reranking. `model_name` is the short name
# used later when naming run files.
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_name = "ms-marco-MiniLM-L-6-v2"
model = CrossEncoder(model_id, max_length=512, num_labels=1)
# No explicit device move is done here.
# NOTE(review): presumably CrossEncoder selects the device itself; confirm.

In [119]:
def eval_ndcg10(run):
  """Return the NDCG@10 of `run` judged against the notebook-level `qrel`.

  NOTE(review): this repeats the definition given in the BM25 section of the
  notebook; keeping a single copy would avoid the two drifting apart.
  """
  scores = load("trec_eval").compute(predictions=[run], references=[qrel])
  ndcg_at_10 = scores['NDCG@10']
  return ndcg_at_10

def run_pipeline(passages,searcher,test_ids, test_corpus, top_k = 1000,
                 rerank_top_n = 100, batch_size = 100):
    """Two-stage retrieval: first-stage search, then cross-encoder reranking.

    Args:
      passages: DataFrame with an `id` column and a `text` column; the text is
        what gets paired with the query for reranking.
      searcher: object with `search(query, k=...)` returning hits that expose
        `.docid`.
      test_ids: iterable of query ids, aligned with `test_corpus`.
      test_corpus: iterable of query texts.
      top_k: number of first-stage candidates requested per query.
      rerank_top_n: number of reranked documents kept per query.
      batch_size: batch size handed to `reranking_cross_encoder`.

    Returns:
      `(docs_reranking, timing)`. `docs_reranking` is the concatenation, query
      by query, of the `(query_id, passage_id, score)` tuples returned by
      `reranking_cross_encoder`. `timing` is
      `{"cross_encoder": {"reranking": seconds}}`, summed over all queries.
    """
    # Build the id -> text lookup once. Filtering the DataFrame for every hit
    # scans the whole corpus each time. drop_duplicates keeps the first row per
    # id, which is the row a filter followed by [0] would pick.
    text_by_id = passages.drop_duplicates("id").set_index("id")["text"].to_dict()

    docs_reranking = []
    # Initialised up front so the function also returns cleanly with no queries.
    timing = {"cross_encoder": {"reranking": 0}}

    for query_id, query in tqdm(zip(test_ids, test_corpus)):
      bm25_hits = searcher.search(query, k = top_k)

      docs = []
      for hit in bm25_hits:
        passage_text = text_by_id.get(hit.docid)
        # Skip candidates without usable text (unknown id or a missing value):
        # the cross-encoder needs a string to pair with the query.
        if not isinstance(passage_text, str):
          continue
        docs.append({
            "query_id": query_id,
            "passage_id": hit.docid,
            "query_text": query,
            "passage_text": passage_text,
        })

      # Second stage
      reranked, query_timing = reranking_cross_encoder(docs, rerank_top_n, batch_size)
      docs_reranking.extend(reranked)
      # Accumulate instead of keeping only the last query's time.
      timing["cross_encoder"]["reranking"] += query_timing["cross_encoder"]["reranking"]

    return docs_reranking, timing


def reranking_cross_encoder(docs, max=1000, batch_size=500):
    """Score query/passage pairs with the notebook-level `model` and rank them.

    Args:
      docs: list of dicts with the keys `query_id`, `passage_id`, `query_text`
        and `passage_text`.
      max: number of top-scoring entries to return.
      batch_size: number of pairs sent to `model.predict` per call.

    Returns:
      `(ranked, data)`. `ranked` holds the first `max`
      `(query_id, passage_id, score)` tuples ordered by descending score;
      `data` is `{"cross_encoder": {"reranking": seconds}}`, where the time is
      rounded to 3 decimals and covers scoring plus sorting.
    """
    scored = []

    started_at = time()
    for offset in tqdm(range(0, len(docs), batch_size), leave=False):
        # Slicing clamps at the end of the list, so the last chunk may be short.
        chunk = docs[offset:offset + batch_size]

        pairs = [(doc['query_text'], doc['passage_text']) for doc in chunk]
        chunk_scores = model.predict(pairs)

        scored.extend(
            (doc["query_id"], doc["passage_id"], score)
            for score, doc in zip(chunk_scores, chunk)
        )

    # Stable sort: entries with equal scores keep their input order.
    scored.sort(key=lambda row: row[2], reverse=True)
    elapsed_time = round(time() - started_at, 3)

    data = {"cross_encoder": {"reranking": 0 + elapsed_time}}

    return scored[:max], data

In [None]:
# Rerank BM25 candidates from the prebuilt (non-expanded) index, taking the
# passage text from the original corpus DataFrame.
searcher = LuceneSearcher.from_prebuilt_index("beir-v1.0.0-trec-covid.flat")
docs_reranking, data = run_pipeline(df_corpus_trec_covid,searcher,query_ids, query_texts)

In [None]:
# Turn the reranked (query, docid, score) tuples into a run table.
df_docs_reranking = (
    pd.DataFrame(docs_reranking, columns=["query", "docid", "score"])
    .assign(q0="Q0", system="CrossEncoder")
)
# Rank = 1-based position of each row within its query, in the order produced
# by the reranker. Counting per query (instead of tiling a fixed 1..100 block
# 50 times) keeps working when the number of queries or of documents per query
# differs.
df_docs_reranking["rank"] = df_docs_reranking.groupby("query", sort=False).cumcount() + 1
df_docs_reranking = df_docs_reranking[["query","q0","docid","rank","score","system"]]
# NOTE: written with the pandas defaults (comma-separated, header and index).
df_docs_reranking.to_csv(f"{path_data}runs/run.trec-covid.crossEncoder.txt")

In [None]:
# NDCG@10 of the cross-encoder reranking.
# NOTE(review): the score is stored in `doc2queryndcg`, the same name an earlier
# cell uses for the doc2query BM25 result, so that earlier value is overwritten
# here. A dedicated name would be clearer.
run = df_docs_reranking.to_dict(orient="list")
doc2queryndcg = eval_ndcg10(run)
doc2queryndcg

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  topX = topX.groupby("query").apply(lambda x: x.head(n_relevant_docs.loc[x.name])).reset_index(drop=True)


0.6388725678226342

In [None]:
%%markdown
# doc2query + Cross encoder

# doc2query + Cross encoder


In [None]:
# Same cross-encoder as in the previous section, reloaded so this section can
# be run on its own.
model_id = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model_name = "ms-marco-MiniLM-L-6-v2"
model = CrossEncoder(model_id, max_length=512, num_labels=1)


# First stage: BM25 over the locally built doc2query-expanded index.
searcher = LuceneSearcher('data/indexes/doc2query_expasion_index')
# Second stage: rerank using the expanded text of each candidate document.
docs_reranking, data = run_pipeline(expanded_corpus_df,searcher,query_ids, query_texts)

In [121]:
# Turn the reranked (query, docid, score) tuples into a run table.
df_docs_reranking = (
    pd.DataFrame(docs_reranking, columns=["query", "docid", "score"])
    .assign(q0="Q0", system="CrossEncoder")
)
# Rank = 1-based position of each row within its query, in the order produced
# by the reranker. Counting per query (instead of tiling a fixed 1..100 block
# 50 times) keeps working when the number of queries or of documents per query
# differs.
df_docs_reranking["rank"] = df_docs_reranking.groupby("query", sort=False).cumcount() + 1
df_docs_reranking = df_docs_reranking[["query","q0","docid","rank","score","system"]]
# NOTE: written with the pandas defaults (comma-separated, header and index).
df_docs_reranking.to_csv(f"{path_data}runs/run.trec-covid.crossEncoder_corpusExpanded.txt")

In [122]:
# NDCG@10 of doc2query retrieval followed by cross-encoder reranking.
# NOTE(review): `doc2queryndcg` is reused once more here, overwriting whatever
# score an earlier cell stored under that name.
run = df_docs_reranking.to_dict(orient="list")
doc2queryndcg = eval_ndcg10(run)
doc2queryndcg

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  topX = topX.groupby("query").apply(lambda x: x.head(n_relevant_docs.loc[x.name])).reset_index(drop=True)


0.7166953809546601

In [125]:
# Show the current GPU status (model, memory in use, utilisation).
!nvidia-smi

Thu May 11 03:30:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    32W /  70W |   3321MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [123]:
# Write the reranked results as a six-column run file:
# query id, Q0, doc id, rank, score, run tag.
# The rank restarts at 1 for every query; a single counter over the whole list
# would keep growing across queries.
rank_in_query = defaultdict(int)
with open(f"{path_data}runs/run.trec-covid.bm25.{model_name}_reranked_st_bl_expanded.trec", "w") as f:
    for query_id, passage_id, score in docs_reranking:
        rank_in_query[query_id] += 1
        f.write(f'{query_id}\tQ0\t{passage_id}\t{rank_in_query[query_id]}\t{score}\t{model_name}_reranked\n')

In [124]:
# Cross-check with pyserini's trec_eval wrapper (MAP, reciprocal rank, NDCG@10),
# using the converted qrels file and the .trec run file.
!python -m pyserini.eval.trec_eval -c -m map -m recip_rank -m ndcg_cut.10 {path_data}trec-covid/qrels/test_corrigido.tsv {path_data}runs/run.trec-covid.bm25.{model_name}_reranked_st_bl_expanded.trec

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'map', '-m', 'recip_rank', '-m', 'ndcg_cut.10', './data/trec-covid/qrels/test_corrigido.tsv', './data/runs/run.trec-covid.bm25.ms-marco-MiniLM-L-6-v2_reranked_st_bl_expanded.trec']
Results:
map                   	all	0.0898
recip_rank            	all	0.9217
ndcg_cut_10           	all	0.7167
