In [None]:
!pip install transformers -q

!pip install transformers sentencepiece -q
!pip install sacrebleu -q
!pip install huggingface_hub -q


!pip install pyserini -q
!pip install faiss-cpu==1.7.2 -q

!pip install datasets

In [4]:
import os
os.chdir('/content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula5')

import gc

In [5]:
import glob
import json

import pickle


import numpy as np

from tqdm.auto import tqdm

In [6]:
from datasets import load_metric

In [7]:
import torch
from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          T5ForConditionalGeneration,
                          T5Tokenizer
)
from transformers import EarlyStoppingCallback

from transformers import pipeline

In [8]:
from pyserini.search.lucene import LuceneSearcher

In [8]:
class MyCustomDataset(torch.utils.data.Dataset):
  def __init__(self, data_path,tokenizer):
      self.data = []
      
      self.tokenizer = tokenizer
      self.load_data(data_path)
      
  def load_data(self, data_path):
      with open(data_path, "r") as fin:
          for line in tqdm(fin):
              query, pos_doc, _ = line.strip().split("\t")
              encoding_inputs = self.tokenizer(
                  pos_doc,
                  add_special_tokens=True,
                  max_length=128,
                  truncation=True,
                  padding='max_length',
                  return_tensors='pt'
              )
              encoding_labels = self.tokenizer(
                  query,
                  add_special_tokens=True,
                  max_length=64,
                  truncation=True,
                  padding='max_length',
                  return_tensors='pt',
              )
              self.data.append((encoding_labels, encoding_inputs))
              
  def __len__(self):
      return len(self.data)

  def __getitem__(self, idx):

    encoding_labels, encoding_inputs = self.data[idx]
    return {
          'input_ids': encoding_inputs['input_ids'].flatten(),
          'labels': encoding_labels['input_ids'].flatten(),
      }

In [None]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
train_dataset = MyCustomDataset("data/train.tsv", tokenizer)
eval_dataset = MyCustomDataset("data/validation.tsv", tokenizer)

In [10]:
metric = load_metric("sacrebleu")


def postprocess_text(texts, is_label=False):
  if is_label:
    p_texts = [[text.strip()] for text in texts]
  else:
    p_texts = [text.strip() for text in texts]

  return p_texts

def compute_metrics(eval_preds):
  predictions, labels = eval_preds
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  
  decoded_preds = postprocess_text(decoded_preds)
  decoded_labels = postprocess_text(decoded_labels,True)

  
  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"eval_sacrebleu": result["score"]}
  return result

  metric = load_metric("sacrebleu")


In [11]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [12]:
# Define the seq2seq training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    logging_dir='./logs',
    evaluation_strategy='epoch',
    seed=0,
    learning_rate=5e-4,
    predict_with_generate=True,
    save_strategy="epoch",
    fp16=True
)

# Compute_metrics function as argument
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



Epoch,Training Loss,Validation Loss,Sacrebleu
1,0.2688,0.230459,16.908533
2,0.2012,0.228611,17.8957
3,0.1539,0.23605,18.65686
4,0.1267,0.250716,19.078961
5,0.0966,0.26882,19.439861
6,0.0726,0.287111,18.954199
7,0.0563,0.311799,19.291304
8,0.0426,0.325948,19.559675
9,0.0358,0.340482,19.55082
10,0.0305,0.346389,19.662743


TrainOutput(global_step=3130, training_loss=0.13409842709763745, metrics={'train_runtime': 1929.5574, 'train_samples_per_second': 51.825, 'train_steps_per_second': 1.622, 'total_flos': 1.5223947264e+16, 'train_loss': 0.13409842709763745, 'epoch': 10.0})

In [14]:
# Make sure we get the best epoch (best BLEU) - Epoch 10
model = AutoModelForSeq2SeqLM.from_pretrained("results/checkpoint-3130")

# Exports the final model and tokenizer to a folder
model.save_pretrained("results/final_model")
tokenizer.save_pretrained("results/final_model")

In [18]:
%%markdown
#TREC-COVID 2020

#TREC-COVID 2020


In [15]:
topics = []

with open("TREC_COVID_2020/queries.jsonl", "r") as fin:
  for line in fin:
    query = json.loads(line)
    topics.append((query["_id"], query["text"]))

In [7]:
os.makedirs('TREC_COVID_2020/corpus_original', exist_ok=True)

corpus = {}
with open("TREC_COVID_2020/corpus_original/corpus_original.jsonl", "w") as fout:
  with open("TREC_COVID_2020/corpus.jsonl", "r") as fin:
    for line in fin:
      doc = json.loads(line)
      corpus[doc["_id"]] = f"{doc['title']}. {doc['text']}"
      doc_dict = {
          "id": doc["_id"],
          "contents": f"{doc['title']}. {doc['text']}"
      }
      fout.write(json.dumps(doc_dict) + "\n")

In [44]:
def trec_run(save_path, topics, top_k, searcher):
  with open(save_path, 'w') as fout:
    for id, query in tqdm(topics):
      hits = searcher.search(query, top_k)
      for idx, hit in enumerate(hits):
        fout.write(f"{id}\tQ0\t{hit.docid}\t{idx+1}\t{hit.score}\tBM25\n")
     

In [None]:
# Creates the BM25 index
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input TREC_COVID_2020/corpus_original \
  --index TREC_COVID_2020/original_index \
  --language en\
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw 
     

In [32]:
searcher = LuceneSearcher('TREC_COVID_2020/original_index')

trec_run('bm25/run-original-bm25.tsv', topics, 1000, searcher)

!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -mmap -l 2 TREC_COVID_2020/qrels.tsv bm25/run-original-bm25.tsv

Running queries:   0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
%%markdown
# Start Doc2query

# Start Doc2query


In [36]:
!head -n 3 TREC_COVID_2020/expanded_corpus/expanded_corpus.jsonl

{"id": "kv1yqwgb", "contents": "what is the predictive value of a mechanical thrombectomy. Predictors of Outcome and Hemorrhage in Patients Undergoing Endovascular Therapy with Solitaire Stent for Acute Ischemic Stroke.. BACKGROUND Endovascular mechanical thrombectomy is emerging as a promising therapeutic approach for acute ischemic stroke and show some advantages. However, the data of predicting clinical outcome after thrombectomy with Solitaire retriever were limited. We attempt to identify prognostic factors of clinical outcome in patients with acute ischemic stroke undergoing thrombectomy with Solitaire retriever. METHODS We conducted a retrospective analysis of consecutive acute ischemic strokes cases treated between December 2010 and December2013 where the Solitaire stent retriever was used for acute ischemic stroke. We assessed the effect of selected demographic characteristics, clinical factors on poor outcome at 3 months (modified Rankin score 3-6), mortality at 3 months, and

In [None]:
# Creates the new BM25 expanded index
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input TREC_COVID_2020/expanded_corpus \
  --index TREC_COVID_2020/doc2query_index \
  --language en\
  --generator DefaultLuceneDocumentGenerator \
  --threads 1 \
  --storePositions --storeDocvectors --storeRaw 

In [16]:
searcher = LuceneSearcher('TREC_COVID_2020/doc2query_index')

trec_run('bm25/run-doc2query-bm25.tsv', topics, 1000, searcher)

!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -mmap -l 2 TREC_COVID_2020/qrels.tsv bm25/run-doc2query-bm25.tsv

Running queries:   0%|          | 0/50 [00:00<?, ?it/s]

In [20]:
%%markdown
#make comparisons

#make comparisons


In [22]:
%%markdown
## Original index -- no expand

## Original index -- no expand


In [24]:
searcher = LuceneSearcher('TREC_COVID_2020/original_index')

trec_run('bm25/run-original-bm25.tsv', topics, 1000, searcher)
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -mmap -l 2 TREC_COVID_2020/qrels.tsv bm25/run-original-bm25.tsv

Running queries:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '-mmap', '-l', '2', 'TREC_COVID_2020/qrels.tsv', 'bm25/run-original-bm25.tsv']
Results:
map                   	all	0.1606
ndcg_cut_10           	all	0.5947


In [54]:
%%markdown
## doc2query index -- doc2query expand

## doc2query index -- doc2query expand


In [28]:
searcher = LuceneSearcher('TREC_COVID_2020/doc2query_index')

trec_run('bm25/run-doc2query-bm25.tsv', topics, 1000, searcher)
!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -mmap -l 2 TREC_COVID_2020/qrels.tsv bm25/run-doc2query-bm25.tsv

Running queries:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '-mmap', '-l', '2', 'TREC_COVID_2020/qrels.tsv', 'bm25/run-doc2query-bm25.tsv']
Results:
map                   	all	0.1800
ndcg_cut_10           	all	0.6482


In [32]:
%%markdown
## beir-v1.0.0-trec-covid.flat -- no expand

## beir-v1.0.0-trec-covid.flat -- no expand


In [33]:
searcher = LuceneSearcher.from_prebuilt_index('beir-v1.0.0-trec-covid.flat')

trec_run('bm25/run-original-bm25_from_prebuilt.tsv', topics, 1000, searcher)

!python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -mmap -l 2 TREC_COVID_2020/qrels.tsv bm25/run-original-bm25_from_prebuilt.tsv

Running queries:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/root/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'ndcg_cut.10', '-mmap', '-l', '2', 'TREC_COVID_2020/qrels.tsv', 'bm25/run-original-bm25_from_prebuilt.tsv']
Results:
map                   	all	0.1606
ndcg_cut_10           	all	0.5947
