<a href="https://colab.research.google.com/github/juliatessler/1s2023-unicamp-dl-for-search-systems/blob/main/9-inpars/9_inpars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 8- InPars
by Júlia Tessler

In [None]:
!pip install transformers -q
!pip install datasets -q
!pip install langchain -q
!pip install huggingface_hub -q
!pip install accelerate -q
!pip install openai -q
!pip install ftfy -q
!pip install pyserini -q
!pip install faiss-cpu -q
!pip install rank-eval -q
# !pip install pygaggle -q

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed May  3 20:22:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
from google.colab import drive

drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


In [None]:
# get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token

from getpass import getpass

HUGGINGFACEHUB_API_TOKEN = getpass()

··········


In [None]:
OPENAI_API_KEY = getpass()

··········


In [None]:
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
workdir = '/content/gdrive/MyDrive/Unicamp/DL_applied_to_IR/Notebooks'
my_personal_seed = 6

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
import json
import time
import ftfy

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BatchEncoding,
    AdamW,
    get_linear_schedule_with_warmup
)

from datasets import load_dataset
from langchain import (
    HuggingFacePipeline,
    HuggingFaceHub,
    PromptTemplate, 
    LLMChain
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from collections import defaultdict
from pyserini.search.lucene import LuceneSearcher
from sklearn.model_selection import train_test_split
from statistics import mean
# from evaluate import load

## Dataset generation

In [None]:
trec_covid_queries = load_dataset("BeIR/trec-covid", 'queries')
trec_covid_corpus = load_dataset("BeIR/trec-covid", 'corpus')

Downloading builder script:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading and preparing dataset trec-covid/queries to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/4.70k [00:00<?, ?B/s]

Generating queries split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/queries/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset trec-covid/corpus to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599...


Downloading data:   0%|          | 0.00/73.5M [00:00<?, ?B/s]

Generating corpus split: 0 examples [00:00, ? examples/s]

Dataset trec-covid downloaded and prepared to /root/.cache/huggingface/datasets/BeIR___trec-covid/corpus/0.0.0/093f1fe2ffa7a9c72fa48239c8f279b51d6b171abd77737c7fd1406125307599. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
trec_covid_corpus

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 171332
    })
})

In [None]:
trec_covid_corpus['corpus'][:2]

{'_id': ['ug7v899j', '02tnwd4m'],
 'title': ['Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia',
  'Nitric oxide: a pro-inflammatory mediator in lung disease?'],
 'text': ['OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More 

In [None]:
np.random.seed(my_personal_seed)

queries_ids = np.random.randint(len(trec_covid_corpus['corpus']), 
                                size = 1000)
queries_ids.shape

(1000,)

In [None]:
test_queries_ids = np.random.randint(len(trec_covid_corpus['corpus']), 
                                size = 5)
test_queries_ids.shape

(5,)

In [None]:
# Using tutorial 
# https://python.langchain.com/en/latest/modules/models/llms/integrations/huggingface_hub.html
# flan_model = "google/flan-t5-xl"
# llama_model = 'decapoda-research/llama-7b-hf'

# llm = HuggingFacePipeline.from_model_id(model_id = llama_model, 
#                                         model_kwargs={"temperature":0, "max_length":64},
#                                         task="text-generation",
#                                         device = 0)
# llm = HuggingFaceHub(repo_id=flan_model,
#                          model_kwargs={'temperature':1e-10,
#                                        "max_length": 64})

In [None]:
# Using tutorial
# https://python.langchain.com/en/latest/modules/models/llms/getting_started.html

llm = ChatOpenAI(model_name = 'gpt-3.5-turbo',
                 model_kwargs={'temperature':1e-10})

In [None]:
template = """Given this document, please write a good query for it.

Document: {title}
{text}"""

prompt = PromptTemplate(template = template, 
                        input_variables = ["title", "text"])
llm_chain = LLMChain(prompt = prompt, llm = llm)

title = trec_covid_corpus['corpus']['title'][0]
text = trec_covid_corpus['corpus']['text'][0]

print(llm_chain.run({'title': title, 'text': text}))

What are the epidemiology and clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital in Jeddah, Saudi Arabia?


In [None]:
# To make it easier to organize things, I created a Pandas DataFrame to fill the data
df = pd.DataFrame()
pos_doc_ids = []

for idx in test_queries_ids:
  pos_doc_ids.append(trec_covid_corpus['corpus']['_id'][idx])

df['positive_doc_id'] = pos_doc_ids
df.head()

Unnamed: 0,positive_doc_id
0,td0776wj
1,wlk4mooc
2,zolyze84
3,63hs1loe
4,ytejzvgg


In [None]:
# Inspired by Gustavo Guedes 
# https://colab.research.google.com/drive/1QE6xVgoZiRzksRdPRmKEZLfTNXZv0wNG#scrollTo=mBMmmB4Osnqw
MAX_REQUEST_PER_MINUTE = 50

def generate_queries(llm_chain, samples, save_file_path):
  request_count = 0
  df = pd.DataFrame()
  generated_queries = []
  pos_doc_ids = []

  for sample in tqdm(samples):
    title = ftfy.fix_text(trec_covid_corpus['corpus']['title'][sample])
    text = ftfy.fix_text(trec_covid_corpus['corpus']['text'][sample])

    generated_query = llm_chain.run({'title': title, 'text': text})
    generated_queries.append(generated_query)
    pos_doc_ids.append(trec_covid_corpus['corpus']['_id'][sample])

    request_count += 1

    if request_count == MAX_REQUEST_PER_MINUTE:
      print(f"{request_count} requests. Sleep")
      time.sleep(5)
      request_count = 0

  df['query'] = generated_queries
  df['positive_doc_id'] = pos_doc_ids

  df.to_csv(save_file_path)
  print(f'Saved to {save_file_path}')
  return df

In [None]:
test_generated_queries = generate_queries(llm_chain, test_queries_ids, f'{workdir}/test_queries.csv')

  0%|          | 0/5 [00:00<?, ?it/s]

Saved to /content/gdrive/MyDrive/Unicamp/DL_applied_to_IR/Notebooks/test_queries.csv


In [None]:
test_generated_queries

Unnamed: 0,query,positive_doc_id
0,What is the impact of bioinformatics and 'omic...,td0776wj
1,Query: What is neonatal necrotizing enterocoli...,wlk4mooc
2,What is the relationship between TH2 immune re...,zolyze84
3,What are the implications of age-specific unde...,63hs1loe
4,What were the genomic characteristics of the s...,ytejzvgg


In [None]:
generated_queries = generate_queries(llm_chain, queries_ids, f'{workdir}/generated_queries.csv')

  0%|          | 0/1000 [00:00<?, ?it/s]

50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
50 requests. Sleep
Saved to /content/gdrive/MyDrive/Unicamp/DL_applied_to_IR/Notebooks/generated_queries.csv


In [None]:
generated_queries.head()

Unnamed: 0,query,positive_doc_id
0,What are the roles of Y43H and ∆344 in selecti...,vpqzjkhn
1,What were the imaging findings of 2019-nCoV in...,e5uyahea
2,Query: What are the latest updates on the spre...,sb1n3fra
3,What are the benefits of using nanomaterials f...,guiy89x8
4,"What is the safety, pharmacokinetics, and effi...",sfwvim5f


In [None]:
generated_queries.shape

(1000, 2)

### Negative examples


In [None]:
generated_queries = pd.read_csv(f'{workdir}/generated_queries.csv')
test_generated_queries = pd.read_csv(f'{workdir}/test_queries.csv')

In [None]:
# Inspired by Manoel Veríssimo dos Santos Neto
# https://github.com/verissimomanoel/P_IA368DD_2023S1/blob/main/Exercicio8/generate_dataset.py

def generate_ramdom_numbers(max = 5, k = 1000):
    random_list = []
    while len(random_list) < max:
        n = np.random.randint(0, k)

        # Prevent duplicated index
        if n not in random_list:
            random_list.append(n)

    return random_list

def search_with_bm25(query, max = 5, k = 1000):
    searcher = LuceneSearcher.from_prebuilt_index('beir-v1.0.0-trec-covid.flat')
    hits = searcher.search(query, k)
    random_list = generate_ramdom_numbers(max = max, k = k)
    random_ids = []

    for index in random_list:
        jsondoc = json.loads(hits[index].raw)
        random_ids.append(jsondoc["_id"])

    return random_ids

In [None]:
test_generated_queries['negative_doc_ids'] = test_generated_queries['query'].apply(search_with_bm25)

In [None]:
test_generated_queries.head()

Unnamed: 0.1,Unnamed: 0,query,positive_doc_id,negative_doc_ids
0,0,What is the impact of bioinformatics and 'omic...,td0776wj,"[3bvtcdja, bu1ib2ul, woz18l9j, fkeleaia, o13ti..."
1,1,Query: What is neonatal necrotizing enterocoli...,wlk4mooc,"[btnw6kn1, 5weczntn, vxfteypy, 4wnoa4l7, iqswl..."
2,2,What is the relationship between TH2 immune re...,zolyze84,"[in91dr4g, 0y53hnve, 7u6ofjul, ax87r0bj, 3a18y..."
3,3,What are the implications of age-specific unde...,63hs1loe,"[qiys3oz9, n8jk0iv3, kzz677r2, hb1etry7, 2zism..."
4,4,What were the genomic characteristics of the s...,ytejzvgg,"[ewdfqktw, dxj0mu4z, 3q3sktuq, b1r2bydo, rk46t..."


In [None]:
generated_queries['negative_doc_ids'] = generated_queries['query'].apply(search_with_bm25)

In [None]:
generated_queries.columns

Index(['Unnamed: 0', 'query', 'positive_doc_id', 'negative_doc_ids'], dtype='object')

In [None]:
generated_queries.drop('Unnamed: 0', axis = 1, inplace = True)
generated_queries.head()

Unnamed: 0,query,positive_doc_id,negative_doc_ids
0,What are the roles of Y43H and ∆344 in selecti...,vpqzjkhn,"[1in7m56w, 3utk3k6z, 4mhh29l0, l52dkwbx, ns8ut..."
1,What were the imaging findings of 2019-nCoV in...,e5uyahea,"[bnvozg5x, bgdr25z1, oegn8m1k, 5xki1ulf, zoakp..."
2,Query: What are the latest updates on the spre...,sb1n3fra,"[im5lqh6d, ne1qvf5g, 1915kvwk, dio3qyop, kohfd..."
3,What are the benefits of using nanomaterials f...,guiy89x8,"[djpwtaow, 10bu7iwg, clqfq8hw, licog17e, 0y9it..."
4,"What is the safety, pharmacokinetics, and effi...",sfwvim5f,"[xmrjnrwi, w8wrn9lx, iofm2qw4, ikcmwesr, rc5bn..."


In [None]:
generated_queries.to_json(f'{workdir}/juliatessler_1000_queries.jsonl', lines = True, orient = 'records')

## Getting dataset from all students

This code is a courtesy from [Marcos Piau](https://huggingface.co/datasets/unicamp-dl/trec-covid-experiment/blob/main/sugestao_uso_dataset.ipynb) 

In [None]:
ds = load_dataset('unicamp-dl/trec-covid-experiment')

Downloading builder script:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading and preparing dataset trec-covid-experiment/default to /root/.cache/huggingface/datasets/unicamp-dl___trec-covid-experiment/default/0.0.0/b4916ab469ccacf895d77d33bd1c846bb5cfdd8b4c50a7d5ee10f01f77e0310a...


Downloading data files:   0%|          | 0/18 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/346 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/152k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/311k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/627k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/280k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/237k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/356k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/266k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Generating example split: 0 examples [00:00, ? examples/s]

Generating example2 split: 0 examples [00:00, ? examples/s]

Generating eduseiti_100_queries_expansion_20230501_01 split: 0 examples [00:00, ? examples/s]

Generating leandro_carisio_01 split: 0 examples [00:00, ? examples/s]

Generating thales_1k_generated_queries_20230429 split: 0 examples [00:00, ? examples/s]

Generating manoel_1k_generated_queries_20230430 split: 0 examples [00:00, ? examples/s]

Generating manoel_2k_generated_queries_20230501 split: 0 examples [00:00, ? examples/s]

Generating thiago_laitz_1k_queries split: 0 examples [00:00, ? examples/s]

Generating mirelle_1k_generated_queries_20230501 split: 0 examples [00:00, ? examples/s]

Generating hugo_padovani_query_generation split: 0 examples [00:00, ? examples/s]

Generating marcus_borela_1k_gptj6b_20230501 split: 0 examples [00:00, ? examples/s]

Generating juliatessler_1000_queries split: 0 examples [00:00, ? examples/s]

Generating pedro_holanda_1k_generated_queries_20230502 split: 0 examples [00:00, ? examples/s]

Generating leonardo_avila_queries_v1 split: 0 examples [00:00, ? examples/s]

Generating marcus_borela_1k_gptj6b_20230501_v2 split: 0 examples [00:00, ? examples/s]

Generating gustavo_1k_cohere split: 0 examples [00:00, ? examples/s]

Generating marcospiau_1k_v1 split: 0 examples [00:00, ? examples/s]

Generating pedrogengo_queries_inparsv1 split: 0 examples [00:00, ? examples/s]

Dataset trec-covid-experiment downloaded and prepared to /root/.cache/huggingface/datasets/unicamp-dl___trec-covid-experiment/default/0.0.0/b4916ab469ccacf895d77d33bd1c846bb5cfdd8b4c50a7d5ee10f01f77e0310a. Subsequent calls will reuse this data.


  0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
df = pd.concat((v.to_pandas().assign(origin=k) for k,v in ds.items()),
               ignore_index=True)
df.head()

Unnamed: 0,query,positive_doc_id,negative_doc_ids,origin
0,This is a example query 1,doc1,"[xxx, yyy, zzz]",example
1,This is another example query,doc2,"[aaa, bbb, ccc]",example
2,Example of query with no negative doc_ids,doc2,[],example
3,This is a example query 1 (file 2),doc12222,"[xxx, yyy, zzz]",example2
4,This is another example query (file 2),doc12345,"[aaa, bbb, ccc]",example2


In [None]:
df = df.drop(df[(df.origin == 'example') | (df.origin == 'example2')].index)
df.head()

Unnamed: 0,query,positive_doc_id,negative_doc_ids,origin
6,How can chatbots be designed to effectively sh...,70hskj1o,"[mt00852w, x7ol32mz, b54dymlu, h5vh6px7, bza9a...",eduseiti_100_queries_expansion_20230501_01
7,What strategies can be used to encourage desir...,70hskj1o,"[et84j0qi, xsfolppr, 5t2o287y, kj2tnw8q, j68x0...",eduseiti_100_queries_expansion_20230501_01
8,What are the risks associated with amplifying ...,70hskj1o,"[2c1m04je, rd93y7hu, vlmvi0tf, dbq3z982, 848fs...",eduseiti_100_queries_expansion_20230501_01
9,What research has been conducted on the effect...,70hskj1o,"[49zlztqu, amjqr9hr, hpx4723v, e790rxq9, 95bso...",eduseiti_100_queries_expansion_20230501_01
10,How can collaborations between healthcare work...,70hskj1o,"[eg2lj9zc, prmf9yob, ara8bsws, zjmshwl3, apvc5...",eduseiti_100_queries_expansion_20230501_01


In [None]:
df.shape

(16672, 4)

In [None]:
def compute_len(negative_docs_list_size):
  return len(negative_docs_list_size)

df['negative_docs_list_size'] = df['negative_doc_ids'].map(compute_len)

In [None]:
df = df[df['negative_docs_list_size'] > 0]
df.shape

(15672, 5)

In [None]:
# This ready functions came from Mirelle

def search_in_corpus(doc_id, corpus):
  found = corpus[corpus['_id'] == doc_id]
  title = found['title'].to_list()[0]
  doc = found['text'].to_list()[0]
  return title + ' ' + doc

# df format = query label hypotesis
def format_data(df_all, corpus):
  data = {
      'query': [],
      'label': [],
      'passage': []
  }

  for idx, row in tqdm(df_all.iterrows(), total=len(df_all)):
    #row pos
    data['query'].append(row['query'])
    data['passage'].append(search_in_corpus(row['positive_doc_id'],
                                             corpus))
    data['label'].append(True)

    #row neg
    data['query'].append(row['query'])
    data['passage'].append(search_in_corpus(row['negative_doc_ids'][0],
                                             corpus))
    data['label'].append(False)
  return data

In [None]:
df_trec_covid_corpus = pd.DataFrame(trec_covid_corpus['corpus'])
df_trec_covid_corpus.head()

Unnamed: 0,_id,title,text
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...
3,2b73a28n,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...
4,9785vg6d,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...


In [None]:
df_all = pd.DataFrame(format_data(df, df_trec_covid_corpus))
df_all

  0%|          | 0/15672 [00:00<?, ?it/s]

Unnamed: 0,query,label,passage
0,How can chatbots be designed to effectively sh...,True,Chatbots in the fight against the COVID-19 pan...
1,How can chatbots be designed to effectively sh...,False,You Need a Plan: A Stepwise Protocol for Opera...
2,What strategies can be used to encourage desir...,True,Chatbots in the fight against the COVID-19 pan...
3,What strategies can be used to encourage desir...,False,Using Thinkalouds to Understand Rule Learning ...
4,What are the risks associated with amplifying ...,True,Chatbots in the fight against the COVID-19 pan...
...,...,...,...
31339,What is the effect of UV-photofunctionalizatio...,False,Water recycling with PV-powered UV-LED disinfe...
31340,What is the virtual foot and ankle physical ex...,True,The Virtual Foot and Ankle Physical Examinatio...
31341,What is the virtual foot and ankle physical ex...,False,First COVID-19 infections in the Philippines: ...
31342,What is the method used to detect Hepatitis A ...,True,Development of Lectin-Linked Immunomagnetic Se...


## Dataset Preparation

### Train/test split

In [None]:
df_train, df_test = train_test_split(df_all, 
                                     random_state = my_personal_seed, 
                                     train_size = 0.9)

df_train.shape, df_test.shape

((28209, 3), (3135, 3))

### PyTorch Dataset, DataLoader & Trainer classes

In [None]:
max_seq_length = 512
batch_size = 16        # T4: 16, V100: 32, A100: 64
lr = 5e-5
epochs = 10

In [None]:
model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

optimizer = AdamW(model.parameters(), lr = lr)




In [None]:
# Got it from Carísio
class Dataset(Dataset):
  # Recebe um dataframe do pandas. Precisa ter as colunas query, passage e label (0/1)
  def __init__(self, tokenizer, df, max_seq_length):
    self.max_seq_length = max_seq_length
    self.tokenizer = tokenizer

    # Já concatenas as query com as passagens e guarda em uma lista
    query_passage = df['query'] + ' [SEP] ' + df['passage']
    self.query_passage = query_passage.tolist()
    # Converte os labels para inteiros e guarda em uma lista
    self.labels = df.label.tolist()
    self.labels = [float(x) for x in self.labels]

    # Cria um cache vazio. Como tem treino em algumas épocas, guarda o encode no cache
    self.cache = {}

  def __len__(self):
    return len(self.query_passage)
  
  def get_token_type_ids(self, input_ids):
    idx_sep = input_ids.index(102)+1
    tam_seq = len(input_ids)
    token_type_ids = [0]*idx_sep + [1]*(tam_seq - idx_sep)

    # Apesar do tokenizer fazer isso, não precisa pois o attention_mask já zera.
    # for i in range(len(token_type_ids)):
    #   token_type_ids[i] = token_type_ids[i] if input_ids[i] != 0 else 0

    return token_type_ids

  def get_token_type_ids_from_slice(self, idx, matriz_input_ids):
    if isinstance(idx, slice):
      token_types = []
      for i in range(idx.start or 0, idx.stop or len(matriz_input_ids), idx.step or 1):
        token_types.append(self.get_token_type_ids(matriz_input_ids[i]))
      return token_types
    else:
      return self.get_token_type_ids(matriz_input_ids)

  def get_input_ids_e_labels(self, idx):
    input_ids_e_labels = self.tokenizer(self.query_passage[idx],
                                padding=True,
                                truncation=True,
                                max_length=self.max_seq_length)
    input_ids_e_labels['labels'] = self.labels[idx]

    input_ids_e_labels['token_type_ids'] = self.get_token_type_ids_from_slice(idx, input_ids_e_labels['input_ids'])

    return input_ids_e_labels

  def __getitem__(self, idx):
    # Guarda os itens tokenizados num dict e apenas recupera de lá, pra não ter que ficar tokenizando a cada época
    # Como estamos guardando no dict e idx é um slice, é necessário converter ele pra algo mapeável
    self.cache[str(idx)] = self.cache.get(str(idx), self.get_input_ids_e_labels(idx))
    return self.cache[str(idx)]
    

In [None]:
dataset_train = Dataset(tokenizer, df_train, max_seq_length)
dataset_val = Dataset(tokenizer, df_test, max_seq_length)

collate_fn = lambda batch: BatchEncoding(tokenizer.pad(batch, return_tensors = 'pt'))
dataloader_train = DataLoader(dataset_train, 
                              batch_size = batch_size, 
                              shuffle = False, 
                              collate_fn = collate_fn)
dataloader_val = DataLoader(dataset_val, 
                            batch_size = batch_size, 
                            shuffle = False, 
                            collate_fn = collate_fn)

## Train model

In [None]:
def evaluate(model, dataloader, set_name):
  losses = []
  correct = 0
  model.eval()
  with torch.no_grad():
    for batch in tqdm(dataloader, mininterval=0.5, desc=set_name, disable=False):
      outputs = model(**batch.to(device))
      loss_val = outputs.loss
      losses.append(loss_val.cpu().item())
      # Só tem uma classe. Joga pra sigmoide e arredonda pro inteiro mais próximo
      preds = torch.round(torch.sigmoid(outputs.logits))
      correct += (preds.squeeze() == batch['labels']).sum().item()

  print(f'{set_name} loss: {mean(losses):0.3f}; {set_name} accuracy: {correct / len(dataloader.dataset):0.3f}')

def automodel_train(model, optimizer, dataloader_train, dataloader_val, epoch_inicial, epochs):
  num_training_steps = epochs * len(dataloader_train)
  # Warm up is important to stabilize training.
  num_warmup_steps = int(num_training_steps * 0.1)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

  evaluate(model=model, dataloader=dataloader_val, set_name='Validation')

  # Training loop
  for epoch in tqdm(range(epoch_inicial, epochs), desc='Epochs'):
    model.train()
    train_losses = []
    for batch in tqdm(dataloader_train, mininterval=0.5, desc='Train', disable=False):
      optimizer.zero_grad()
      outputs = model(**batch.to(device))
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      scheduler.step()
      train_losses.append(loss.cpu().item())

    print(f'Epoch: {epoch + 1} Training loss: {mean(train_losses):0.2f}')
    model.save_pretrained(f'{workdir}/inpars-model/{epoch+1}/')
    evaluate(model=model, dataloader=dataloader_val, set_name='Validation')
    print('---------------------------------------------------------------------')

In [None]:
%%time
automodel_train(model, optimizer, dataloader_train, dataloader_val, 0, epochs = 20)

Validation:   0%|          | 0/196 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Validation loss: 40.924; Validation accuracy: 0.893


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 1 Training loss: 3.44


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.119; Validation accuracy: 0.741
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 2 Training loss: 0.07


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.078; Validation accuracy: 0.880
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 3 Training loss: 0.03


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.028; Validation accuracy: 0.903
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 4 Training loss: 0.02


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.019; Validation accuracy: 0.946
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 5 Training loss: 0.01


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.016; Validation accuracy: 0.958
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 6 Training loss: 0.01


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.923
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 7 Training loss: 0.01


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.963
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 8 Training loss: 0.01


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.019; Validation accuracy: 0.723
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 9 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.015; Validation accuracy: 0.796
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 10 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.017; Validation accuracy: 0.968
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 11 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.977
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 12 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.969
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 13 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.017; Validation accuracy: 0.683
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 14 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.982
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 15 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.014; Validation accuracy: 0.981
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 16 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.014; Validation accuracy: 0.980
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 17 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.014; Validation accuracy: 0.979
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 18 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.014; Validation accuracy: 0.983
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 19 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.977
---------------------------------------------------------------------


Train:   0%|          | 0/1764 [00:00<?, ?it/s]

Epoch: 20 Training loss: 0.00


Validation:   0%|          | 0/196 [00:00<?, ?it/s]

Validation loss: 0.013; Validation accuracy: 0.983
---------------------------------------------------------------------
CPU times: user 3h 23min 16s, sys: 38.9 s, total: 3h 23min 55s
Wall time: 3h 26min 6s


## BM25 + Rerank
Very much copied from Mirelle

In [None]:
%%shell
cd /content/ &&  git clone --recurse-submodules https://github.com/castorini/pyserini.git
cd pyserini
cd tools/eval && tar xvfz trec_eval.9.0.4.tar.gz && cd trec_eval.9.0.4 && make && cd ../../..
cd tools/eval/ndeval && make && cd ../../..

Cloning into 'pyserini'...
remote: Enumerating objects: 7503, done.[K
remote: Counting objects: 100% (549/549), done.[K
remote: Compressing objects: 100% (312/312), done.[K
remote: Total 7503 (delta 371), reused 369 (delta 237), pack-reused 6954[K
Receiving objects: 100% (7503/7503), 4.42 MiB | 9.18 MiB/s, done.
Resolving deltas: 100% (5416/5416), done.
Submodule 'tools' (https://github.com/castorini/anserini-tools.git) registered for path 'tools'
Cloning into '/content/pyserini/tools'...
remote: Enumerating objects: 788, done.        
remote: Counting objects: 100% (545/545), done.        
remote: Compressing objects: 100% (467/467), done.        
remote: Total 788 (delta 101), reused 514 (delta 77), pack-reused 243        
Receiving objects: 100% (788/788), 119.60 MiB | 20.80 MiB/s, done.
Resolving deltas: 100% (185/185), done.
Submodule path 'tools': checked out '7b84f773225b5973b4533dfa0aa18653409a6146'
trec_eval.9.0.4/
trec_eval.9.0.4/m_prefs_pair.c
trec_eval.9.0.4/m_ndcg_p.c




In [None]:
!pip install pyserini faiss intel-openmp nltk --quiet
!apt install libomp-dev
%cd /content
!rm -rf pygaggle && pip uninstall -y pygaggle
!git clone  --recursive https://github.com/castorini/pygaggle.git
%cd pygaggle
! pip install --editable . --quiet
! pip install gensim==4.2.0 jsonlines --quiet
! pip install faiss-cpu --no-cache --quiet

[31mERROR: Could not find a version that satisfies the requirement faiss (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss[0m[31m
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libomp-10-dev libomp5-10
Suggested packages:
  libomp-10-doc
The following NEW packages will be installed:
  libomp-10-dev libomp-dev libomp5-10
0 upgraded, 3 newly installed, 0 to remove and 24 not upgraded.
Need to get 351 kB of archives.
After this operation, 2,281 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 libomp5-10 amd64 1:10.0.0-4ubuntu1 [300 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 libomp-10-dev amd64 1:10.0.0-4ubuntu1 [47.7 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 libomp-dev amd64 1:10.0-50~exp1 [2,824 B]
Fetched 351 kB in 0s (3,262 kB/s)
Selecting previousl

In [None]:
!cd /content/ && ls

gdrive	pygaggle  pyserini  sample_data


In [None]:
data_out = []
for idx, row in trec_covid_corpus['corpus'].to_pandas().iterrows():
  segment = row['title'] + ' '+ row['text']
  data_out.append({'id':row['_id'], 'contents':segment})
with open('/content/trec-covid-corpus/corpus.jsonl', 'w') as fout:
    json.dump(data_out, fout)

In [None]:
! python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /content/trec-covid-corpus \
  --index /content/Index_BM25 \
  --generator DefaultLuceneDocumentGenerator \
  --threads 9 \
  --storePositions --storeDocvectors --storeRaw

2023-05-04 01:25:44,149 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-05-04 01:25:44,151 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-05-04 01:25:44,151 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: /content/trec-covid-corpus
2023-05-04 01:25:44,151 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-05-04 01:25:44,152 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-05-04 01:25:44,152 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 9
2023-05-04 01:25:44,153 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-05-04 01:25:44,153 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-05-04 01:25:44,154 INFO  [main] index.IndexCollection (IndexCollection.java:391

In [None]:
from pyserini.search.lucene import LuceneSearcher
from pygaggle.rerank.base import hits_to_texts


def get_results(path_out_bm25='/content/output_bm25', qrys=None,k=100):
  searcher = LuceneSearcher(path_out_bm25)

  results = {}
  for key, value in qrys.items():
    results_found = [found.docid for found in searcher.search(value, k)]
    scores = [found.score for found in searcher.search(value, k)]
    results[str(key)] = {'query':value, 'founds':results_found, 'scores':scores}
  return results


In [None]:
df_queries = trec_covid_queries['queries'].to_pandas()
df_queries.drop('title', axis = 1, inplace = True)
df_queries.head()

Unnamed: 0,_id,text
0,1,what is the origin of COVID-19
1,2,how does the coronavirus respond to changes in...
2,3,will SARS-CoV2 infected people develop immunit...
3,4,what causes death from Covid-19?
4,5,what drugs have been active against SARS-CoV o...


In [None]:
df_queries.shape

(50, 2)

In [None]:
results = get_results(path_out_bm25 = '/content/Index_BM25',  
                      qrys = dict(df_queries.values),
                      k = 1000)

In [None]:
print('Total qrels: ', len(results), ' --Total docs ids founds: ', len(results['1']['founds']))


Total qrels:  50  --Total docs ids founds:  1000


In [None]:
# qrels = load_dataset('beir/trec-covid-qrels')
!wget https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
!unzip trec-covid.zip


--2023-05-04 02:07:32--  https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73876720 (70M) [application/zip]
Saving to: ‘trec-covid.zip.1’


2023-05-04 02:07:33 (45.9 MB/s) - ‘trec-covid.zip.1’ saved [73876720/73876720]

Archive:  trec-covid.zip
replace trec-covid/qrels/test.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: trec-covid/qrels/test.tsv  
replace trec-covid/corpus.jsonl? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: trec-covid/corpus.jsonl  
  inflating: trec-covid/queries.jsonl  


In [None]:
def load_file(path):
  qrls = {}
  qrels_file = pd.read_csv(path, sep='\t')

  for i, row in qrels_file.iterrows():
    qid = str(row['query-id'])
    if qid in qrls:
      qrls[qid]['doc_ids'].append(row['corpus-id'])
      qrls[qid]['rating'].append(row['score'])
    else:
      qrls[qid] = {'doc_ids':[row['corpus-id']], 'rating':[row['score']]}
  return qrls

In [None]:
qrels = load_file('trec-covid/qrels/test.tsv')

In [None]:
from rank_eval import Qrels, Run, evaluate

#Configs vars
qrels_ = Qrels()
qrels_.add_multi(q_ids=results.keys(),
                doc_ids=[qrels[k]['doc_ids'] for k in results.keys()],
                scores=[qrels[k]['rating'] for k in results.keys()])
run = Run()
run.add_multi(
    q_ids=results.keys(),
    doc_ids=[results[k]["founds"] for k in results.keys() ],
    scores=[results[k]["scores"] for k in results.keys()],
)


In [None]:
evaluate(qrels_, run, ["mrr","ndcg@10"]) # base bm25

{'mrr': 0.8528571428571429, 'ndcg@10': 0.5946917010118077}