In [1]:
import torch
# Report whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [2]:
import pyterrier as pt
import numpy as np
import pandas as pd

from tqdm import tqdm
import zipfile
import glob
import ir_datasets
# Initialise PyTerrier only when it has not already been started in this kernel,
# so that re-running this cell does not call pt.init() a second time.
if not pt.started():
  pt.init()

from pyterrier_t5 import MonoT5ReRanker

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
# Load the ir_datasets handles for the two collections, addressed by the
# ir_datasets identifiers "nyt" and "wapo/v2".
dataset_nyt = ir_datasets.load("nyt")
dataset_wapo = ir_datasets.load("wapo/v2")

In [None]:
# Show the field names of the NYT document records. The previous content of this
# cell was the unfinished expression `dataset_nyt.`, which raises a SyntaxError
# and stops a Restart & Run All.
dataset_nyt.docs_cls()._fields

In [4]:
# Root directory that holds one index sub-directory per collection.
# Change it here (once) if the indices live somewhere else.
INDEX_ROOT = "/app/indices"

# References to the on-disk indices of the NYT and WaPo collections.
index_ref_nyt = pt.IndexRef.of(f"{INDEX_ROOT}/nyt/")
index_ref_wapo = pt.IndexRef.of(f"{INDEX_ROOT}/wapo/")

In [5]:
# Shared MonoT5 re-ranker; it is configured to read the text from the "body" column.
monoT5 = MonoT5ReRanker(text_field="body", batch_size=100, verbose=True)


def bm25_retriever(num_results):
    """Return a BM25 retriever over the WaPo index configured with `num_results`."""
    return pt.BatchRetrieve(index_ref_wapo, wmodel='BM25', num_results=num_results)


def with_mono_t5(retriever):
    """Chain `retriever` with a "body" text lookup on the WaPo index and the shared MonoT5 re-ranker."""
    return retriever >> pt.text.get_text(index_ref_wapo, "body") >> monoT5


# Default setup: BM25 with num_results=200, re-ranked by MonoT5.
bm25 = bm25_retriever(200)
mono_pipeline = with_mono_t5(bm25)

# Same pipeline with a larger / smaller first-stage depth, for comparison.
mono_pipeline_500 = with_mono_t5(bm25_retriever(500))
mono_pipeline_50 = with_mono_t5(bm25_retriever(50))

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# PyTerrier dataset wrappers for the two test collections; they are used below
# through get_topics() and get_qrels().
ds1_nyt = pt.get_dataset('irds:nyt/trec-core-2017')
ds1_wapo = pt.get_dataset('irds:wapo/v2/trec-core-2018')

In [10]:
# Write every NYT title query to its own file "topic.<n>", where <n> is the
# topic's row index plus one (not its qid).
# NOTE(review): the target directory is assumed to exist already — confirm.
nyt_title_topics = ds1_nyt.get_topics('title')
for row_idx, query_text in nyt_title_topics['query'].items():
    topic_path = f"/workspace/data/nyt/topics/topic.{row_idx+1}"
    with open(topic_path, "w") as topic_file:
        topic_file.write(query_text)

In [None]:
# Build one line per NYT title topic in the form "1,1,<n>,<query>", where <n> is
# the topic's row index plus one (not its qid), and write all lines to a single file.
# NOTE(review): a query that itself contains a comma would add an extra field
# to its line — confirm that the consumer of this file tolerates that.
nyt_title_topics = ds1_nyt.get_topics('title')

# Join once instead of growing the string with += inside the loop.
title_queries = "".join(
    f"1,1,{row_idx + 1},{query}\n"
    for row_idx, query in nyt_title_topics['query'].items()
)

# Plain string literal: the path has no placeholders, so no f-prefix is needed.
with open("/workspace/data/nyt/title_queries", "w") as f:
    f.write(title_queries)

In [6]:
# Display the NYT title topics (columns qid and query) as the cell output.
ds1_nyt.get_topics('title')

Unnamed: 0,qid,query
0,307,new hydroelectric projects
1,310,radio waves and brain cancer
2,321,women in parliaments
3,325,cult lifestyles
4,330,iran iraq cooperation
5,336,black bear attacks
6,341,airport security
7,344,abuses of e mail
8,345,overseas tobacco sales
9,347,wildlife extinction


In [7]:
# Baseline: evaluate plain BM25 on the WaPo TREC Core 2018 title queries.
wapo_title_topics = ds1_wapo.get_topics('title')
wapo_qrels = ds1_wapo.get_qrels()

pt.Experiment(
    retr_systems=[bm25],
    topics=wapo_title_topics,
    qrels=wapo_qrels,
    eval_metrics=["map", "recip_rank", "P_10", "ndcg_cut_10"],
    names=["BM25"],
    verbose=True,
)

pt.Experiment: 100%|██████████| 1/1 [00:07<00:00,  7.63s/system]


Unnamed: 0,name,map,recip_rank,P_10,ndcg_cut_10
0,BM25,0.168733,0.663436,0.404,0.37107


In [9]:
# Evaluate the default MonoT5 pipeline (`mono_pipeline`) on the WaPo title queries.
# NOTE(review): the label "MonoT5" is also used by the other re-ranking runs in
# this notebook; consider adding the first-stage depth to tell them apart.
wapo_title_topics = ds1_wapo.get_topics('title')
wapo_qrels = ds1_wapo.get_qrels()

pt.Experiment(
    retr_systems=[mono_pipeline],
    topics=wapo_title_topics,
    qrels=wapo_qrels,
    eval_metrics=["map", "recip_rank", "P_10", "ndcg_cut_10"],
    names=["MonoT5"],
    verbose=True,
)

monoT5: 100%|██████████| 100/100 [03:29<00:00,  2.10s/batches]
pt.Experiment: 100%|██████████| 1/1 [03:42<00:00, 222.32s/system]


Unnamed: 0,name,map,recip_rank,P_10,ndcg_cut_10
0,MonoT5,0.20841,0.670647,0.476,0.446702


In [None]:
# Evaluate the MonoT5 pipeline whose first stage is configured with
# num_results=500 (`mono_pipeline_500`) on the WaPo title queries.
# The label includes the depth so that this row cannot be confused with the
# other MonoT5 runs, which would otherwise all be named "MonoT5".
pt.Experiment(
    [mono_pipeline_500],
    ds1_wapo.get_topics('title'),
    ds1_wapo.get_qrels(),
    eval_metrics=["map", "recip_rank", "P_10", "ndcg_cut_10"],
    names=["MonoT5 (BM25 top-500)"],
    verbose=True
)

In [8]:
# Evaluate the MonoT5 pipeline whose first stage is configured with
# num_results=50 (`mono_pipeline_50`) on the WaPo title queries.
# NOTE(review): the label "MonoT5" is also used by the other re-ranking runs in
# this notebook; consider adding the first-stage depth to tell them apart.
wapo_title_topics = ds1_wapo.get_topics('title')
wapo_qrels = ds1_wapo.get_qrels()

pt.Experiment(
    retr_systems=[mono_pipeline_50],
    topics=wapo_title_topics,
    qrels=wapo_qrels,
    eval_metrics=["map", "recip_rank", "P_10", "ndcg_cut_10"],
    names=["MonoT5"],
    verbose=True,
)

pt.Experiment:   0%|          | 0/1 [00:00<?, ?system/s]Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
monoT5: 100%|██████████| 25/25 [00:53<00:00,  2.15s/batches]
pt.Experiment: 100%|██████████| 1/1 [00:57<00:00, 57.32s/system]


Unnamed: 0,name,map,recip_rank,P_10,ndcg_cut_10
0,MonoT5,0.163762,0.657692,0.48,0.453916
