In [2]:
import json
import os
from urllib.request import urlretrieve

import ir_measures as irms
import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

In [3]:
# Initialise PyTerrier only when it has not been started yet, so re-running
# this cell in the same kernel does not call pt.init() a second time.
import pyterrier as pt
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


The code below is copied from an external source (the reference is missing here — TODO: add the link).

In [None]:
# Stream the Chinese split from HF Datasets and keep only its first 40k documents.
ds = load_dataset('neuclir/neuclir1', split='zho', streaming=True) # total 3179209
# Pairing the stream with a 40k-long progress range makes zip() stop after 40k items.
doc_subset = [
    doc
    for _, doc in zip(
        tqdm(range(40_000), desc='Loading first 40k docs from NeuCLIR Chinese Collection'),
        ds,
    )
]
# Ids of the loaded documents, used later for membership checks.
subset_doc_ids = {doc['id'] for doc in doc_subset}

In [5]:
# Fetch the topics file and the Chinese qrels file from the NIST server
# into the current working directory.
urlretrieve(
    url='https://trec.nist.gov/data/neuclir/topics.0720.utf8.jsonl',
    filename="topics.0720.utf8.jsonl",
)
urlretrieve(
    url='https://trec.nist.gov/data/neuclir/2022-qrels.zho',
    filename="2022-qrels.zho",
)

('2022-qrels.zho', <http.client.HTTPMessage at 0xffff35b58610>)

In [6]:
use_topic = '66' # use topic 66 as demo

# Keep only the judgments for the demo topic whose documents are in the loaded subset.
qrels = pd.DataFrame([
    judgment
    for judgment in irms.read_trec_qrels('2022-qrels.zho')
    if judgment.query_id == use_topic and judgment.doc_id in subset_doc_ids
])

# Parse the JSONL topics inside a context manager so the file handle is closed
# as soon as the list is built (the bare open() passed to map() was never closed).
with open("topics.0720.utf8.jsonl", encoding='UTF-8') as topics_file:
    topics = [ t for t in map(json.loads, topics_file) if t['topic_id'] == use_topic ]

In [7]:
# Position of each topic inside `topics`, keyed by its topic id.
topic_id_idx = dict((entry['topic_id'], pos) for pos, entry in enumerate(topics))
def get_query_by_topic_id(topic_id, query_type='title', lang="eng"):
    """Return one field of a topic in the requested language.

    Args:
        topic_id: Id of the topic; must be a key of ``topic_id_idx``.
        query_type: Suffix of the field to read, i.e. the value stored under
            ``topic_<query_type>`` (``'title'`` by default).
        lang: Language code compared against each entry's ``"lang"`` value.

    Returns:
        The field of the first entry whose language matches, or ``None``
        when no entry matches ``lang``.

    Raises:
        KeyError: If ``topic_id`` is unknown or a matching entry lacks the field.
    """
    field = f'topic_{query_type}'
    translations = topics[topic_id_idx[topic_id]]['topics']
    # First match wins; the None default mirrors falling off the end of a loop.
    return next((entry[field] for entry in translations if entry["lang"] == lang), None)

# Position of each document inside `doc_subset`, keyed by its document id.
doc_id_to_idx = dict((record['id'], pos) for pos, record in enumerate(doc_subset))
def get_doc_text_by_doc_id(doc_id):
    """Return a document's title and text joined by a single space.

    Args:
        doc_id: Id of the document; must be a key of ``doc_id_to_idx``.

    Returns:
        ``"<title> <text>"`` for the matching entry of ``doc_subset``.

    Raises:
        KeyError: If ``doc_id`` is not in ``doc_id_to_idx``.
    """
    record = doc_subset[doc_id_to_idx[doc_id]]
    return ' '.join((record['title'], record['text']))

In [8]:
# Export the document subset as JSON Lines: one {"id", "contents"} object per line.
# open(..., "w") does not create missing parent directories, so make sure
# "collection/" exists before writing (a fresh checkout would otherwise fail).
os.makedirs("collection", exist_ok=True)
with open("collection/zho_neuclir_subset.jsonl", "w", encoding="UTF-8") as f:
    for doc_id in tqdm(doc_id_to_idx, total=len(doc_id_to_idx)):
        content = get_doc_text_by_doc_id(doc_id)
        # json.dumps escapes non-ASCII characters by default, so every line is ASCII JSON.
        f.write(json.dumps({"id": doc_id, "contents": content}) + "\n")

  0%|          | 0/40000 [00:00<?, ?it/s]

In [9]:
# Chinese title query of the demo topic.
topic_text = get_query_by_topic_id(use_topic, lang="zho")
# The lookup yields None when the topic has no entry for the language; stop here
# instead of letting the literal string "None" be written out as the query.
if topic_text is None:
    raise ValueError(f"Topic {use_topic} has no 'zho' title")

In [10]:
# Write a single "<topic id>\t<query>" line. The query is Chinese text, so pin the
# encoding to UTF-8 rather than relying on the platform default (the topics file
# is likewise read with an explicit UTF-8 encoding).
with open("zho_topics.txt", "w", encoding="UTF-8") as f:
    f.write(f"{use_topic}\t{topic_text}\n")

In [12]:
# Load the TREC-format run file into a DataFrame, then score it against the
# subset qrels; the aggregate metrics are the cell's displayed result.
to_rerank = pd.DataFrame(list(irms.read_trec_run("runs/zho_neuclir_subset_bm25.title.txt")))
irms.calc_aggregate([irms.nDCG@20, irms.AP], qrels, to_rerank)

{AP: 0.06837054789182448, nDCG@20: 0.1482972305701491}

In [14]:
# Display the topic record(s) kept for the demo topic.
topics

[{'topic_id': '66',
  'languages_with_qrels': [],
  'topics': [{'lang': 'eng',
    'source': 'original',
    'topic_title': 'COVID-19 vaccination rate in China',
    'topic_description': 'I am interested in finding articles that provide information about the COVID-19 vaccination rate in China.',
    'topic_narrative': 'Find articles that that provide information about the COVID-19 vaccination rate in China. Relevant articles should include information of vaccination rate at the national level (mainland only) or local level, or discussions about the vaccination rate of the country or any of its regions if no specific rates are mentioned. Articles on other issues, like information and discussions about vaccines, effectiveness of the vaccination, vaccination rate in other countries, are not considered relevant. '},
   {'lang': 'fas',
    'source': 'human translation',
    'topic_title': 'نرخ واکسیناسیون کووید-۱۹ در چین',
    'topic_description': 'علاقه مند به یافتن مقالاتی هستم که اطلاعات

In [15]:
# Display the relevance judgments kept for the demo topic and document subset.
qrels

Unnamed: 0,query_id,doc_id,relevance,iteration
0,66,7d66f5e4-bc43-4bbf-8b4c-105873c77285,0,0
1,66,1a959671-efb2-4ccb-901d-8410b4ec021e,0,0
2,66,bb0d3721-fe49-4204-a444-2850908271fe,0,0
3,66,71e319d2-0ee2-4601-a56b-e1543633338e,0,0
4,66,350bfe16-8f80-4914-8217-2e18bbdec757,1,0
...,...,...,...,...
60,66,98eb7d7c-1cd7-45fa-8a6c-6c30a909adb5,0,0
61,66,504e9afe-6b36-426e-82e9-5c02c2b2818f,0,0
62,66,04794074-9fbd-495b-8855-0c5567130b2e,0,0
63,66,683861af-9a2f-4ab7-bda7-33069a226e05,0,0
