In [1]:
import gzip
import json
import os

def convert_clerc_passages_to_jsonl():
    """Convert CLERC passage TSV to Pyserini JSONL format"""
    
    input_file = "collection/collection.passage.tsv.gz"
    output_file = "collection/passage_full.jsonl"
    
    # Ensure collection directory exists
    os.makedirs("collection", exist_ok=True)
    
    print("Converting CLERC passages to JSONL format...")
    doc_count = 0
    
    with gzip.open(input_file, 'rt', encoding='utf-8') as fin, \
         open(output_file, 'w', encoding='utf-8') as fout:
        
        for line_num, line in enumerate(fin):
            if line_num % 1000000 == 0:
                print(f"Processed {line_num:,} lines, {doc_count:,} documents")
            
            line = line.strip()
            if not line:
                continue
                
            # Split TSV: passage_id \t passage_text
            parts = line.split('\t', 1)
            if len(parts) < 2:
                continue
                
            passage_id, passage_text = parts[0], parts[1]
            
            # Create Pyserini document format
            doc = {
                "id": passage_id,
                "contents": passage_text
            }
            
            fout.write(json.dumps(doc, ensure_ascii=False) + '\n')
            doc_count += 1
    
    print(f"Conversion complete! {doc_count:,} passages written to {output_file}")
    return doc_count

# Run the conversion
convert_clerc_passages_to_jsonl()


Converting CLERC passages to JSONL format...
Processed 0 lines, 0 documents
Processed 1,000,000 lines, 1,000,000 documents
Processed 2,000,000 lines, 2,000,000 documents
Processed 3,000,000 lines, 3,000,000 documents
Processed 4,000,000 lines, 4,000,000 documents
Processed 5,000,000 lines, 5,000,000 documents
Processed 6,000,000 lines, 6,000,000 documents
Processed 7,000,000 lines, 7,000,000 documents
Processed 8,000,000 lines, 8,000,000 documents
Processed 9,000,000 lines, 9,000,000 documents
Processed 10,000,000 lines, 10,000,000 documents
Processed 11,000,000 lines, 11,000,000 documents
Processed 12,000,000 lines, 12,000,000 documents
Processed 13,000,000 lines, 13,000,000 documents
Processed 14,000,000 lines, 14,000,000 documents
Processed 15,000,000 lines, 15,000,000 documents
Processed 16,000,000 lines, 16,000,000 documents
Processed 17,000,000 lines, 17,000,000 documents
Processed 18,000,000 lines, 18,000,000 documents
Processed 19,000,000 lines, 19,000,000 documents
Processed 2

23711597

In [2]:
!mkdir -p clean_collection
!mv collection/passage_full.jsonl clean_collection/passage_full.jsonl


'mv' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
!move collection\passage_full.jsonl clean_collection\passage_full.jsonl

        1 file(s) moved.


In [1]:
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input clean_collection \
  --index indexes/bm25_clerc_full \
  --generator DefaultLuceneDocumentGenerator \
  --threads 16 \
  --storePositions --storeDocvectors --storeRaw \
  --optimize


2025-08-19 19:03:59,295 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:204) - Setting log level to INFO
2025-08-19 19:03:59,303 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:208) - AbstractIndexer settings:
2025-08-19 19:03:59,303 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) -  + DocumentCollection path: clean_collection
2025-08-19 19:03:59,304 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + CollectionClass: JsonCollection
2025-08-19 19:03:59,304 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + Index path: indexes/bm25_clerc_full
2025-08-19 19:03:59,304 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Threads: 16
2025-08-19 19:03:59,305 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Optimize (merge segments)? true
2025-08-19 19:03:59,376 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using DefaultEnglishAnalyzer
2025-08-19 19:03:59,386 INFO  [main] index.Ind

In [None]:

!python -m pyserini.search.lucene \
  --index indexes/bm25_clerc_full \
  --topics queries/test.single-removed.direct.tsv \
  --output runs/bm25.trec \
  --bm25 \
  --hits 1000

In [None]:
import sys
from collections import defaultdict

# --- CONFIG ---
run_file = "runs/debug_20.trec"
mapping_file = "collection/mapping.pid2did.tsv"
output_file = "runs/bm25_full.did.trec"
# --------------

# 1. Read mapping: pid -> did
pid2did = {}
with open(mapping_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line: continue
        pid, did = line.split(None, 1)
        pid2did[pid] = did

# 2. Parse run and aggregate by (qid, did) with MaxP
maxp = defaultdict(lambda: defaultdict(lambda: (-float("inf"), "")))  # {qid: {did: (score, linebase)}}

with open(run_file, "r") as fin:
    for line in fin:
        parts = line.strip().split()
        if len(parts) < 6:
            continue
        qid, _, pid, _, score, run_name = parts[:6]
        did = pid2did.get(pid)
        if did is None:
            continue  # passage not mapped, skip
        score = float(score)
        # MaxP: keep the *highest*-scoring passage for each (qid, did)
        if score > maxp[qid][did][0]:
            maxp[qid][did] = (score, f"{qid} Q0 {did}")

# 3. Output TREC run file, with reranked documents per query (descending by score)
with open(output_file, "w") as fout:
    for qid in sorted(maxp):
        ranked = sorted(maxp[qid].items(), key=lambda x: -x[1][0])
        for rank, (did, (score, base)) in enumerate(ranked, 1):
            fout.write(f"{base} {rank} {score:.6f} BM25MaxP\n")


In [1]:
!python -m pyserini.eval.trec_eval \
  -c -m recall.1000 -m ndcg_cut.10 \
  qrles/qrels-doc.test.direct.tsv \
  runs/bm25_full.did.trec


Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to C:\Users\hrith\.cache\pyserini\eval\jtreceval-0.0.5-jar-with-dependencies.jar...
Running command: ['java', '-jar', 'C:\\Users\\hrith\\.cache\\pyserini\\eval\\jtreceval-0.0.5-jar-with-dependencies.jar', '-c', '-m', 'recall.1000', '-m', 'ndcg_cut.10', 'qrles/qrels-doc.test.direct.tsv', 'runs/bm25_full.did.trec']
Results:
recall_1000           	all	0.4840
ndcg_cut_10           	all	0.0608



jtreceval-0.0.5-jar-with-dependencies.jar: 0.00B [00:00, ?B/s]
jtreceval-0.0.5-jar-with-dependencies.jar:   0%|          | 8.00k/1.79M [00:03<11:53, 2.61kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:   2%|2         | 40.0k/1.79M [00:03<01:53, 16.1kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:   6%|5         | 104k/1.79M [00:03<00:36, 49.0kB/s] 
jtreceval-0.0.5-jar-with-dependencies.jar:  13%|#2        | 232k/1.79M [00:03<00:12, 129kB/s] 
jtreceval-0.0.5-jar-with-dependencies.jar:  21%|##1       | 392k/1.79M [00:03<00:05, 253kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:  26%|##5       | 472k/1.79M [00:03<00:04, 305kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:  34%|###3      | 616k/1.79M [00:04<00:02, 439kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:  40%|###9      | 728k/1.79M [00:04<00:02, 542kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:  46%|####5     | 840k/1.79M [00:04<00:01, 637kB/s]
jtreceval-0.0.5-jar-with-dependencies.jar:  53%|#####2    | 968k/1.79M [00:04<00:01