Our webpage storage format causes a problem: there are too many individual files for pyserini, and indexing appears to freeze. This script works around that by writing all webpages into a single temporary JSONL file so they can be indexed, then removing that file once indexing is complete.

In [None]:
# Reformat the per-page JSON files into a single JSONL file (one JSON document
# per line) inside a temporary directory that pyserini can index.
import os
import json

webpages_dir = 'data_2017-09/webpages/'
tmp_dir = 'data_2017-09/webpages_tmp'
tmp_jsonl = os.path.join(tmp_dir, 'webpages_tmp.jsonl')

# exist_ok=True lets this cell be re-run after an interrupted or failed attempt
# without raising FileExistsError; the JSONL file is rewritten from scratch.
os.makedirs(tmp_dir, exist_ok=True)
with open(tmp_jsonl, 'w') as f:
    for i, file in enumerate(os.listdir(webpages_dir)):
        # Close every source file explicitly: the directory holds a very large
        # number of pages, so handles must not be left to the garbage collector.
        with open(os.path.join(webpages_dir, file), 'r') as page_file:
            webpage = json.load(page_file)
        f.write(json.dumps(webpage) + '\n')
        # Lightweight progress indicator.
        if i % 10000 == 0:
            print(i)

In [2]:
# Build the pyserini sparse (Lucene) index.
# Reads the JSON collection from data_2017-09/webpages_tmp, applies the custom
# stopword list, and writes the index to data_2017-09/pyserini/pyserini_index
# using 10 indexing threads.
!python3 -m pyserini.index.lucene --stopwords "data_2017-09/stopwords.txt" --collection JsonCollection --input "data_2017-09/webpages_tmp" --index "data_2017-09/pyserini/pyserini_index" --generator DefaultLuceneDocumentGenerator --threads 10 -verbose

2022-05-07 11:43:17,504 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to DEBUG
2022-05-07 11:43:17,506 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data_2017-09/webpages_tmp
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 10
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [3]:
# Remove the temporary JSONL collection now that indexing is complete.
# Each step is guarded so the cell can be re-run (or run after a partial
# failure) without raising FileNotFoundError.
if os.path.isfile('data_2017-09/webpages_tmp/webpages_tmp.jsonl'):
    os.remove('data_2017-09/webpages_tmp/webpages_tmp.jsonl')
if os.path.isdir('data_2017-09/webpages_tmp'):
    # os.rmdir only removes an empty directory, so unexpected leftover files
    # still surface as an error instead of being silently deleted.
    os.rmdir('data_2017-09/webpages_tmp')

In [4]:
from eval import eval

### BM25 for full run

In [6]:
# BM25 retrieval (k1=8, b=0.99) for the training queries, keeping the top 10
# hits per query; the run file is written to out/bm25_runs/.
# NOTE(review): the saved output of this cell reports a different output file
# name (run.train_9_0.99.txt) than the command below — it looks stale; re-run
# to refresh or confirm which file name is intended.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries/queries_train.tsv" --output out/bm25_runs/run.train_8_0.99.txt --bm25 --k1 8 --b 0.99

# note MAP = MRR when there is exactly one relevant result
# https://stats.stackexchange.com/questions/127041/mean-average-precision-vs-mean-reciprocal-rank
#!python3 -m pyserini.eval.trec_eval -m P.1 "data_2017-09/queries/relevance_scores.txt" out/bm25_runs/run.val_4_0.9.txt

Setting BM25 parameters: k1=8.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries/queries_train.tsv topics, saving to out/bm25_runs/run.train_9_0.99.txt...
100%|███████████████████████████████████| 128404/128404 [36:55<00:00, 57.95it/s]


In [18]:
# Score a saved BM25 run against the relevance judgements, restricted to the
# query ids listed in data_2017-09/queries/queries_val.tsv.
# NOTE(review): this run file (val, k1=2, b=0.75) is not produced by any search
# cell shown in this notebook — presumably from a separate invocation; confirm.
ru = eval.load_run('out/bm25_runs/run.val_2_0.75.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')

# Report each metric in turn: label first, then the computed value.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.17270594369134515
MRR@10:  0.23812078967840314


### BM25 for only last

In [1]:
# BM25 retrieval (k1=4, b=0.9) for the "only last" training queries, keeping
# the top 10 hits per query.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_onlylast/queries_train.tsv" --output out/bm25_runs/run.onlylast.train_4_0.9.txt --bm25 --k1 4 --b 0.9

Setting BM25 parameters: k1=4.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_onlylast/queries_train.tsv topics, saving to out/bm25_runs/run.onlylast.train_4_0.9.txt...
100%|███████████████████████████████████| 128404/128404 [26:45<00:00, 79.99it/s]


In [6]:
# Score a saved "only last" BM25 run against the relevance judgements,
# restricted to the query ids in data_2017-09/queries_onlylast/queries_val.tsv.
# NOTE(review): this run file (val, k1=4, b=0.99) is not produced by any search
# cell shown in this notebook — presumably from a separate invocation; confirm.
ru = eval.load_run('out/bm25_runs/run.onlylast.val_4_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_onlylast/queries_val.tsv')

# Report each metric in turn: label first, then the computed value.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.24426485922836289
MRR@10:  0.320607688026549


### BM25 for remove last

In [13]:
# BM25 retrieval (k1=7, b=0.9) for the "remove last" validation queries,
# keeping the top 10 hits per query.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_removelast/queries_val.tsv" --output out/bm25_runs/run.removelast.val_7_0.9.txt --bm25 --k1 7 --b 0.9

Setting BM25 parameters: k1=7.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_removelast/queries_val.tsv topics, saving to out/bm25_runs/run.removelast.val_7_0.9.txt...
100%|█████████████████████████████████████| 15344/15344 [03:40<00:00, 69.46it/s]


In [14]:
# Score the "remove last" BM25 validation run (k1=7, b=0.9) against the
# relevance judgements, restricted to the query ids in
# data_2017-09/queries_removelast/queries_val.tsv.
ru = eval.load_run('out/bm25_runs/run.removelast.val_7_0.9.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_removelast/queries_val.tsv')

# Report each metric in turn: label first, then the computed value.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.08335505735140772
MRR@10:  0.12613932130029562


### For easy evaluation of semantic search baseline

In [17]:
# Score a saved semantic-search run against the relevance judgements.
# NOTE(review): the run file is named "removelast", but the query ids are read
# from data_2017-09/queries/ rather than data_2017-09/queries_removelast/ —
# presumably both files list the same ids; confirm.
ru = eval.load_run('out/semantic_runs/run.val_removelast.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')

# Report each metric in turn: label first, then the computed value.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.029131908237747653
MRR@10:  0.0431263913716338


### For semantic fine tune CSV validation analysis

In [18]:
import csv

# Evaluation log written during semantic fine-tuning.
results_csv = 'out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries-2022-05-10_21-05-52/eval/Information-Retrieval_evaluation_results.csv'

# Select columns by header name rather than by position (previously indices
# 0, 1, 21 and 29) so the report keeps working if the column order changes.
report_columns = ('epoch', 'steps', 'dot_score-Precision@1', 'dot_score-MRR@10')

# newline='' is what the csv module documentation recommends for files
# handed to csv readers.
with open(results_csv, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # Fail with a clear message if the file does not have the expected header.
    missing = [col for col in report_columns if col not in (reader.fieldnames or [])]
    if missing:
        raise KeyError('missing expected columns in evaluation CSV: ' + ', '.join(missing))
    # Header line first, then one line per evaluation checkpoint.
    print(*report_columns)
    for row in reader:
        print(*(row[col] for col in report_columns))

epoch steps dot_score-Precision@1 dot_score-MRR@10
0 500 0.528971028971029 0.6172169100740533
0 1000 0.554945054945055 0.6404942279942285
0 1500 0.5729270729270729 0.6543179043179042
0 2000 0.5544455544455544 0.6413007627293341
0 2500 0.5854145854145855 0.6650890775890778
0 3000 0.5744255744255744 0.6615555079840792
0 3500 0.5704295704295704 0.6545385170385173
0 4000 0.577922077922078 0.6630601541315827
0 4500 0.5719280719280719 0.6547488226059655
0 5000 0.5919080919080919 0.6734761270475554
0 5500 0.5604395604395604 0.6474739546168117
0 6000 0.5704295704295704 0.6585719835719837
