There is a slight problem with how we store webpages: there are too many individual files for pyserini, and indexing appears to freeze. This script makes the collection indexable by writing all webpages into a single temporary JSONL file, then removing that file once indexing is complete.

In [None]:
# Reformat the per-page JSON files into a single JSONL file (one JSON document
# per line) so that pyserini has only one input file to index.
import os
import json

# exist_ok=True: re-running this cell (e.g. after an interrupted run) must not
# fail just because the temporary directory is already there.
os.makedirs('data_2017-09/webpages_tmp', exist_ok=True)
with open(os.path.join('data_2017-09/webpages_tmp', 'webpages_tmp.jsonl'), 'w') as f:
    for i, file in enumerate(os.listdir('data_2017-09/webpages/')):
        # Open each page in its own `with` so the handle is closed right away;
        # the directory holds a very large number of files.
        with open(os.path.join('data_2017-09/webpages/', file), 'r') as page_file:
            webpage = json.load(page_file)
        f.write(json.dumps(webpage) + '\n')
        # Lightweight progress indicator.
        if i % 10000 == 0:
            print(i)

In [2]:
# Sets up the pyserini sparse (Lucene) index.
# Input is the temporary directory data_2017-09/webpages_tmp (read as a
# JsonCollection); the index is written to data_2017-09/pyserini/pyserini_index
# using the custom stopword list and 10 indexing threads, with verbose logging.
!python3 -m pyserini.index.lucene --stopwords "data_2017-09/stopwords.txt" --collection JsonCollection --input "data_2017-09/webpages_tmp" --index "data_2017-09/pyserini/pyserini_index" --generator DefaultLuceneDocumentGenerator --threads 10 -verbose

2022-05-07 11:43:17,504 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to DEBUG
2022-05-07 11:43:17,506 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data_2017-09/webpages_tmp
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 10
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [3]:
# Remove the temporary JSONL collection now that indexing is done.
# Each step is guarded so that re-running this cell, or running it when the
# temporary collection was never created, is a no-op instead of raising
# FileNotFoundError.
import os  # imported here so the cell does not depend on an earlier cell having run

if os.path.isfile('data_2017-09/webpages_tmp/webpages_tmp.jsonl'):
    os.remove('data_2017-09/webpages_tmp/webpages_tmp.jsonl')
# os.rmdir only removes an empty directory, so anything unexpected that was
# placed in the temporary directory still surfaces as an error.
if os.path.isdir('data_2017-09/webpages_tmp'):
    os.rmdir('data_2017-09/webpages_tmp')

In [2]:
from eval import eval

### BM25 for full run

In [1]:
# BM25 search (k1=8, b=0.99) over data_2017-09/queries/queries_test.tsv against
# the pyserini index, keeping 10 hits per query; the run is saved to
# out/bm25_runs/run.test_8_0.99.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries/queries_test.tsv" --output out/bm25_runs/run.test_8_0.99.txt --bm25 --k1 8 --b 0.99

# note MAP = MRR when there is exactly one relevant result
# https://stats.stackexchange.com/questions/127041/mean-average-precision-vs-mean-reciprocal-rank
#!python3 -m pyserini.eval.trec_eval -m P.1 "data_2017-09/queries/relevance_scores.txt" out/bm25_runs/run.val_4_0.9.txt

Setting BM25 parameters: k1=8.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries/queries_test.tsv topics, saving to out/bm25_runs/run.test_8_0.99.txt...
100%|█████████████████████████████████████| 15249/15249 [04:37<00:00, 55.04it/s]


In [4]:
# Evaluate the full-query BM25 run written by the search cell above and print
# P@1 and MRR@10.
# `eval` is the project object brought in by `from eval import eval`; it
# shadows Python's builtin eval() in this notebook. What its helpers do
# internally is not visible here.
# NOTE(review): the run file was produced from queries_test.tsv, but the query
# ids are loaded from queries_val.tsv — confirm this split mismatch is
# intended, since it may affect which queries the metrics are averaged over.
ru = eval.load_run('out/bm25_runs/run.test_8_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
print('P@1: ', eval.compute_p1(rs,ru, q_ids))
print('MRR@10: ', eval.compute_mrr10(rs,ru, q_ids))

P@1:  0.20203336809176226
MRR@10:  0.27804125908270966


### BM25 for only last

In [5]:
# BM25 search (k1=4, b=0.9) over data_2017-09/queries_onlylast/queries_test.tsv,
# keeping 10 hits per query; the run is saved to
# out/bm25_runs/run.onlylast.test_4_0.9.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_onlylast/queries_test.tsv" --output out/bm25_runs/run.onlylast.test_4_0.9.txt --bm25 --k1 4 --b 0.9

Setting BM25 parameters: k1=4.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_onlylast/queries_test.tsv topics, saving to out/bm25_runs/run.onlylast.test_4_0.9.txt...
100%|█████████████████████████████████████| 15249/15249 [03:32<00:00, 71.60it/s]


In [7]:
# Evaluate the "only last" BM25 run written by the search cell above and print
# P@1 and MRR@10.
# NOTE(review): the run file was produced from queries_onlylast/queries_test.tsv,
# but the query ids are loaded from queries_onlylast/queries_val.tsv — confirm
# this split mismatch is intended.
ru = eval.load_run('out/bm25_runs/run.onlylast.test_4_0.9.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_onlylast/queries_val.tsv')
print('P@1: ', eval.compute_p1(rs,ru, q_ids))
print('MRR@10: ', eval.compute_mrr10(rs,ru, q_ids))

P@1:  0.23468456725755996
MRR@10:  0.31207113167651523


### BM25 for remove last

In [8]:
# BM25 search (k1=7, b=0.99) over data_2017-09/queries_removelast/queries_test.tsv,
# keeping 10 hits per query; the run is saved to
# out/bm25_runs/run.removelast.test_7_0.99.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_removelast/queries_test.tsv" --output out/bm25_runs/run.removelast.test_7_0.99.txt --bm25 --k1 7 --b 0.99

Setting BM25 parameters: k1=7.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_removelast/queries_test.tsv topics, saving to out/bm25_runs/run.removelast.test_7_0.99.txt...
100%|█████████████████████████████████████| 15249/15249 [03:26<00:00, 73.80it/s]


In [10]:
# Evaluate the "remove last" BM25 run written by the search cell above and
# print P@1 and MRR@10.
# NOTE(review): the run file was produced from queries_removelast/queries_test.tsv,
# but the query ids are loaded from queries_removelast/queries_val.tsv —
# confirm this split mismatch is intended.
ru = eval.load_run('out/bm25_runs/run.removelast.test_7_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_removelast/queries_val.tsv')
print('P@1: ', eval.compute_p1(rs,ru, q_ids))
print('MRR@10: ', eval.compute_mrr10(rs,ru, q_ids))

P@1:  0.08192127215849844
MRR@10:  0.12333250057930666


### For easy evaluation of semantic search baseline

In [6]:
# Evaluate a semantic-search baseline run (file produced outside this notebook)
# and print P@1 and MRR@10.
# NOTE(review): the run file name says "test_removelast", but the query ids are
# loaded from queries/queries_val.tsv — a different split name, and the
# queries/ directory rather than queries_removelast/ as used for the BM25
# remove-last evaluation. Confirm both are intended.
ru = eval.load_run('out/semantic_runs/run.test_removelast_msmarco-distilbert-cos-v5.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
print('P@1: ', eval.compute_p1(rs,ru, q_ids))
print('MRR@10: ', eval.compute_mrr10(rs,ru, q_ids))

P@1:  0.06654066736183524
MRR@10:  0.09665750119999321


### For semantic fine tune CSV validation analysis

In [3]:
import csv
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_19-33-33/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_12-01-23/eval/Information-Retrieval_evaluation_results.csv') as csvfile:

#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-18_13-27-41/eval/Information-Retrieval_evaluation_results.csv') as csvfile: #cosine
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-19_07-51-48/eval/Information-Retrieval_evaluation_results.csv') as csvfile: #dot

#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-05-21_20-12-34/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-05-22_10-35-21/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
# Print the validation metrics logged for each checkpoint of one fine-tuning run.
# Columns are picked by position; per the header row this cell prints they are:
#   0 epoch, 1 steps, 6 cos_sim-Precision@1, 14 cos_sim-MRR@10,
#   21 dot_score-Precision@1, 29 dot_score-MRR@10.
# newline='' is the form the csv module documentation requires for files
# handed to csv.reader, so that line endings inside fields are parsed correctly.
with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_removelast-2022-05-22_18-44-28/eval/Information-Retrieval_evaluation_results.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # csv.reader yields [] for a blank line (e.g. a trailing newline);
        # skip it rather than fail with IndexError.
        if not row:
            continue
        print(row[0], row[1], row[6], row[14], row[21], row[29])

epoch steps cos_sim-Precision@1 cos_sim-MRR@10 dot_score-Precision@1 dot_score-MRR@10
0 3500 0.04959593326381648 0.07674741277289501 0.04959593326381648 0.07674741277289501
0 -1 0.05969760166840459 0.09267510075806472 0.05969760166840459 0.09267510075806472
1 3500 0.05194212721584984 0.08270819885131658 0.05194212721584984 0.08270819885131658
1 -1 0.05298488008342023 0.08063050341791222 0.05298488008342023 0.08063050341791222
2 3500 0.05396246089676746 0.08406054483837325 0.05396246089676746 0.08406054483837325
2 -1 0.0513555787278415 0.07936466557425889 0.0513555787278415 0.07936466557425889
3 3500 0.049791449426485924 0.07600654617077973 0.049791449426485924 0.07600654617077973
3 -1 0.049661105318039626 0.07581180586589875 0.049661105318039626 0.07581180586589875
4 3500 0.05070385818561001 0.07780074316831358 0.05070385818561001 0.07780074316831358
4 -1 0.04998696558915537 0.07757600302894882 0.04998696558915537 0.07757600302894882


### For evaluation of semantic finetune

In [9]:
# Evaluate the run file stored under the fine-tuned model's eval/ directory
# (produced outside this notebook) and print P@1 and MRR@10.
# NOTE(review): the run file name says "test_removelast", but the query ids are
# loaded from queries/queries_val.tsv — a different split name, and the
# queries/ directory rather than queries_removelast/. Confirm both are intended.
ru = eval.load_run('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_removelast-2022-05-22_18-44-28/eval/run.test_removelast.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
print('P@1: ', eval.compute_p1(rs,ru, q_ids))
print('MRR@10: ', eval.compute_mrr10(rs,ru, q_ids))

P@1:  0.05885036496350365
MRR@10:  0.09088563649303986
