There is a slight problem with how we store the webpages: one file per page produces too many files for pyserini, and the indexing appears to freeze. This script works around that by writing all webpages into a single file so they can be indexed, then removing that file once indexing is complete.

In [None]:
# Reformat the per-page JSON files into a single JSONL file (one JSON object
# per line) inside a temporary directory that is later handed to the indexer.
import os
import json

# exist_ok=True keeps this cell re-runnable when a previous run left the
# directory behind (os.mkdir would raise FileExistsError in that case).
os.makedirs('data_2017-09/webpages_tmp', exist_ok=True)
with open('data_2017-09/webpages_tmp/webpages_tmp.jsonl', 'w') as f:
    for i, file in enumerate(os.listdir('data_2017-09/webpages/')):
        # Use a context manager so each source file is closed right away
        # instead of leaving the handle for the garbage collector.
        with open(os.path.join('data_2017-09/webpages/', file), 'r') as page_file:
            webpage = json.load(page_file)
        f.write(json.dumps(webpage) + '\n')
        # Lightweight progress indicator: print every 10000th file index.
        if i % 10000 == 0: print(i)

In [2]:
# Sets up the pyserini sparse (Lucene) index.
# Input: the temporary directory "data_2017-09/webpages_tmp" read as a JsonCollection.
# Output: the index written to "data_2017-09/pyserini/pyserini_index".
# Uses the custom stopword list, 10 indexing threads, and verbose logging.
!python3 -m pyserini.index.lucene --stopwords "data_2017-09/stopwords.txt" --collection JsonCollection --input "data_2017-09/webpages_tmp" --index "data_2017-09/pyserini/pyserini_index" --generator DefaultLuceneDocumentGenerator --threads 10 -verbose

2022-05-07 11:43:17,504 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to DEBUG
2022-05-07 11:43:17,506 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data_2017-09/webpages_tmp
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-05-07 11:43:17,507 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 10
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-05-07 11:43:17,508 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [3]:
# Remove the temporary JSONL file and its directory once indexing is complete.
import os  # imported here so the cell also works on a fresh kernel

tmp_dir = 'data_2017-09/webpages_tmp'
tmp_file = os.path.join(tmp_dir, 'webpages_tmp.jsonl')

# Guard both steps so re-running the cell (or running it when the temp file
# was never created) is a no-op instead of raising FileNotFoundError.
if os.path.exists(tmp_file):
    os.remove(tmp_file)
if os.path.isdir(tmp_dir):
    # os.rmdir only removes an empty directory, so any unexpected extra
    # files in it still surface as an error rather than being deleted.
    os.rmdir(tmp_dir)

In [4]:
from eval import eval

# note: v2 runs are with the queries that have been updated to resolve the wiki mobile link error

### BM25 for full run

In [1]:
# BM25 search over the full train queries with k1=8, b=0.99, keeping 10 hits per query.
# The run is written to out/bm25_runs/v2_run.train_8_0.99.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries/queries_train.tsv" --output out/bm25_runs/v2_run.train_8_0.99.txt --bm25 --k1 8 --b 0.99

# Note: MAP = MRR when there is exactly one relevant result per query, see
# https://stats.stackexchange.com/questions/127041/mean-average-precision-vs-mean-reciprocal-rank
# Alternative evaluation through pyserini's trec_eval wrapper (disabled):
#!python3 -m pyserini.eval.trec_eval -m P.1 "data_2017-09/queries/relevance_scores.txt" out/bm25_runs/run.val_4_0.9.txt

Setting BM25 parameters: k1=8.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries/queries_train.tsv topics, saving to out/bm25_runs/v2_run.train_8_0.99.txt...
100%|███████████████████████████████████| 128404/128404 [36:34<00:00, 58.52it/s]


In [5]:
# Score a BM25 run on the full validation queries and report P@1 and MRR@10.
# (The run file name suggests k1=7, b=0.99 on the val split.)
ru = eval.load_run('out/bm25_runs/v2_run.val_7_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
# Each metric takes (relevance scores, run, query ids); print them in order.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.20020855057351408
MRR@10:  0.27496974154625464


### BM25 for only last

In [2]:
# BM25 search over the "only last" train queries with k1=4, b=0.9, keeping 10 hits per query.
# The run is written to out/bm25_runs/v2_run.onlylast.train_4_0.9.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_onlylast/queries_train.tsv" --output out/bm25_runs/v2_run.onlylast.train_4_0.9.txt --bm25 --k1 4 --b 0.9

Setting BM25 parameters: k1=4.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_onlylast/queries_train.tsv topics, saving to out/bm25_runs/v2_run.onlylast.train_4_0.9.txt...
100%|███████████████████████████████████| 128404/128404 [25:58<00:00, 82.40it/s]


In [9]:
# Score a BM25 "only last" run and report P@1 and MRR@10.
# NOTE(review): the run file is named "test" but the query ids are loaded
# from queries_onlylast/queries_val.tsv - confirm the intended split.
ru = eval.load_run('out/bm25_runs/v2_run.onlylast.test_4_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_onlylast/queries_val.tsv')
# Each metric takes (relevance scores, run, query ids); print them in order.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.2286235662148071
MRR@10:  0.30180203316119714


### BM25 for remove last

In [8]:
# BM25 search over the "remove last" test queries with k1=7, b=0.99, keeping 10 hits per query.
# The run is written to out/bm25_runs/run.removelast.test_7_0.99.txt.
# NOTE(review): the output name lacks the "v2_" prefix used by the other runs -
# confirm whether this run used the updated (v2) queries.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_removelast/queries_test.tsv" --output out/bm25_runs/run.removelast.test_7_0.99.txt --bm25 --k1 7 --b 0.99

Setting BM25 parameters: k1=7.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_removelast/queries_test.tsv topics, saving to out/bm25_runs/run.removelast.test_7_0.99.txt...
100%|█████████████████████████████████████| 15249/15249 [03:26<00:00, 73.80it/s]


In [10]:
# Score a BM25 "remove last" run and report P@1 and MRR@10.
# NOTE(review): the run file is named "test" but the query ids are loaded
# from queries_removelast/queries_val.tsv - confirm the intended split.
ru = eval.load_run('out/bm25_runs/run.removelast.test_7_0.99.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries_removelast/queries_val.tsv')
# Each metric takes (relevance scores, run, query ids); print them in order.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.08192127215849844
MRR@10:  0.12333250057930666


### For easy evaluation of semantic search baseline

In [7]:
# Score a semantic-search baseline run and report P@1 and MRR@10.
# NOTE(review): the run file is named "val_onlylast" while the query ids come
# from queries/queries_val.tsv - presumably the ids match across query
# variants; verify.
ru = eval.load_run('out/semantic_runs/v2_run.val_onlylast_msmarco-distilbert-cos-v5.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
# Each metric takes (relevance scores, run, query ids); print them in order.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.17127215849843588
MRR@10:  0.22523017114388463


### For semantic fine tune CSV validation analysis

In [2]:
import csv
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_19-33-33/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_12-01-23/eval/Information-Retrieval_evaluation_results.csv') as csvfile:

#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-18_13-27-41/eval/Information-Retrieval_evaluation_results.csv') as csvfile: #cosine
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-19_07-51-48/eval/Information-Retrieval_evaluation_results.csv') as csvfile: #dot

#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-05-21_20-12-34/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
#with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-05-22_10-35-21/eval/Information-Retrieval_evaluation_results.csv') as csvfile:
with open('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-06-01_12-04-50/eval/Information-Retrieval_evaluation_results.csv', newline='') as csvfile:
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends for files passed to csv.reader.
    reader = csv.reader(csvfile)
    # Print only the columns of interest for every row, header row included.
    # Per the header row these indices are: epoch, steps, cos_sim-Precision@1,
    # cos_sim-MRR@10, dot_score-Precision@1, dot_score-MRR@10.
    for row in reader:
        print(*(row[col] for col in (0, 1, 6, 14, 21, 29)))

epoch steps cos_sim-Precision@1 cos_sim-MRR@10 dot_score-Precision@1 dot_score-MRR@10
0 3500 0.23559697601668406 0.3076395095751195 0.23559697601668406 0.3076395095751195
0 -1 0.2315563086548488 0.30012126657232197 0.2315563086548488 0.30012126657232197
1 3500 0.22940563086548488 0.29929678836254725 0.22940563086548488 0.29929678836254725
1 -1 0.24094108446298226 0.31088957785060495 0.24094108446298226 0.31088957785060495
2 3500 0.24276590198123044 0.3119622271049537 0.24276590198123044 0.3119622271049537
2 -1 0.2375521376433785 0.30603138913716404 0.2375521376433785 0.30603138913716404
3 3500 0.2391162669447341 0.30835844526375134 0.2391162669447341 0.30835844526375134
3 -1 0.23866006256517205 0.3075655961898148 0.23866006256517205 0.3075655961898148
4 3500 0.2415276329509906 0.3096220331198177 0.2415276329509906 0.3096220331198177
4 -1 0.24022419186652763 0.3087839256748932 0.24022419186652763 0.3087839256748932


### For evaluation of semantic finetune

In [6]:
# Score a fine-tuned semantic-search run and report P@1 and MRR@10.
# NOTE(review): the run file is named "val_onlylast" while the query ids come
# from queries/queries_val.tsv - presumably the ids match across query
# variants; verify.
ru = eval.load_run('out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-06-02_21-24-49/eval/run.val_onlylast.txt')
rs = eval.load_rel_scores('data_2017-09/queries/relevance_scores.txt')
q_ids = eval.load_query_ids('data_2017-09/queries/queries_val.tsv')
# Each metric takes (relevance scores, run, query ids); print them in order.
for label, metric in (('P@1: ', eval.compute_p1), ('MRR@10: ', eval.compute_mrr10)):
    print(label, metric(rs, ru, q_ids))

P@1:  0.21584984358706985
MRR@10:  0.2752359642153708
