Our webpage storage format (one JSON file per webpage) produces too many files for pyserini, and indexing appears to freeze. The cells below make the collection indexable by copying all webpages into a single temporary JSONL file, which is removed once indexing is complete.

In [None]:
# Merge the per-webpage JSON files into a single JSONL file (one document per
# line) so that pyserini only has to read one input file.
import os
import json

webpages_dir = 'data_2017-09/webpages/'
tmp_dir = 'data_2017-09/webpages_tmp'

# exist_ok keeps the cell re-runnable after an interrupted or repeated run;
# os.mkdir raised FileExistsError the second time around.
os.makedirs(tmp_dir, exist_ok=True)
with open(os.path.join(tmp_dir, 'webpages_tmp.jsonl'), 'w') as f:
    for i, file in enumerate(os.listdir(webpages_dir)):
        # Close every input as soon as it is parsed; the previous
        # json.load(open(...)) form left one handle open per webpage.
        with open(os.path.join(webpages_dir, file), 'r') as page_file:
            webpage = json.load(page_file)
        f.write(json.dumps(webpage) + '\n')
        if i % 10000 == 0: print(i)  # coarse progress indicator

In [6]:
# Build the pyserini (Lucene) sparse index from the temporary JSONL collection.
# Reads data_2017-09/webpages_tmp as a JsonCollection, applies the custom
# stopword list, and writes the index to data_2017-09/pyserini/pyserini_index_rm3
# using 10 threads.
# --storeDocvectors is passed here; presumably the RM3 runs against this index
# need the stored document vectors -- TODO confirm.
# NOTE(review): the plain-BM25 cells read data_2017-09/pyserini/pyserini_index,
# which is not built by any cell visible in this notebook.
!python3 -m pyserini.index.lucene --stopwords "data_2017-09/stopwords.txt" --collection JsonCollection --input "data_2017-09/webpages_tmp" --index "data_2017-09/pyserini/pyserini_index_rm3" --generator DefaultLuceneDocumentGenerator --threads 10 -verbose --storeDocvectors

2022-06-14 13:39:43,824 INFO  [main] index.IndexCollection (IndexCollection.java:636) - Setting log level to DEBUG
2022-06-14 13:39:43,826 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-06-14 13:39:43,826 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: data_2017-09/webpages_tmp
2022-06-14 13:39:43,827 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-06-14 13:39:43,827 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-06-14 13:39:43,827 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 10
2022-06-14 13:39:43,828 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-06-14 13:39:43,828 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-06-14 13:39:43,828 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [7]:
# Remove the temporary single-file collection once indexing is complete.
import os  # imported here so the cell also works on a fresh kernel

tmp_dir = 'data_2017-09/webpages_tmp'
tmp_file = os.path.join(tmp_dir, 'webpages_tmp.jsonl')

# Guard each step so that re-running the cell (or running it when the temp
# file was never created) is a no-op instead of raising FileNotFoundError.
if os.path.isfile(tmp_file):
    os.remove(tmp_file)
if os.path.isdir(tmp_dir):
    os.rmdir(tmp_dir)  # still raises OSError if unexpected files remain

In [5]:
from eval import eval

# note: v2 runs are with the queries that have been updated to resolve the wiki mobile link error

### BM25 for full run

In [1]:
# Plain BM25 retrieval (k1=8, b=0.99) for the full test queries against
# pyserini_index, in batches of 50 on 10 threads. The top 10 hits per query are
# written to out/bm25_runs/v2_run.test_8_0.99.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries/queries_test.tsv" --output out/bm25_runs/v2_run.test_8_0.99.txt --bm25 --k1 8 --b 0.99

Setting BM25 parameters: k1=8.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries/queries_test.tsv topics, saving to out/bm25_runs/v2_run.test_8_0.99.txt...
100%|█████████████████████████████████████| 15249/15249 [04:28<00:00, 56.71it/s]


In [1]:
# BM25 with RM3 (k1=8, b=0.99; original query weight 0.9, 1 feedback doc,
# 10 feedback terms) for the full test queries against pyserini_index_rm3.
# The top 10 hits per query are written to
# out/bm25rm3_runs/v2_run.test_8_0.99_0.9_1_10.txt.
# Unlike the plain BM25 command, no --batch-size / --threads are passed here.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --hits 10 --index "data_2017-09/pyserini/pyserini_index_rm3" --topics "data_2017-09/queries/queries_test.tsv" --output out/bm25rm3_runs/v2_run.test_8_0.99_0.9_1_10.txt --rm3 --k1 8 --b 0.99 --original_query_weight 0.9 --fb_docs 1 --fb_terms 10

Setting BM25 parameters: k1=8.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries/queries_test.tsv topics, saving to out/bm25rm3_runs/v2_run.test_8_0.99_0.9_1_10.txt...
100%|███████████████████████████████████| 15249/15249 [1:10:16<00:00,  3.62it/s]


In [8]:
# Evaluate the BM25+RM3 run produced for the full test queries.
run_path = 'out/bm25rm3_runs/v2_run.test_8_0.99_0.9_1_10.txt'
rel_path = 'data_2017-09/queries/relevance_scores.txt'
queries_path = 'data_2017-09/queries/queries_test.tsv'

# Load the run, the relevance scores and the query ids (same order as before).
ru = eval.load_run(run_path)
rs = eval.load_rel_scores(rel_path)
q_ids = eval.load_query_ids(queries_path)

# Compute and report each metric in turn.
p_at_1 = eval.compute_p1(rs, ru, q_ids)
print('P@1: ', p_at_1)
mrr_at_10 = eval.compute_mrr10(rs, ru, q_ids)
print('MRR@10: ', mrr_at_10)

P@1:  0.19352088661551578
MRR@10:  0.254706761931827


### BM25 for only last

In [6]:
# Plain BM25 retrieval (k1=4, b=0.9) for the queries_onlylast test queries
# against pyserini_index, in batches of 50 on 10 threads. The top 10 hits per
# query are written to out/bm25_runs/v2_run.onlylast.test_4_0.9.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_onlylast/queries_test.tsv" --output out/bm25_runs/v2_run.onlylast.test_4_0.9.txt --bm25 --k1 4 --b 0.9

Setting BM25 parameters: k1=4.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_onlylast/queries_test.tsv topics, saving to out/bm25_runs/v2_run.onlylast.test_4_0.9.txt...
100%|█████████████████████████████████████| 15249/15249 [03:32<00:00, 71.66it/s]


In [2]:
# BM25 with RM3 (k1=4, b=0.9; original query weight 0.9, 1 feedback doc,
# 10 feedback terms) for the queries_onlylast test queries against
# pyserini_index_rm3. The top 10 hits per query are written to
# out/bm25rm3_runs/v2_run.onlylast.test_4_0.9_0.9_1_10.txt.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --hits 10 --index "data_2017-09/pyserini/pyserini_index_rm3" --topics "data_2017-09/queries_onlylast/queries_test.tsv" --output out/bm25rm3_runs/v2_run.onlylast.test_4_0.9_0.9_1_10.txt --rm3 --k1 4 --b 0.9 --original_query_weight 0.9 --fb_docs 1 --fb_terms 10

Setting BM25 parameters: k1=4.0, b=0.9
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_onlylast/queries_test.tsv topics, saving to out/bm25rm3_runs/v2_run.onlylast.test_4_0.9_0.9_1_10.txt...
100%|█████████████████████████████████████| 15249/15249 [31:06<00:00,  8.17it/s]


In [7]:
# Evaluate the BM25+RM3 run produced for the queries_onlylast test queries.
run_path = 'out/bm25rm3_runs/v2_run.onlylast.test_4_0.9_0.9_1_10.txt'
rel_path = 'data_2017-09/queries/relevance_scores.txt'
queries_path = 'data_2017-09/queries_onlylast/queries_test.tsv'

# Load the run, the relevance scores and the query ids (same order as before).
ru = eval.load_run(run_path)
rs = eval.load_rel_scores(rel_path)
q_ids = eval.load_query_ids(queries_path)

# Compute and report each metric in turn.
p_at_1 = eval.compute_p1(rs, ru, q_ids)
print('P@1: ', p_at_1)
mrr_at_10 = eval.compute_mrr10(rs, ru, q_ids)
print('MRR@10: ', mrr_at_10)

P@1:  0.22060462981179094
MRR@10:  0.286381984975336


### BM25 for remove last

In [8]:
# Plain BM25 retrieval (k1=7, b=0.99) for the queries_removelast test queries
# against pyserini_index, in batches of 50 on 10 threads. The top 10 hits per
# query are written to out/bm25_runs/run.removelast.test_7_0.99.txt.
# NOTE(review): the output name has no "v2_" prefix, unlike the full and
# onlylast runs -- confirm whether this run used the updated (v2) queries.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --batch-size 50 --threads 10 --hits 10 --index "data_2017-09/pyserini/pyserini_index" --topics "data_2017-09/queries_removelast/queries_test.tsv" --output out/bm25_runs/run.removelast.test_7_0.99.txt --bm25 --k1 7 --b 0.99

Setting BM25 parameters: k1=7.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_removelast/queries_test.tsv topics, saving to out/bm25_runs/run.removelast.test_7_0.99.txt...
100%|█████████████████████████████████████| 15249/15249 [03:26<00:00, 73.80it/s]


In [3]:
# BM25 with RM3 (k1=7, b=0.99; original query weight 0.9, 1 feedback doc,
# 10 feedback terms) for the queries_removelast test queries against
# pyserini_index_rm3. The top 10 hits per query are written to
# out/bm25rm3_runs/run.removelast.test_7_0.99_0.9_1_10.txt.
# NOTE(review): the output name has no "v2_" prefix, unlike the full and
# onlylast runs -- confirm whether this run used the updated (v2) queries.
!python3 -m pyserini.search.lucene --stopwords "data_2017-09/stopwords.txt" --hits 10 --index "data_2017-09/pyserini/pyserini_index_rm3" --topics "data_2017-09/queries_removelast/queries_test.tsv" --output out/bm25rm3_runs/run.removelast.test_7_0.99_0.9_1_10.txt --rm3 --k1 7 --b 0.99 --original_query_weight 0.9 --fb_docs 1 --fb_terms 10

Setting BM25 parameters: k1=7.0, b=0.99
Using custom stopwords=data_2017-09/stopwords.txt
Running data_2017-09/queries_removelast/queries_test.tsv topics, saving to out/bm25rm3_runs/run.removelast.test_7_0.99_0.9_1_10.txt...
100%|█████████████████████████████████████| 15249/15249 [46:53<00:00,  5.42it/s]


In [6]:
# Evaluate the BM25+RM3 run produced for the queries_removelast test queries.
run_path = 'out/bm25rm3_runs/run.removelast.test_7_0.99_0.9_1_10.txt'
rel_path = 'data_2017-09/queries/relevance_scores.txt'
queries_path = 'data_2017-09/queries_removelast/queries_test.tsv'

# Load the run, the relevance scores and the query ids (same order as before).
ru = eval.load_run(run_path)
rs = eval.load_rel_scores(rel_path)
q_ids = eval.load_query_ids(queries_path)

# Compute and report each metric in turn.
p_at_1 = eval.compute_p1(rs, ru, q_ids)
print('P@1: ', p_at_1)
mrr_at_10 = eval.compute_mrr10(rs, ru, q_ids)
print('MRR@10: ', mrr_at_10)

P@1:  0.08164469801298446
MRR@10:  0.11681601291575679


### For easy evaluation of semantic search baseline

In [9]:
# Evaluate the semantic-search baseline run (msmarco-distilbert-cos-v5 run file)
# on the full test queries.
run_path = 'out/semantic_runs/v2_run.test_full_msmarco-distilbert-cos-v5.txt'
rel_path = 'data_2017-09/queries/relevance_scores.txt'
queries_path = 'data_2017-09/queries/queries_test.tsv'

# Load the run, the relevance scores and the query ids (same order as before).
ru = eval.load_run(run_path)
rs = eval.load_rel_scores(rel_path)
q_ids = eval.load_query_ids(queries_path)

# Compute and report each metric in turn.
p_at_1 = eval.compute_p1(rs, ru, q_ids)
print('P@1: ', p_at_1)
mrr_at_10 = eval.compute_mrr10(rs, ru, q_ids)
print('MRR@10: ', mrr_at_10)

P@1:  0.16427306708636633
MRR@10:  0.22143749733263868


### For semantic fine tune CSV validation analysis

In [2]:
# Print selected validation metrics for every evaluation checkpoint recorded in
# one fine-tuning run's Information-Retrieval_evaluation_results.csv.
import csv

finetune_root = 'out/semantic_finetune_runs/'
results_csv = '/eval/Information-Retrieval_evaluation_results.csv'

# Other runs' results; uncomment one (and comment out the active one) to
# inspect it instead.
#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_19-33-33'
#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_onlylast-2022-05-17_12-01-23'

#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-18_13-27-41' #cosine
#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-dot-v5-queries_removelast-2022-05-19_07-51-48' #dot

#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-05-21_20-12-34'
#run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-05-22_10-35-21'
run_name = 'train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries-2022-06-01_12-04-50'

# Positions of the reported columns, in print order: epoch, steps,
# cos_sim-Precision@1, cos_sim-MRR@10, dot_score-Precision@1, dot_score-MRR@10.
columns = (0, 1, 6, 14, 21, 29)

# newline='' is what the csv module documents for files passed to csv.reader,
# so that newlines embedded in quoted fields are interpreted correctly.
with open(finetune_root + run_name + results_csv, newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; skip it rather than
            # failing with IndexError.
            continue
        print(*(row[c] for c in columns))

epoch steps cos_sim-Precision@1 cos_sim-MRR@10 dot_score-Precision@1 dot_score-MRR@10
0 3500 0.23559697601668406 0.3076395095751195 0.23559697601668406 0.3076395095751195
0 -1 0.2315563086548488 0.30012126657232197 0.2315563086548488 0.30012126657232197
1 3500 0.22940563086548488 0.29929678836254725 0.22940563086548488 0.29929678836254725
1 -1 0.24094108446298226 0.31088957785060495 0.24094108446298226 0.31088957785060495
2 3500 0.24276590198123044 0.3119622271049537 0.24276590198123044 0.3119622271049537
2 -1 0.2375521376433785 0.30603138913716404 0.2375521376433785 0.30603138913716404
3 3500 0.2391162669447341 0.30835844526375134 0.2391162669447341 0.30835844526375134
3 -1 0.23866006256517205 0.3075655961898148 0.23866006256517205 0.3075655961898148
4 3500 0.2415276329509906 0.3096220331198177 0.2415276329509906 0.3096220331198177
4 -1 0.24022419186652763 0.3087839256748932 0.24022419186652763 0.3087839256748932


### For evaluation of semantic finetune

In [11]:
# Evaluate the test run stored under the fine-tuned (queries_onlylast) model's
# eval directory.
run_path = 'out/semantic_finetune_runs/train_bi-encoder-mnrl-msmarco-distilbert-cos-v5-queries_onlylast-2022-06-02_21-24-49/eval/v2_run.test_onlylast.txt'
rel_path = 'data_2017-09/queries/relevance_scores.txt'
# NOTE(review): the run is an "onlylast" one, yet query ids are read from
# queries/ rather than queries_onlylast/ -- confirm this is intended (it is
# harmless only if both files list the same query ids).
queries_path = 'data_2017-09/queries/queries_test.tsv'

# Load the run, the relevance scores and the query ids (same order as before).
ru = eval.load_run(run_path)
rs = eval.load_rel_scores(rel_path)
q_ids = eval.load_query_ids(queries_path)

# Compute and report each metric in turn.
p_at_1 = eval.compute_p1(rs, ru, q_ids)
print('P@1: ', p_at_1)
mrr_at_10 = eval.compute_mrr10(rs, ru, q_ids)
print('MRR@10: ', mrr_at_10)

P@1:  0.21634205521673552
MRR@10:  0.27457028147565327
