In [2]:
from pyserini.search.lucene import LuceneSearcher
import pickle
import json
import re

In [11]:
# Load the scraped examples. The file is opened in a `with` block so the handle is
# closed as soon as unpickling finishes instead of lingering until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only use it on files you produced/trust.
with open('../../data/reddit/bbc_news_scrape_raw.pkl', 'rb') as f:
    training_data = pickle.load(f)

# Show the fields available on each example (cell output).
training_data[0].keys()

dict_keys(['post_id', 'comment_id', 'url', 'ancestors', 'text', 'full_context'])

In [16]:
# Format documents and relevance data for pyserini (https://github.com/castorini/pyserini/).
#
# Three files are written from `training_data`:
#   * bbc_news_pyserini.jsonl -- one {"id": ..., "contents": ...} JSON object per line
#   * bbc_news_rel.txt        -- one "<query id> 0 <doc id> 1" relevance line per example
#   * bbc_news_queries.tsv    -- one "<query id>\t<query text>" line per example
# Example i produces both document i and query i, so every query has exactly one
# relevant document (its own).

pyserini_retrieval_docs = []
relevance_scores = []
queries = []

# params
REMOVE_HYPERLINK=True # small effect, barely changes scores

REMOVE_LAST_COMMENT=False  # large decrease in performance, as expected

ONLY_LAST_COMMENT=False  # only consulted when REMOVE_LAST_COMMENT is False


# can probably make this more efficient (single loop with write)
for i, example in enumerate(training_data):
    # NOTE(review): assumes example['text'] is a list of strings; if it were a single
    # string, join() would put a space between every character -- TODO confirm.
    doc = {"id": i, "contents": " ".join(example['text'])}
    pyserini_retrieval_docs.append(doc)

    # query i is relevant (grade 1) to document i only
    relevance_score = str(i) + ' 0 ' + str(i) + ' 1'
    relevance_scores.append(relevance_score)

    # will likely change to different formats (ignore current comment, only current comment, remove links, etc.)
    context = example['full_context']
    if REMOVE_LAST_COMMENT:
        query = ' '.join(context[:-1])
    elif ONLY_LAST_COMMENT:
        query = context[-1]
    else:
        query = ' '.join(context)

    # Each query must occupy exactly one line and one column of the TSV file, so
    # newlines, carriage returns and tabs inside the text are flattened to spaces.
    query = re.sub(r'[\t\r\n]', ' ', query)

    url = example['url']
    if REMOVE_HYPERLINK and url:
        # Literal replacement: a URL contains regex metacharacters ('.', '?', '+', ...),
        # so using it as a re.sub pattern can fail to match or match the wrong text.
        query = query.replace(url, '')

    queries.append(str(i) + '\t' + query)


# Write everything as UTF-8 explicitly so the output does not depend on the
# platform's default encoding (queries may contain non-ASCII characters).
with open('../../data/reddit/pyserini/bbc_news_pyserini.jsonl', 'w', encoding='utf-8') as f:
    for doc in pyserini_retrieval_docs:
        f.write(json.dumps(doc) + '\n')

with open('../../data/reddit/pyserini/bbc_news_rel.txt', 'w', encoding='utf-8') as f:
    for rs in relevance_scores:
        f.write(rs + '\n')

with open('../../data/reddit/pyserini/bbc_news_queries.tsv', 'w', encoding='utf-8') as f:
    for query in queries:
        f.write(query + '\n')

In [17]:
# sets up the pyserini sparse index
# Reads the collection from ../../data/reddit/pyserini/ as a JsonCollection and writes
# the index to ./bbc_news_sparse (the name the search cell passes via --index).
# NOTE(review): the input directory also holds the qrels (.txt) and topics (.tsv) files
# written by the formatting cell -- confirm the indexer ignores non-JSON files.
!python3 -m pyserini.index.lucene --collection JsonCollection --input ../../data/reddit/pyserini/ --index bbc_news_sparse --generator DefaultLuceneDocumentGenerator --threads 1

2022-03-13 09:22:31,769 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-03-13 09:22:31,771 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-03-13 09:22:31,772 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: ../../data/reddit/pyserini/
2022-03-13 09:22:31,772 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-03-13 09:22:31,772 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-03-13 09:22:31,773 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2022-03-13 09:22:31,773 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-03-13 09:22:31,773 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-03-13 09:22:31,773 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [18]:
# Run BM25 retrieval for every query in the topics TSV against the bbc_news_sparse index.
# The ranked results are written to run.bbc_sparse.txt, which the evaluation cell reads.
!python3 -m pyserini.search.lucene --index bbc_news_sparse --topics ../../data/reddit/pyserini/bbc_news_queries.tsv --output run.bbc_sparse.txt --bm25

Running ../../data/reddit/pyserini/bbc_news_queries.tsv topics, saving to run.bbc_sparse.txt...
100%|█████████████████████████████████████████| 152/152 [00:01<00:00, 92.53it/s]


In [19]:
# Score the BM25 run against the relevance file, requesting MAP, recall.1 and recall.5.
# note MAP = MRR when there is exactly one relevant result
# (the relevance file lists exactly one relevant document per query, so that holds here)
# https://stats.stackexchange.com/questions/127041/mean-average-precision-vs-mean-reciprocal-rank
!python3 -m pyserini.eval.trec_eval -m map -m recall.1 -m recall.5 ../../data/reddit/pyserini/bbc_news_rel.txt run.bbc_sparse.txt

Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-m', 'map', '-m', 'recall.1', '-m', 'recall.5', '../../data/reddit/pyserini/bbc_news_rel.txt', 'run.bbc_sparse.txt']
Results:
map                   	all	0.2251
recall_1              	all	0.0987

