In [65]:
from pyserini.search.lucene import LuceneSearcher
import pickle
import json
import re

In [70]:
# load training data
training_data = pickle.load(open('../../data/reddit/bbc_news_scrape_raw.pkl', 'rb'))
training_data[0].keys()

dict_keys(['post_id', 'comment_id', 'url', 'ancestors', 'text', 'full_context'])

In [74]:
# format documents and relevance data for pyserini (https://github.com/castorini/pyserini/)

pyserini_retrieval_docs = []
relevance_scores = []
queries = []

# params
REMOVE_HYPERLINK=False # small effect, barely changes scores

REMOVE_LAST_COMMENT=False  # large decrease in performance, as expected

ONLY_LAST_COMMENT=False # interestingly, this makes the model perform the best :)


# can probably make this more efficient (single loop with write)
for i,example in enumerate(training_data):
    # will likely change to different formats (ignore current comment, only current comment, remove links, etc.)
    if REMOVE_LAST_COMMENT:
        query = re.sub('\n', ' ', ' '.join(example['full_context'][:-1]))
    elif ONLY_LAST_COMMENT:
        query = re.sub('\n', ' ', example['full_context'][-1])
    else:
        query = re.sub('\n', ' ', ' '.join(example['full_context']))

    if REMOVE_HYPERLINK:
        query = re.sub(example['url'], '', query)

    # proxy for when a comment ONLY has URL, ignore for now
    if len(query) < 3:
        continue

    queries.append(str(i) + '\t' + query)

    doc = {"id": i, "contents": example['text']}
    pyserini_retrieval_docs.append(doc)

    relevance_score = str(i) + ' 0 ' + str(i) + ' 1'
    relevance_scores.append(relevance_score)



with open('../../data/reddit/pyserini/bbc_news_pyserini.jsonl', 'w') as f:
    for doc in pyserini_retrieval_docs:
        f.write(json.dumps(doc) + '\n')

with open('../../data/reddit/pyserini/bbc_news_rel.txt', 'w') as f:
    for rs in relevance_scores:
        f.write(rs + '\n')

with open('../../data/reddit/pyserini/bbc_news_queries.tsv', 'w') as f:
    for query in queries:
        f.write(query + '\n')

In [75]:
# sets up the pyserini sparse index
!python3 -m pyserini.index.lucene --collection JsonCollection --input ../../data/reddit/pyserini/ --index bbc_news_sparse --generator DefaultLuceneDocumentGenerator --threads 1

2022-03-18 17:36:41,977 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2022-03-18 17:36:41,979 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2022-03-18 17:36:41,980 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: ../../data/reddit/pyserini/
2022-03-18 17:36:41,980 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2022-03-18 17:36:41,981 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2022-03-18 17:36:41,981 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2022-03-18 17:36:41,981 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Language: en
2022-03-18 17:36:41,981 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Stemmer: porter
2022-03-18 17:36:41,982 INFO  [main] index.IndexCollection (IndexCollection.java:65

In [76]:
!python3 -m pyserini.search.lucene --index bbc_news_sparse --topics ../../data/reddit/pyserini/bbc_news_queries.tsv --output run.bbc_sparse.txt --bm25 --k1 3 --b 0.9

# note MAP = MRR when there is exactly one relevant result
# https://stats.stackexchange.com/questions/127041/mean-average-precision-vs-mean-reciprocal-rank
!python3 -m pyserini.eval.trec_eval -m map -m P.1 ../../data/reddit/pyserini/bbc_news_rel.txt run.bbc_sparse.txt

Setting BM25 parameters: k1=3.0, b=0.9
Running ../../data/reddit/pyserini/bbc_news_queries.tsv topics, saving to run.bbc_sparse.txt...
100%|█████████████████████████████████████████| 152/152 [00:01<00:00, 96.07it/s]
Downloading https://search.maven.org/remotecontent?filepath=uk/ac/gla/dcs/terrierteam/jtreceval/0.0.5/jtreceval-0.0.5-jar-with-dependencies.jar to /home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar...
/home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar already exists!
Skipping download.
Running command: ['java', '-jar', '/home/kjros2/.cache/pyserini/eval/jtreceval-0.0.5-jar-with-dependencies.jar', '-m', 'map', '-m', 'P.1', '../../data/reddit/pyserini/bbc_news_rel.txt', 'run.bbc_sparse.txt']
Results:
map                   	all	0.4802
P_1                   	all	0.3750



Qualitative Analysis

Some observations
1. Low number of documents is likely leading to good performance

In [57]:
searcher = LuceneSearcher('bbc_news_sparse')

In [58]:
for i, query in enumerate(queries):
    hits = searcher.search(query)
    top5 = [int(x.docid) for x in hits]
    print(top5, i)
    if i in top5: print('FOUND')

[0, 44, 101, 68, 26, 100, 54, 78, 35, 149] 0
FOUND
[0, 44, 43, 1, 68, 148, 101, 100, 8, 23] 1
FOUND
[0, 44, 95, 59, 79, 78, 108, 131, 66, 23] 2
[0, 44, 68, 56, 137, 101, 66, 54, 78, 95] 3
[0, 44, 101, 54, 68, 110, 100, 147, 78, 56] 4
[0, 44, 101, 68, 121, 56, 54, 110, 25, 131] 5
[0, 44, 68, 101, 100, 54, 131, 77, 123, 147] 6
[0, 44, 68, 54, 110, 101, 100, 27, 123, 85] 7
[8, 0, 44, 100, 54, 75, 68, 125, 121, 78] 8
FOUND
[0, 44, 66, 35, 68, 95, 8, 110, 131, 56] 9
[0, 44, 68, 11, 100, 54, 10, 123, 148, 101] 10
FOUND
[0, 44, 68, 11, 54, 100, 10, 123, 148, 101] 11
FOUND
[0, 44, 12, 68, 100, 8, 5, 33, 42, 78] 12
FOUND
[0, 44, 13, 68, 70, 54, 101, 110, 42, 147] 13
FOUND
[0, 44, 68, 15, 17, 110, 84, 54, 148, 107] 14
[0, 44, 68, 15, 17, 110, 84, 54, 148, 107] 15
FOUND
[0, 44, 15, 17, 68, 110, 54, 84, 148, 107] 16
[0, 44, 15, 17, 68, 110, 54, 84, 148, 107] 17
FOUND
[0, 44, 78, 79, 95, 112, 68, 123, 27, 66] 18
[0, 44, 19, 131, 4, 105, 111, 29, 123, 72] 19
FOUND
[0, 44, 118, 120, 21, 22, 52, 54, 2