In [1]:
import csv
from collections import Counter, defaultdict
from tqdm import tqdm
import json
import numpy as np
import gzip
from datetime import datetime
from sentence_transformers import SentenceTransformer
import os
import re
import joblib

# your modules are imported here
from indexing import Indexer, BasicInvertedIndex
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
from ranker import Ranker, BM25, CrossEncoderScorer
from vector_ranker import VectorRanker
from l2r import L2RFeatureExtractor, L2RRanker
from vectordb_ranker import VectorDBRanker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Development convenience: re-execute the local modules so edits made on disk
# are picked up without restarting the kernel. reload() does not rebind names
# that were pulled in with `from module import name`, so each from-import is
# repeated after its module has been reloaded.
from importlib import reload
import indexing
reload(indexing)
# BasicInvertedIndex is re-imported together with Indexer; otherwise the
# notebook keeps the stale pre-reload class object under that name.
from indexing import Indexer, BasicInvertedIndex
import document_preprocessor
reload(document_preprocessor)
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
import l2r
reload(l2r)
from l2r import L2RFeatureExtractor, L2RRanker

In [3]:
# Relative locations of the on-disk artifacts this notebook reads.
# Both end with a slash because paths are built by plain string concatenation.
model_prefix = '../models/'  # serialized models (*.joblib)
data_prefix = '../data/'     # JSON inputs, stopword list and saved indexes

In [4]:
# Category lists: the 'short' entry is used as the five-category set and the
# 'full' entry as the complete category set when building feature extractors.
# The encoding is given explicitly so the read does not depend on the platform
# default (same convention as the stopword file).
with open(data_prefix + 'rec_cats.json', 'r', encoding='utf-8') as f:
    rec_cats = json.load(f)
five_cats = rec_cats['short']
all_cats = rec_cats['full']

# JSON object keys are always strings, so they are converted back to int
# (presumably docids - confirm against the file that wrote doc_cat_info.json).
with open(data_prefix + 'doc_cat_info.json', 'r', encoding='utf-8') as f:
    doc_cat_info = {int(k): v for k, v in json.load(f).items()}

In [5]:
# with open('../interior_dic.json', 'r') as f:
#     query_alts_rels = json.load(f)
#     queries = list(query_alts_rels.keys())
#     for query in queries:
#         del query_alts_rels[query]['alt_qs'][5]
#         for i in range(5):
#             q = query_alts_rels[query]['alt_qs'][i]
#             query_alts_rels[query]['alt_qs'][i] = q[3:]
#         query_alts_rels[query]['scored_docs'] = [(l[0], l[1]) for l in query_alts_rels[query]['scored_docs']]
#     train_queries = queries[:41]
#     test_queries = queries[41:]
# with open('../train_data.json', 'w') as f:
#     train_data = {query: query_alts_rels[query] for query in train_queries}
#     test_data = {query: query_alts_rels[query] for query in test_queries}
#     json.dump(train_data, f, indent=2)
# with open('../test_data.json', 'w') as f:
#     json.dump(test_data, f, indent=2)


In [6]:
# aug1 = Doc2QueryAugmenter()
# aug2 = Doc2QueryAugmenter('doc2query/msmarco-t5-small-v1')
# aug3 = Doc2QueryAugmenter('google/flan-t5-small')
# prefix = "Generate a query for the following text: "

In [7]:
# with open('../final_data_with_categories.json', 'r') as f:
#     lines = f.readlines()
#     docs = []
#     for line in tqdm(lines):
#         doc = json.loads(line)
#         doc['alt_qs'] = aug1.get_queries(doc['text'], 5)
#         doc['dumb_qs_1'] = aug2.get_queries(doc['text'], 5)
#         doc['dumb_qs_2'] = aug3.get_queries(doc['text'], 5, prefix)
#         docs.append(doc)
# with open('../data/doc_dataset.jsonl', 'a') as f:
#     for doc in docs:
#         f.write(json.dumps(doc) + '\n')

In [8]:
# with open('../data/doc_dataset_old.jsonl', 'r') as f:
#     line = f.readline()
#     doc_inds = [m.start() for m in re.finditer('{"docid":', line)]
#     docs = []
#     for i in range(len(doc_inds)):
#         start = doc_inds[i]
#         end = len(line) if i == len(doc_inds) - 1 else doc_inds[i + 1]
#         doc_text = line[start:end]
#         doc = json.loads(doc_text)
#         docs.append(doc)

In [9]:
# with open('../data/doc_dataset.jsonl', 'a') as f:
#     for doc in docs:
#         f.write(json.dumps(doc) + '\n')

In [10]:
# Tokenizer shared by the feature extractors and rankers built later in the
# notebook; it is constructed with the regex pattern \w+.
doc_preproc = RegexTokenizer('\\w+')
# Stopword set: one entry per line of stopwords.txt, surrounding whitespace removed.
stopwords = set()
with open(data_prefix + 'stopwords.txt', 'r', encoding='utf-8') as file:
    # NOTE: the loop variable `stopword` stays bound at notebook scope after
    # this cell (holding the last raw line read, newline included). It is a
    # single string, not the collection - `stopwords` is the set to pass around.
    for stopword in file:
        stopwords.add(stopword.strip())
# Bare last expression so the notebook displays the number of distinct stopwords.
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 543'

In [11]:
# doc_base_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'alt_qs', data_prefix + 'doc_base_index')

In [12]:
def _load_index(index_name):
    """Create an empty BasicInvertedIndex and load it from ``data_prefix + index_name``."""
    loaded_index = BasicInvertedIndex()
    loaded_index.load(data_prefix + index_name)
    return loaded_index


# Previously built indexes are read back from disk instead of being rebuilt.
doc_base_index = _load_index('doc_base_index')
doc_small_index = _load_index('doc_small_index')
doc_flan_index = _load_index('doc_flan_index')
doc_index = _load_index('doc_index')
tit_index = _load_index('title_index')

In [13]:
# doc_small_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'dumb_qs_1', data_prefix + 'doc_small_index')
# doc_flan_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'dumb_qs_2', data_prefix + 'doc_flan_index')
# doc_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', '', data_prefix + 'doc_index')
# tit_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'title', '', data_prefix + 'title_index')


In [14]:
def query_runner(ranker, queries):
    """Run several queries through one ranker and average the scores per document.

    Args:
        ranker: Object exposing ``query(text)`` that returns a list of dicts,
            each with a ``'docid'`` and a ``'score'`` key.
        queries: Sequence of query strings. The first query decides which
            documents appear in the output and in what order.

    Returns:
        A list of ``{'docid': ..., 'score': ...}`` dicts, one per entry returned
        for the first query. Each score is the sum of that document's scores
        over all queries divided by ``len(queries)``; a query whose results do
        not contain the document contributes nothing to the sum but still
        counts in the divisor. Returns ``[]`` when ``queries`` is empty.
    """
    if len(queries) == 0:
        # Nothing to run; the first query is what defines the candidate set.
        return []

    per_query_results = [ranker.query(query) for query in queries]

    # One docid -> score table per query replaces a linear scan per lookup.
    # Only the first occurrence of a docid in a result list is kept.
    score_tables = []
    for results in per_query_results:
        table = {}
        for entry in results:
            if entry['docid'] not in table:
                table[entry['docid']] = entry['score']
        score_tables.append(table)

    num_queries = len(queries)
    final_scores = []
    for entry in per_query_results[0]:
        docid = entry['docid']
        total = sum(table[docid] for table in score_tables if docid in table)
        final_scores.append({'docid': docid, 'score': total / num_queries})
    return final_scores

In [15]:
def _load_int_keyed_json(filename):
    """Load the JSON object at ``data_prefix + filename`` with its keys converted to int.

    JSON object keys are always strings, so they are converted back to int on
    load. The file is opened as UTF-8 explicitly so the read does not depend on
    the platform default encoding.
    """
    with open(data_prefix + filename, 'r', encoding='utf-8') as f:
        return {int(key): value for key, value in json.load(f).items()}


# One raw-text mapping per document-text variant; each is handed to its own
# cross-encoder scorer and vector ranker further down.
raw_text_dict = _load_int_keyed_json('raw_text.json')
base_raw_text_dict = _load_int_keyed_json('base_raw_text.json')
small_raw_text_dict = _load_int_keyed_json('small_raw_text.json')
flan_raw_text_dict = _load_int_keyed_json('flan_raw_text.json')

In [16]:
# One cross-encoder scorer per raw-text mapping, constructed in the same order
# as the names on the left-hand side.
ce_scorer, bce_scorer, sce_scorer, fce_scorer = [
    CrossEncoderScorer(text_dict)
    for text_dict in (
        raw_text_dict,
        base_raw_text_dict,
        small_raw_text_dict,
        flan_raw_text_dict,
    )
]

In [17]:
def _make_feature_extractor(document_index, categories, scorer):
    """Build an L2RFeatureExtractor that differs only in index, category set and scorer.

    The title index, category info, tokenizer and stopword set are the same
    for every configuration. The stopword argument is the `stopwords` set; the
    similarly named `stopword` is only a leftover loop variable holding a
    single string.
    """
    return L2RFeatureExtractor(
        document_index, tit_index, doc_cat_info, doc_preproc,
        stopwords, set(categories), scorer,
    )


# Naming: first letter = document index (n: doc_index, b: doc_base_index,
# s: doc_small_index, f: doc_flan_index); second letter = category set
# (n: empty, f: five_cats, a: all_cats).
nn_feat_extract = _make_feature_extractor(doc_index, (), ce_scorer)
nf_feat_extract = _make_feature_extractor(doc_index, five_cats, ce_scorer)
na_feat_extract = _make_feature_extractor(doc_index, all_cats, ce_scorer)
bf_feat_extract = _make_feature_extractor(doc_base_index, five_cats, bce_scorer)
sf_feat_extract = _make_feature_extractor(doc_small_index, five_cats, sce_scorer)
ff_feat_extract = _make_feature_extractor(doc_flan_index, five_cats, fce_scorer)
ba_feat_extract = _make_feature_extractor(doc_base_index, all_cats, bce_scorer)

In [4]:
# Previously trained models, one per feature-extractor configuration, loaded
# in the same order as the names on the left-hand side.
(
    nn_model,
    nf_model,
    na_model,
    bf_model,
    sf_model,
    ff_model,
    ba_model,
) = [
    joblib.load(model_prefix + model_file)
    for model_file in (
        'nn_model.joblib',
        'nf_model.joblib',
        'na_model.joblib',
        'bf_model.joblib',
        'sf_model.joblib',
        'ff_model.joblib',
        'ba_model.joblib',
    )
]

: 

In [None]:
# Held-out evaluation set: a JSON object keyed by query string. Opened as
# UTF-8 explicitly so the read does not depend on the platform default
# encoding (same convention as the stopword file).
with open(data_prefix + 'test_data.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)
# The keys are the test queries, in file order.
test_queries = list(test_data.keys())

In [None]:
# First-stage rankers: one BM25 ranker over doc_index, plus one VectorDBRanker
# per raw-text mapping (constructed in the order of the names on the left).
bm25_ranker = BM25(doc_index)
vdb_ranker, bvdb_ranker, svdb_ranker, fvdb_ranker = [
    VectorDBRanker(text_dict)
    for text_dict in (
        raw_text_dict,
        base_raw_text_dict,
        small_raw_text_dict,
        flan_raw_text_dict,
    )
]

In [None]:
# Configuration: bm25_ranker + nn_feat_extract on doc_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bm25_nq_nn_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, bm25_ranker, nn_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bm25_nq_nn_ranker.model.lgbmranker = nn_model
bm25_nq_nn_scores = [bm25_nq_nn_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: bm25_ranker + nf_feat_extract on doc_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bm25_nq_nf_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, bm25_ranker, nf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bm25_nq_nf_ranker.model.lgbmranker = nf_model
bm25_nq_nf_scores = [bm25_nq_nf_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + nn_feat_extract on doc_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_nn_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, nn_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_nn_ranker.model.lgbmranker = nn_model
# Scores must come from the ranker built in this cell; querying
# bm25_nq_nn_ranker here would just duplicate the BM25 results.
bienc_nq_nn_scores = [bienc_nq_nn_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + na_feat_extract on doc_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_na_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, na_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_na_ranker.model.lgbmranker = na_model
bienc_nq_na_scores = [bienc_nq_na_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + nf_feat_extract on doc_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_nf_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, nf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_nf_ranker.model.lgbmranker = nf_model
bienc_nq_nf_scores = [bienc_nq_nf_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + nf_feat_extract on doc_index; each test query is
# averaged with the last entry of its 'alt_qs' list via query_runner.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_tq_nf_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, nf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_tq_nf_ranker.model.lgbmranker = nf_model
bienc_tq_nf_scores = [query_runner(bienc_tq_nf_ranker, [query, test_data[query]['alt_qs'][-1]]) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + nf_feat_extract on doc_index; each test query is
# averaged with the first five entries of its 'alt_qs' list via query_runner.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_sq_nf_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, nf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_sq_nf_ranker.model.lgbmranker = nf_model
bienc_sq_nf_scores = [query_runner(bienc_sq_nf_ranker, [query] + test_data[query]['alt_qs'][:5]) for query in test_queries]

In [None]:
# Configuration: vdb_ranker + nf_feat_extract on doc_index; each test query is
# averaged with every entry of its 'alt_qs' list via query_runner.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_aq_nf_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vdb_ranker, nf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_aq_nf_ranker.model.lgbmranker = nf_model
bienc_aq_nf_scores = [query_runner(bienc_aq_nf_ranker, [query] + test_data[query]['alt_qs']) for query in test_queries]

In [None]:
# Configuration: bvdb_ranker + bf_feat_extract on doc_base_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_bf_ranker = L2RRanker(doc_base_index, tit_index, doc_preproc, stopwords, bvdb_ranker, bf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_bf_ranker.model.lgbmranker = bf_model
bienc_nq_bf_scores = [bienc_nq_bf_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: svdb_ranker + sf_feat_extract on doc_small_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_sf_ranker = L2RRanker(doc_small_index, tit_index, doc_preproc, stopwords, svdb_ranker, sf_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_sf_ranker.model.lgbmranker = sf_model
bienc_nq_sf_scores = [bienc_nq_sf_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: fvdb_ranker + ff_feat_extract on doc_flan_index, one query per test item.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_nq_ff_ranker = L2RRanker(doc_flan_index, tit_index, doc_preproc, stopwords, fvdb_ranker, ff_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_nq_ff_ranker.model.lgbmranker = ff_model
bienc_nq_ff_scores = [bienc_nq_ff_ranker.query(query) for query in test_queries]

In [None]:
# Configuration: bvdb_ranker + ba_feat_extract on doc_base_index; each test
# query is averaged with every entry of its 'alt_qs' list via query_runner.
# The stopword argument is the `stopwords` set, not the leftover loop variable `stopword`.
bienc_aq_ba_ranker = L2RRanker(doc_base_index, tit_index, doc_preproc, stopwords, bvdb_ranker, ba_feat_extract)
# Use the model loaded from disk instead of training one in this notebook.
bienc_aq_ba_ranker.model.lgbmranker = ba_model
bienc_aq_ba_scores = [query_runner(bienc_aq_ba_ranker, [query] + test_data[query]['alt_qs']) for query in test_queries]

In [None]:
from sklearn.metrics import ndcg_score

# Scratch check of ndcg_score on a single query whose predicted scores are all
# negative: one row of true relevance labels against one row of scores.
true_relevance = np.array([[20, 0, 0, 1, 5]])
scores = np.array([[-10, -100, -100, -15, -12]])
# Bare last expression so the notebook displays the resulting NDCG value.
ndcg_score(y_true=true_relevance, y_score=scores)