In [2]:
import csv
from collections import Counter, defaultdict
from tqdm import tqdm
import json
import numpy as np
import gzip
from datetime import datetime
from sentence_transformers import SentenceTransformer
import os
import re

# your modules are imported here
from indexing import Indexer, BasicInvertedIndex
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
from ranker import Ranker, BM25, CrossEncoderScorer
from vector_ranker import VectorRanker
from l2r import L2RFeatureExtractor, L2RRanker

In [6]:
# Directory holding the data files used below, relative to the notebook's
# working directory. Keep the trailing slash: paths are built by plain string
# concatenation, e.g. data_prefix + 'rec_cats.json'.
data_prefix = '../data/'

In [27]:
# Category metadata. rec_cats.json supplies two entries: 'short' (bound to
# top5cats) and 'full' (bound to cats).
# The encoding is stated explicitly, matching how stopwords.txt is opened in
# this notebook, so the files are decoded the same way on every platform.
with open(data_prefix + 'rec_cats.json', 'r', encoding='utf-8') as f:
    rec_cats = json.load(f)
top5cats = rec_cats['short']
cats = rec_cats['full']

# JSON object keys are always strings, so convert them back to int keys.
with open(data_prefix + 'doc_cat_info.json', 'r', encoding='utf-8') as f:
    doc_cat_info = {int(k): v for k, v in json.load(f).items()}

In [None]:
# with open('../interior_dic.json', 'r') as f:
#     query_alts_rels = json.load(f)
#     queries = list(query_alts_rels.keys())
#     for query in queries:
#         del query_alts_rels[query]['alt_qs'][5]
#         for i in range(5):
#             q = query_alts_rels[query]['alt_qs'][i]
#             query_alts_rels[query]['alt_qs'][i] = q[3:]
#         query_alts_rels[query]['scored_docs'] = [(l[0], l[1]) for l in query_alts_rels[query]['scored_docs']]
#     train_queries = queries[:41]
#     test_queries = queries[41:]
# with open('../train_data.json', 'w') as f:
#     train_data = {query: query_alts_rels[query] for query in train_queries}
#     test_data = {query: query_alts_rels[query] for query in test_queries}
#     json.dump(train_data, f, indent=2)
# with open('../test_data.json', 'w') as f:
#     json.dump(test_data, f, indent=2)


In [None]:
# aug1 = Doc2QueryAugmenter()
# aug2 = Doc2QueryAugmenter('doc2query/msmarco-t5-small-v1')
# aug3 = Doc2QueryAugmenter('google/flan-t5-small')
# prefix = "Generate a query for the following text: "

In [None]:
# with open('../final_data_with_categories.json', 'r') as f:
#     lines = f.readlines()
#     docs = []
#     for line in tqdm(lines):
#         doc = json.loads(line)
#         doc['alt_qs'] = aug1.get_queries(doc['text'], 5)
#         doc['dumb_qs_1'] = aug2.get_queries(doc['text'], 5)
#         doc['dumb_qs_2'] = aug3.get_queries(doc['text'], 5, prefix)
#         docs.append(doc)
# with open('../data/doc_dataset.jsonl', 'a') as f:
#     for doc in docs:
#         f.write(json.dumps(doc) + '\n')

In [None]:
# with open('../data/doc_dataset_old.jsonl', 'r') as f:
#     line = f.readline()
#     doc_inds = [m.start() for m in re.finditer('{"docid":', line)]
#     docs = []
#     for i in range(len(doc_inds)):
#         start = doc_inds[i]
#         end = len(line) if i == len(doc_inds) - 1 else doc_inds[i + 1]
#         doc_text = line[start:end]
#         doc = json.loads(doc_text)
#         docs.append(doc)

In [None]:
# with open('../data/doc_dataset.jsonl', 'a') as f:
#     for doc in docs:
#         f.write(json.dumps(doc) + '\n')

In [9]:
# Tokenizer that is passed to the L2R feature extractor and ranker further down.
doc_preproc = RegexTokenizer('\\w+')
# Collect stopwords from stopwords.txt, one per line; surrounding whitespace
# (including the trailing newline) is stripped before insertion. A blank line
# in the file would therefore add the empty string to the set.
stopwords = set()
with open(data_prefix + 'stopwords.txt', 'r', encoding='utf-8') as file:
    # NOTE: the loop variable `stopword` (singular) stays bound at module
    # level after this cell, holding the last raw line of the file. Take care
    # not to pass it where the `stopwords` set is meant.
    for stopword in file:
        stopwords.add(stopword.strip())
# Bare last expression so the notebook displays the count as the cell output.
f'Stopwords collected {len(stopwords)}'

'Stopwords collected 543'

In [None]:
# doc_base_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'alt_qs', data_prefix + 'doc_base_index')

In [10]:
def _load_index(name):
    """Return a BasicInvertedIndex loaded from ``data_prefix + name``."""
    index = BasicInvertedIndex()
    index.load(data_prefix + name)
    return index


# Pre-built indexes read from disk. Judging by the commented-out
# Indexer.create_index calls in this notebook, the base/small/flan variants
# index 'text' with different generated-query fields, doc_index indexes 'text'
# alone and tit_index indexes 'title' -- confirm before relying on this.
doc_base_index = _load_index('doc_base_index')
doc_small_index = _load_index('doc_small_index')
doc_flan_index = _load_index('doc_flan_index')
doc_index = _load_index('doc_index')
tit_index = _load_index('title_index')

In [None]:
# doc_small_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'dumb_qs_1', data_prefix + 'doc_small_index')
# doc_flan_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', 'dumb_qs_2', data_prefix + 'doc_flan_index')
# doc_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'text', '', data_prefix + 'doc_index')
# tit_index = Indexer.create_index(data_prefix + 'doc_dataset.jsonl', doc_preproc, stopwords, 'title', '', data_prefix + 'title_index')


In [11]:
def query_runner(ranker, queries):
    """Average a ranker's score for each document over several queries.

    Args:
        ranker: Object exposing ``query(q)`` that returns a list of dicts,
            each carrying a ``'docid'`` and a ``'score'`` key.
        queries: Sized sequence of queries to run against ``ranker``.

    Returns:
        A list of ``{'docid': ..., 'score': ...}`` dicts, one per entry of the
        FIRST query's result list and in that list's order. Each score is the
        sum of the document's scores over all queries divided by
        ``len(queries)``; a query whose results do not contain the document
        contributes 0. Documents that only appear in later queries' results
        are not reported. An empty ``queries`` yields an empty list.
    """
    if len(queries) == 0:
        # Nothing to run; without this guard indexing the first result list
        # would raise IndexError.
        return []

    results_per_query = [ranker.query(query) for query in queries]

    # One docid -> score lookup per query, so each document costs a dict hit
    # per query instead of a rescan of every result list. setdefault keeps the
    # first occurrence of a docid, i.e. first-match semantics.
    score_lookups = []
    for results in results_per_query:
        lookup = {}
        for entry in results:
            lookup.setdefault(entry['docid'], entry['score'])
        score_lookups.append(lookup)

    n_queries = len(queries)
    final_scores = []
    for entry in results_per_query[0]:
        docid = entry['docid']
        total = sum(lookup.get(docid, 0) for lookup in score_lookups)
        final_scores.append({'docid': docid, 'score': total / n_queries})
    return final_scores

In [12]:
def _load_json(filename):
    """Parse and return the JSON document stored at ``data_prefix + filename``."""
    # Explicit UTF-8, consistent with how stopwords.txt is opened in this
    # notebook, so decoding does not depend on the platform default.
    with open(data_prefix + filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# raw_text_dict is used below as a docid -> document text mapping; the other
# three are presumably the same mapping for the base/small/flan variants
# (verify against the files).
raw_text_dict = _load_json('raw_text.json')
base_raw_text_dict = _load_json('base_raw_text.json')
small_raw_text_dict = _load_json('small_raw_text.json')
flan_raw_text_dict = _load_json('flan_raw_text.json')

In [13]:
# Cross-encoder scorer built over the mapping loaded from raw_text.json; it is
# handed to the L2R feature extractor further down.
ce_scorer = CrossEncoderScorer(raw_text_dict)

In [33]:
# Feature extractor for the learning-to-rank model, built from the body-text
# index, the title index, per-document category info, the tokenizer, the
# stopword set, the short category list (as a set) and the cross-encoder scorer.
# Pass the `stopwords` set here: the singular `stopword` is only the loop
# variable left over from reading stopwords.txt, i.e. a single line of text.
feat_extract = L2RFeatureExtractor(doc_index, tit_index, doc_cat_info, doc_preproc, stopwords, set(top5cats), ce_scorer)

In [15]:
# Embed every raw document text with the bi-encoder. Row i of `embs`
# corresponds to docids[i], so the two must be kept in the same order.
# NOTE(review): keys loaded from JSON are str -- confirm that is the docid
# type VectorRanker and the indexes expect.
docids = list(raw_text_dict.keys())
model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5", device='cpu')
texts = [raw_text_dict[docid] for docid in docids]
# encode() accepts the whole list and batches internally, which avoids one
# forward pass per document; it returns one vector per input text.
embs = np.asarray(model.encode(texts, show_progress_bar=True))

In [16]:
# Write the embedding matrix to embs.npy under data_prefix.
# NOTE(review): the cells shown here always recompute `embs` and never load
# this file back; consider np.load on later runs to skip re-encoding.
np.save(data_prefix + 'embs.npy', embs)

In [17]:
# Vector ranker over the precomputed document embeddings and their docids.
# The model name is the same one used to produce `embs`; keep the two in sync
# so queries and documents are embedded by the same model.
vranker = VectorRanker("sentence-transformers/msmarco-MiniLM-L12-cos-v5", embs, docids)

In [34]:
# Learning-to-rank ranker assembled from the body-text and title indexes, the
# tokenizer, the stopword set, the vector ranker and the feature extractor.
# Pass the `stopwords` set here: the singular `stopword` is only the loop
# variable left over from reading stopwords.txt, i.e. a single line of text.
l2r_ranker = L2RRanker(doc_index, tit_index, doc_preproc, stopwords, vranker, feat_extract)

In [32]:
# Development helper: pick up edits to the local modules without restarting
# the kernel. Objects created before this cell keep their pre-reload classes;
# re-create them to exercise the new code.
from importlib import reload
import indexing
import document_preprocessor
import l2r

# Reload order: indexing, document_preprocessor, l2r.
for _module in (indexing, document_preprocessor, l2r):
    reload(_module)

# reload() does not update names previously bound with `from ... import`, so
# re-import every name this notebook takes from the reloaded modules.
# BasicInvertedIndex is included so it does not stay bound to the stale class.
from indexing import Indexer, BasicInvertedIndex
from document_preprocessor import RegexTokenizer, Doc2QueryAugmenter
from l2r import L2RFeatureExtractor, L2RRanker

In [35]:
# Train the L2R ranker on train_data.json.
# NOTE(review): the output saved with this cell stops in the middle of a
# progress bar, so it does not show a completed training run.
l2r_ranker.train(data_prefix + 'train_data.json')

relevance dict created


preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 2188.68it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 1303.20it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 5555.52it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 1293.96it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 1661.68it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 9102.62it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 6233.36it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 7567.94it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 1904.68it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 12186.37it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 6861.51it/s]
preparing features:doc cycle: 100%|██████████| 50/50 [00:00<00:00, 1910.22it/s]
preparing features:doc cycle: 100%|████

: 

In [4]:
# Sanity check of sklearn's NDCG on a toy example.
from sklearn.metrics import ndcg_score
# Both arrays are 2-D with a single row (note the double brackets).
# NOTE(review): the ValueError recorded with this cell ("Got multiclass
# instead") looks like what ndcg_score reports for 1-D input, so it is likely
# stale output from an earlier version of the cell -- re-run to confirm.
true_relevance = np.asarray([[10, 0, 0, 1, 5]])
scores = np.asarray([[10, 0, 0, 0, 5]])
ndcg_score(true_relevance, scores)

ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead