In [3]:
import pandas as pd
import csv
import json
import numpy as np
from tqdm import tqdm
from pyserini.index import IndexReader
from collections import Counter


In [4]:
data = pd.read_csv('si-650eecs-549-ranker/documents.csv')
query_data = pd.read_csv('si-650eecs-549-ranker/query.csv')
index_reader = IndexReader('si-650eecs-549-ranker/indexes/collection_jsonl')

In [5]:
idx = [i for i in data['id'].unique() if index_reader.doc(str(i))]
len(idx)

59851

In [6]:
terms = {term.term: term.df for term in index_reader.terms()}
len(terms.keys())

114490

In [7]:
dl = {i: len(json.loads(index_reader.doc(str(i)).raw())['contents']) for i in data['id'] if index_reader.doc(str(i))}
len(dl.keys())

59851

In [8]:
avg_dl = np.mean([len(json.loads(index_reader.doc(str(i)).raw())['contents']) for i in data['id'] if index_reader.doc(str(i))])
avg_dl

1185.6945441792889

In [9]:
df = {term: (index_reader.get_term_counts(term, analyzer=None))[0] for term in terms.keys()}
len(df.keys())

114490

In [10]:
tf = {i: index_reader.get_document_vector(str(i)) for i in data['id'] if index_reader.doc(str(i))}
tf

{59087: {'junction': 1,
  'leader': 4,
  'viru': 1,
  'recombin': 1,
  'depend': 1,
  'strongli': 1,
  'while': 1,
  'discontinu': 4,
  'plai': 1,
  'sequenc': 6,
  'either': 1,
  'function': 1,
  'subgenom': 4,
  'from': 3,
  'nidoviru': 2,
  'between': 1,
  'which': 1,
  'sens': 1,
  'multipl': 1,
  'rna': 7,
  'fidel': 1,
  'gener': 1,
  'occur': 1,
  'onli': 1,
  'dure': 4,
  'mrna': 2,
  'templat': 2,
  'molecul': 1,
  'both': 1,
  'uniqu': 1,
  'studi': 1,
  '3': 1,
  'transfer': 4,
  '5': 1,
  'bodi': 5,
  'mutant': 1,
  'present': 2,
  'variat': 1,
  'nascent': 1,
  'here': 1,
  'role': 1,
  'ensur': 1,
  'copi': 1,
  'requir': 1,
  'deriv': 2,
  'we': 1,
  'primari': 1,
  'minu': 2,
  'result': 1,
  'fuse': 1,
  'central': 1,
  'strand': 6,
  'transcript': 1,
  'mechan': 1,
  'antisens': 1,
  'mutagenesi': 1,
  'yield': 1,
  'end': 2,
  'involv': 1,
  'anoth': 1,
  'arter': 1,
  'fulfil': 1,
  'regul': 1,
  'on': 1,
  'choic': 1,
  'process': 2,
  'contain': 1,
  'trss': 2,
  

In [11]:
N = index_reader.stats()['documents']
N

59881

In [None]:
def score(query, doc_id):
    rank_score = 0
    b = 0.2

    # get query term frequency
    qtf = Counter(query)

    for word in query:
        if tf[doc_id].get(word) and df[word]:
            rank_score += qtf[word] * (1 + np.log(1 + np.log(tf[doc_id][word]))) / (
                        1 - b + b * dl[doc_id] / avg_dl) * np.log((N + 1) / df[word])
    return rank_score

In [11]:
QueryId = []
DocumentId = []
Score = []
for qid in tqdm(range(query_data.shape[0])):
    analyzed_query = index_reader.analyze(query_data.loc[qid,'Query Description'])
    rel_score = []
    for i in idx:
        rel_score.append(score(analyzed_query, i))
    rel_score = np.array(rel_score)
    relevant = np.argsort(rel_score)[::-1][:10]
    Score += [rel_score[r] for r in relevant]
    QueryId += [query_data.loc[qid,'QueryId']]*10
    DocumentId += [idx[r] for r in relevant]

100%|██████████| 35/35 [00:18<00:00,  1.87it/s]


In [12]:
result = pd.DataFrame({'QueryId':QueryId, 'DocumentId':DocumentId, 'Score':Score})
result.head()

Unnamed: 0,QueryId,DocumentId,Score
0,0,43214,18.519987
1,0,2051,15.935129
2,0,56044,15.614834
3,0,26349,15.280851
4,0,18456,15.101236


In [13]:
result[['QueryId','DocumentId']].to_csv('part2_result_notebook.csv',index=False)

In [14]:
analyzed_query = index_reader.analyze(query_data.loc[0,'Query Description'])
score(analyzed_query, 43214)

18.519987227653743