In [1]:
import pandas as pd
import pickle
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer

# Define model

In [4]:
class Model:
    def __init__(self, centroids, papers, sim='structural'):
        if sim == 'structural':
            w1 = 0.2
            w2 = 1 - w1
            self.centroids = {area: centroid[0] * w1 + centroid[1] * w2 for area, centroid in centroids.items()}
            papers['embeddings'] = w1 * papers['title_embeddings'] + w2 * papers['abstract_embeddings']
            self.papers = papers[['docno', 'title', 'embeddings', 'area']].copy()
        elif sim == 'total':
            self.centroids = {area: centroid[2] for area, centroid in centroids.items()}
            self.papers = papers[['docno', 'title', 'total_embeddings', 'area']].copy()
            self.papers.rename(columns={'total_embeddings': 'embeddings'}, inplace=True)
        del papers
            
        # embedding model
        self.embedding_model = SentenceTransformer('all-mpnet-base-v2')
        
        
    def predict(self, queries, topk=100):
        query_embeddings = self.embedding_model.encode(queries)
        results_list = [self.predict_once(query_embeddings[qid], qid, topk) for qid in range(query_embeddings.shape[0])]
        return pd.concat(results_list)
    
    def predict_once(self, query_embedding, qid, topk=100):
        top2_areas = sorted(self.centroids.items(), key=lambda x: np.dot(x[1], query_embedding), reverse=True)[:2]
        top2_areas = set([area for area, _ in top2_areas])
        candidate_papers = self.papers[self.papers['area'].apply(lambda x: bool(x & top2_areas))].copy()   
        candidate_papers['score'] = candidate_papers['embeddings'].apply(lambda x: np.dot(x, query_embedding))
        result_papers = candidate_papers.sort_values('score', ascending=False).head(topk)
        result_papers['qid'] = qid
        print('top 2 areas for query {}: {}'.format(qid, top2_areas))
        return result_papers[['docno', 'qid', 'title', 'score', 'area']].copy().reset_index(drop=True)
        
        
        
        

In [5]:
queries = pd.read_csv('data/queries.csv')
papers = pd.read_pickle('./data/papers_info.pkl')
centroids = pickle.load(open('./data/centroids_dict.pkl', 'rb'))

In [5]:
model = Model(centroids, papers, sim='total')

In [6]:
model2 = Model(centroids, papers, sim='structural')

In [93]:
results = model.predict(queries['query'].to_list(), topk=20)

top 3 areas for query 0: {'Denoising', 'Image Compression'}
top 3 areas for query 1: {'Image Captioning', 'Image Retrieval'}
top 3 areas for query 2: {'Action Recognition', 'Object Tracking'}
top 3 areas for query 3: {'Pose estimation', 'Action Recognition'}
top 3 areas for query 4: {'Object Tracking', 'Object detection'}
top 3 areas for query 5: {'Object Tracking', 'Optical Flow Estimation'}
top 3 areas for query 6: {'Depth Estimation', 'Autonomous vehicles'}
top 3 areas for query 7: {'Image Captioning', 'Style Transfer'}
top 3 areas for query 8: {'Image Captioning', 'Optical Character Recognition'}
top 3 areas for query 9: {'Face Recognition', 'Object detection'}
top 3 areas for query 10: {'Image Retrieval', 'Optical Character Recognition'}
top 3 areas for query 11: {'Depth Estimation', 'Optical Flow Estimation'}
top 3 areas for query 12: {'Image augmentation', 'Image Compression'}
top 3 areas for query 13: {'Depth Estimation', '3D Reconstruction'}
top 3 areas for query 14: {'Denoisi

In [7]:
results2 = model2.predict(queries['query'].to_list(), topk=20)

top 2 areas for query 0: {'Denoising', 'Image Compression'}
top 2 areas for query 1: {'Image Captioning', 'Image Retrieval'}
top 2 areas for query 2: {'Action Recognition', 'Object detection'}
top 2 areas for query 3: {'Pose estimation', 'Action Recognition'}
top 2 areas for query 4: {'Object Tracking', 'Object detection'}
top 2 areas for query 5: {'Object Tracking', 'Optical Flow Estimation'}
top 2 areas for query 6: {'Object Tracking', 'Autonomous vehicles'}
top 2 areas for query 7: {'Image Captioning', 'Style Transfer'}
top 2 areas for query 8: {'Optical Character Recognition', 'Image Retrieval'}
top 2 areas for query 9: {'Face Recognition', 'Object detection'}
top 2 areas for query 10: {'Optical Character Recognition', 'Object Recognition'}
top 2 areas for query 11: {'Depth Estimation', 'Optical Flow Estimation'}
top 2 areas for query 12: {'Image augmentation', 'Image Compression'}
top 2 areas for query 13: {'Depth Estimation', '3D Reconstruction'}
top 2 areas for query 14: {'Super

# Evaluate the model 1

In [97]:
results['docno'] = results['docno'].astype(int)

In [8]:
test_set = pd.read_csv('data/final_test_set.csv')

In [10]:
def pos2score(pos):
    if pos < 40:
        return 5
    elif pos < 100:
        return 4
    elif pos < 200:
        return 3
    elif pos < 400:
        return 2
    else:
        return 1

In [11]:
def ndcg(results_docs, test_docs):
    results_relative_score = []
    for docno in results_docs:
        if docno in test_docs:
            pos_test = test_docs.index(docno)
            relative_score = pos2score(pos_test)
            results_relative_score.append(relative_score)
        else:
            results_relative_score.append(0)
    DCG = sum([score/np.log2(pos+2) for pos, score in enumerate(results_relative_score)])
    IR_relative_score = sorted(results_relative_score, reverse=True)
    IDCG = sum([score/np.log2(pos+2) for pos, score in enumerate(IR_relative_score)])
    if IDCG == 0:
        return 0
    return DCG/IDCG
    

In [99]:
qids = test_set['qid'].unique()
m1_ndcgs = {qid: 0 for qid in qids}
for qid in qids:
    qid_test_set = test_set[test_set['qid'] == qid]
    qid_result = results[results['qid'] == qid]
    q_ndcg = ndcg(qid_result['docno'].to_list(), qid_test_set['docno'].to_list())
    m1_ndcgs[qid] = q_ndcg
    print('qid {}: {}'.format(qid, q_ndcg))
    
print('ndcg: {}'.format(sum(m1_ndcgs.values()) / len(m1_ndcgs)))

qid 13: 0.8790851722477917
qid 2: 0.8126123677847049
qid 3: 0.8988910674253004
qid 6: 0.849701843355969
qid 0: 0.8876190225403359
qid 11: 0.7944776901452953
qid 7: 0.6309297535714575
qid 9: 0.7821261439977852
qid 1: 0.9175257034998602
qid 14: 0.9561446631171914
qid 16: 0.9210791956752119
qid 17: 0.8123019948353362
qid 12: 0.611777657723478
qid 15: 0
qid 21: 0.828180989275756
qid 25: 0
qid 26: 0.24465054211822604
qid 4: 0.9594397022570066
qid 5: 0.8366331157181809
qid 22: 0.9198418172225933
qid 8: 0.8839190832911996
qid 10: 0.8921276357109943
qid 19: 0.9303473626864637
qid 20: 0.6982628326578658
qid 24: 0.9611686061960671
qid 27: 0
qid 28: 0.42671404644728445
qid 29: 0.31546487678572877
qid 18: 0.9115267669897291
qid 23: 0.81032608223349
ndcg: 0.7124291911836769


# Evaluate the model 2

In [12]:
results2['docno'] = results2['docno'].astype(int)

In [13]:
qids = test_set['qid'].unique()
m2_ndcgs = {qid: 0 for qid in qids}
for qid in qids:
    qid_test_set = test_set[test_set['qid'] == qid]
    qid_result = results2[results2['qid'] == qid]
    q_ndcg = ndcg(qid_result['docno'].to_list(), qid_test_set['docno'].to_list())
    m2_ndcgs[qid] = q_ndcg
    print('qid {}: {}'.format(qid, q_ndcg))
    
print('ndcg: {}'.format(sum(m2_ndcgs.values()) / len(m2_ndcgs)))

qid 13: 0.7357585286271314
qid 2: 0.5310978742972116
qid 3: 0.6883288931777398
qid 6: 0.877356031824257
qid 0: 0.8353844775734836
qid 11: 0.8783848220646756
qid 7: 0
qid 9: 0.9620195497026582
qid 1: 0.8992186952999051
qid 14: 0.9472279241043943
qid 16: 0.8611278460601425
qid 17: 0.8044949356840746
qid 12: 0.8269697282424433
qid 15: 0
qid 21: 0.7910988236039674
qid 25: 0
qid 26: 0.7800556383147249
qid 4: 0.8262577629494149
qid 5: 0.8555872638630801
qid 22: 0.7537032566793705
qid 8: 0.9296209349808456
qid 10: 0.8960991819648121
qid 19: 0.923157782496923
qid 20: 0.5788345373247642
qid 24: 0.9187109263942083
qid 27: 0
qid 28: 0.6075991233176815
qid 29: 0.5
qid 18: 0.9570003258902541
qid 23: 0.8809040992431904
ndcg: 0.7015332987893783


# Evaluate for BM25

In [15]:
import pyterrier as pt
import regex as re

In [16]:
if not pt.started():
    pt.init()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
PyTerrier 0.7.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [17]:
index = pt.IndexFactory.of('./data/index/data.properties')

In [18]:
# only keep words in query
queries['query'] = queries['query'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [19]:
br = pt.BatchRetrieve(index, wmodel="BM25")
bm25_results = br.transform(queries[['qid', 'query']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  queries['qid'] = queries['qid'].astype(str)


In [32]:
bm25_results['qid'] = bm25_results['qid'].astype(int)
bm25_results['docno'] = bm25_results['docno'].astype(int)

In [33]:
qids = test_set['qid'].unique()
bm25_ndcgs = {qid: 0 for qid in qids}
for qid in qids:
    qid_test_set = test_set[test_set['qid'] == qid]
    qid_result = bm25_results[bm25_results['qid'] == qid][:20]
    q_ndcg = ndcg(qid_result['docno'].to_list(), qid_test_set['docno'].to_list())
    bm25_ndcgs[qid] = q_ndcg
    print('qid {}: {}'.format(qid, q_ndcg))
    
print('ndcg: {}'.format(sum(bm25_ndcgs.values()) / len(bm25_ndcgs)))

qid 13: 0.8812349356800284
qid 2: 0.7119384179147197
qid 3: 0.883831510003986
qid 6: 0.8279802570655883
qid 0: 0.5969374864370421
qid 11: 0.9159683719479909
qid 7: 0.23981246656813146
qid 9: 0.8287984373604472
qid 1: 0.8503123308363219
qid 14: 0.35216953753213154
qid 16: 0
qid 17: 0.7703379821030687
qid 12: 0.3867064184620594
qid 15: 0
qid 21: 0.3756778848431553
qid 25: 0.37584446348729567
qid 26: 0.6810784952669073
qid 4: 0.8845576101322127
qid 5: 0.8110623428048456
qid 22: 0.8172678977133249
qid 8: 0.8968718982584183
qid 10: 0.8821366180823369
qid 19: 0.8775603395686095
qid 20: 0.3562071871080222
qid 24: 0.9108976569330755
qid 27: 0.6798452346149234
qid 28: 0.3191081886500419
qid 29: 0
qid 18: 0.7713011555074298
qid 23: 0.5792781240086183
ndcg: 0.6154907749630244


# Evaluate for random

In [102]:
qids = range(30)
r_ndcgs = {qid: 0 for qid in qids}
for qid in qids:
    qid_test_set = test_set[test_set['qid'] == qid]
    qid_result = np.random.choice(8255, 20, replace=False).astype(int)
    q_ndcg = ndcg(qid_result, qid_test_set['docno'].to_list())
    r_ndcgs[qid] = q_ndcg
    print('qid {}: {}'.format(qid, q_ndcg))
    
print('ndcg: {}'.format(sum(r_ndcgs.values()) / len(r_ndcgs)))

qid 0: 0
qid 1: 0.31546487678572877
qid 2: 0.3010299956639812
qid 3: 0.36996350175891385
qid 4: 0.25
qid 5: 0.38685280723454163
qid 6: 0.3617152762344296
qid 7: 0.31601601731991863
qid 8: 0.3075452067542794
qid 9: 0
qid 10: 0
qid 11: 0.5368596003428795
qid 12: 0.25
qid 13: 0.31546487678572877
qid 14: 0
qid 15: 0.23137821315975915
qid 16: 1.0
qid 17: 0
qid 18: 0.2626495350371935
qid 19: 0.38685280723454163
qid 20: 0.5992429307166866
qid 21: 0
qid 22: 0
qid 23: 0
qid 24: 0.23137821315975915
qid 25: 0
qid 26: 0
qid 27: 0
qid 28: 0.23137821315975915
qid 29: 0.24465054211822604
ndcg: 0.22994808711554424
