In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
import tensorflow as tf
import warnings
import tensorflow_hub as hub
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
warnings.filterwarnings('ignore')
from sklearn.metrics import pairwise_distances
from heapq import nsmallest

In [None]:
# Load the training queries; the evaluation loop below reads the
# `query_id` and `query_description` columns of this frame.
train_queries = pd.read_csv('../data/train_queries.csv')

In [None]:
# Sanity check: print the column names, dtypes and non-null counts of the queries frame.
train_queries.info()

In [None]:
# Load the fixed candidate index. The evaluation loop filters it by
# `label_index` (matched against the query id) and reads `dataset_id`,
# `dataset_title` and `dataset_description` from the matching rows.
# NOTE(review): the file name suggests the top-100 candidates per query — confirm.
fixed_index = pd.read_csv('../data/fixed_test_set_index_top100.csv', delimiter=',')

In [None]:
# Sanity check: print the column names, dtypes and non-null counts of the candidate index.
fixed_index.info()

In [None]:
# Load the relevance judgments (qrels): one space-separated row per
# (query, dataset) pair with a graded label whose "L" marker is stripped
# downstream before conversion to int.
#
# header=None keeps the first line of the file as data. Without it pandas
# consumes the first judgment as a header row, and that judgment is silently
# lost once the columns are renamed.
# NOTE(review): assumes the qrels file has no header line — confirm against the data.
relevance_judgments = pd.read_csv(
    '../data/data_search_2_e_train_qrels.txt',
    delimiter=" ",
    header=None,
    names=["query_id", "dataset_id", "relevance"],
)

In [None]:
# Load the pre-trained Doc2Vec model used to embed both query text and dataset text.
# NOTE(review): the path points outside this project directory, so the notebook
# only runs where that sibling checkout exists; gensim model files are
# pickle-based, so load them only from trusted sources.
embedding_model = Doc2Vec.load('../../news_table_matching/pre_trained_models/doc2vec.bin')

In [None]:
# Maximum number of whitespace-separated words of a dataset's
# "title + description" text that are kept before tokenizing and embedding.
MAX_PAD = 100

In [None]:
def get_relevance(query_id, dataset_id):
    """Look up the graded relevance of a dataset for a query.

    Reads the module-level ``relevance_judgments`` frame (columns
    ``query_id``, ``dataset_id``, ``relevance``).

    Parameters
    ----------
    query_id : value compared against the ``query_id`` column.
    dataset_id : value compared against the ``dataset_id`` column.

    Returns
    -------
    int
        The first matching label with every "L" removed and converted to
        ``int``; ``0`` when the pair has no judgment.
    """
    same_query = relevance_judgments['query_id'] == query_id
    same_dataset = relevance_judgments['dataset_id'] == dataset_id
    labels = relevance_judgments.loc[same_query & same_dataset, 'relevance']

    # An unjudged (query, dataset) pair counts as not relevant.
    if len(labels) == 0:
        return 0

    first_label = labels.values[0]
    return int(first_label.replace("L", ""))

In [None]:
def dcg_metric(relevance, k):
    """Compute the Discounted Cumulative Gain of a ranked list at depth ``k``.

    DCG@k = sum over ranks i = 1..k of ``relevance[i-1] / log2(i + 1)``.

    Parameters
    ----------
    relevance : sequence of numbers
        Graded relevance of the ranked items, best-ranked first.
    k : int
        Cut-off depth. When ``k`` exceeds ``len(relevance)`` only the
        available items are summed (previously this raised ``IndexError``).

    Returns
    -------
    number
        The DCG value; ``0`` when there is nothing to sum.
    """
    # Never read past the end of the ranking: a list shorter than k simply
    # contributes no gain for the missing ranks.
    depth = min(k, len(relevance))

    dcg = 0
    for rank in range(1, depth + 1):
        dcg = dcg + (relevance[rank - 1] / np.log2(rank + 1))

    return dcg

In [None]:
# Evaluate the Doc2Vec ranker: for every training query, rank its candidate
# datasets by cosine distance between embeddings and score the top of that
# ranking with NDCG against the relevance judgments.

# Ranking depth: the top TOP_K candidates are scored (NDCG@TOP_K).
TOP_K = 5

# NDCG@TOP_K of each training query, in the row order of `train_queries`.
result_ndcg = []

# Number of queries whose ideal DCG is 0 (no positively judged dataset).
# Initialised here so the cell runs on a fresh kernel; it was previously
# incremented without ever being defined.
count = 0


def _as_text(value):
    """Return ``value`` as a string, mapping a missing value (NaN/None) to ''."""
    return "" if pd.isna(value) else str(value)


for _, query_row in tqdm(train_queries.iterrows(), total=len(train_queries)):

    # current query values
    query_id = query_row['query_id']
    query_description = query_row['query_description']

    # embedding of the query text
    vector_words = tknzr.tokenize(query_description)
    queries_search = [embedding_model.infer_vector(vector_words)]

    # candidate datasets for this query
    return_index = fixed_index.loc[fixed_index['label_index'] == query_id]

    # embedding of every candidate: title + description, cut to MAX_PAD words.
    # Each field is converted to text *before* concatenation so a missing
    # title or description no longer raises TypeError.
    dataset_index = []
    for _, dataset_row in return_index.iterrows():
        dataset_text = _as_text(dataset_row['dataset_title']) + " " + _as_text(dataset_row['dataset_description'])
        dataset_title_text = ' '.join(dataset_text.split()[:MAX_PAD])

        vector_words = tknzr.tokenize(dataset_title_text)
        dataset_index.append(embedding_model.infer_vector(vector_words))

    # cosine distance between the query and every candidate (shape 1 x n_candidates)
    distance_vector = pairwise_distances(queries_search, dataset_index, metric='cosine')

    # rank the candidates: smaller distance = better match
    data_frame = pd.DataFrame({
        'dataset_id': return_index['dataset_id'].values,
        'dataset_title': return_index['dataset_title'].values,
        'dataset_ranking': distance_vector[0],
    })
    data_frame_sorting = data_frame.sort_values('dataset_ranking')
    selected_top = data_frame_sorting.head(TOP_K)

    # ideal ranking: the TOP_K highest judged grades for this query.
    # Grades are converted to int before sorting so the order is numeric
    # rather than lexicographic on the "L<n>" strings.
    relevance_by_query = relevance_judgments.loc[relevance_judgments['query_id'] == query_id]
    true_relevance = sorted(
        (int(label.replace("L", "")) for label in relevance_by_query['relevance']),
        reverse=True,
    )[:TOP_K]

    # judged grade of each dataset the model ranked in its top TOP_K
    predict_relevance = [
        get_relevance(query_id, dataset_id) for dataset_id in selected_top['dataset_id']
    ]

    # computing NDCG
    dcg = dcg_metric(predict_relevance, len(predict_relevance))
    idcg = dcg_metric(true_relevance, len(true_relevance))

    if idcg != 0:
        ndcg = dcg / idcg
    else:
        # No relevant dataset is known for this query: NDCG is undefined,
        # so record 0 and keep track of how often this happens.
        count = count + 1
        ndcg = 0

    result_ndcg.append(ndcg)

In [None]:
# Mean NDCG over all training queries (queries with zero ideal DCG contribute 0).
np.mean(result_ndcg)