In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
import tensorflow as tf
import warnings
import tensorflow_hub as hub
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
warnings.filterwarnings('ignore')
from sklearn.metrics import pairwise_distances
from heapq import nsmallest

In [None]:
# Training queries. The evaluation loop further down reads the `query_id`
# and `query_description` columns of this frame.
train_queries = pd.read_csv('../data/final_train_queries.csv')

In [None]:
# Print column names, dtypes and non-null counts as a quick sanity check.
train_queries.info()

In [None]:
# Candidate datasets per query. The evaluation loop filters this frame on
# `label_index` (compared against the query id) and reads `dataset_id`,
# `dataset_title`, `dataset_description` and `dataset_tags`.
# NOTE(review): the file name suggests a pre-retrieved top-50 candidate list
# per query - confirm against whatever produced the file.
fixed_index = pd.read_csv('../data/fixed_test_set_index_top50.csv', delimiter=',')

In [None]:
# Print column names, dtypes and non-null counts as a quick sanity check.
fixed_index.info()

In [None]:
# Relevance judgments (qrels): one space-separated "query_id dataset_id relevance"
# triple per line, where relevance is a label such as "L2".
#
# `header=None` is required: without it pandas consumes the first line of the
# file as column labels, so that judgment would silently disappear from the
# frame. `names=` assigns the column labels directly instead of rebuilding the
# frame from `.values`.
# NOTE(review): this assumes the qrels file has no header line (the previous
# code re-labelled the columns by hand, which suggests so). If it does have
# one, use `header=0` together with `names=`.
relevance_judgments = pd.read_csv(
    '../data/data_search_2_e_train_qrels.txt',
    delimiter=" ",
    header=None,
    names=["query_id", "dataset_id", "relevance"],
)

In [None]:
# TF Hub handle of the Universal Sentence Encoder, version 4.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

In [None]:
# Load the encoder once. `embed` is later called with a list of strings and
# every returned row is converted to NumPy with `.numpy()`.
embed = hub.load(module_url)

In [None]:
def get_relevance(query_id, dataset_id):
    """Return the integer relevance grade judged for a (query, dataset) pair.

    The pair is looked up in the module-level ``relevance_judgments`` frame.
    Stored labels are strings carrying an ``L`` marker, which is stripped
    before the conversion to ``int``. When several rows match, the first one
    wins.

    Parameters
    ----------
    query_id : value compared against the ``query_id`` column.
    dataset_id : value compared against the ``dataset_id`` column.

    Returns
    -------
    int
        The judged grade, or ``0`` when the pair has no judgment.
    """
    judgments = relevance_judgments

    # Single boolean mask instead of two successive filters; the selected
    # rows (and their order) are the same.
    is_pair = (judgments['query_id'] == query_id) & (judgments['dataset_id'] == dataset_id)
    labels = judgments.loc[is_pair, 'relevance'].values

    # Unjudged pairs count as not relevant.
    if len(labels) == 0:
        return 0

    return int(labels[0].replace("L", ""))

In [None]:
def dcg_metric(relevance,k):
    """Discounted cumulative gain of the first ``k`` entries of ``relevance``.

    DCG@k = sum over ranks i = 1..k of ``relevance[i-1] / log2(i + 1)``.

    Parameters
    ----------
    relevance : sequence of numbers
        Graded relevance values in ranked order (best-ranked item first).
    k : int
        Rank cut-off. When ``k`` exceeds ``len(relevance)`` only the available
        entries are summed (missing ranks contribute no gain) instead of
        raising ``IndexError``. A non-positive ``k`` yields ``0``.

    Returns
    -------
    number
        The DCG value; ``0`` when nothing is summed.
    """
    # Never index past the end of the ranking.
    cutoff = min(k, len(relevance))

    dcg = 0

    for rank in range(1, cutoff + 1):

        # Ranks are 1-based, so rank 1 is discounted by log2(2) == 1.
        dcg = dcg + (relevance[rank - 1] / np.log2(rank + 1))

    return dcg

In [None]:
# Rank cut-off used for both the predicted and the ideal ranking (NDCG@TOP_K).
TOP_K = 10

# Number of queries whose ideal DCG is zero. It has to exist before the loop:
# the zero-IDCG branch below increments it and would otherwise raise NameError.
count = 0

# One NDCG value per training query, in `train_queries` order.
result_ndcg = []


def _embed_column(frame, column):
    """Embed every value of ``frame[column]`` (converted with ``str``); returns one vector per row."""
    texts = [str(value) for value in frame[column]]
    return [vector.numpy() for vector in embed(texts)]


for _, query_row in tqdm(train_queries.iterrows(), total=len(train_queries)):

    #current query values
    query_id = query_row['query_id']
    query_description = query_row['query_description']

    #embedding of the query text (a batch holding a single entry)
    queries_search = [vector.numpy() for vector in embed([query_description])]

    #candidate datasets indexed for this query
    return_index = fixed_index.loc[fixed_index['label_index'] == query_id]

    #embeddings of each textual field of the candidates
    dataset_index_title = _embed_column(return_index, 'dataset_title')
    dataset_index_description = _embed_column(return_index, 'dataset_description')
    dataset_index_tags = _embed_column(return_index, 'dataset_tags')

    #cosine distances query -> candidates; each result has a single row (one query)
    distance_vector_title = pairwise_distances(queries_search, dataset_index_title, metric='cosine')
    distance_vector_description = pairwise_distances(queries_search, dataset_index_description, metric='cosine')
    distance_vector_tags = pairwise_distances(queries_search, dataset_index_tags, metric='cosine')

    #one row per candidate; the ranking score is the unweighted mean of the three distances
    ranked_datasets_model = [
        [
            candidate_id,
            candidate_title,
            distance_title,
            distance_description,
            distance_tags,
            (distance_title + distance_description + distance_tags) / 3,
        ]
        for candidate_id, candidate_title, distance_title, distance_description, distance_tags in zip(
            return_index['dataset_id'],
            return_index['dataset_title'],
            distance_vector_title[0],
            distance_vector_description[0],
            distance_vector_tags[0],
        )
    ]

    data_frame = pd.DataFrame(ranked_datasets_model, columns = ['dataset_id', 'dataset_title','dataset_ranking_title','dataset_ranking_description','dataset_ranking_tags','mean_distance'])

    #smallest mean distance first
    data_frame_sorting = data_frame.sort_values('mean_distance')
    selected_top = data_frame_sorting.head(TOP_K)

    #ideal ranking: the TOP_K highest grades among every judgment of this query
    #(not only among the candidates). Labels are parsed to int before sorting so
    #the order is numeric rather than lexicographic on the "L<n>" strings.
    relevance_by_query = relevance_judgments.loc[relevance_judgments['query_id'] == query_id]
    true_relevance = sorted(
        (int(label.replace("L","")) for label in relevance_by_query['relevance']),
        reverse=True,
    )[:TOP_K]

    #judged grade of every retrieved dataset, in ranked order (unjudged -> 0)
    predict_relevance = [
        get_relevance(query_id, candidate_id) for candidate_id in selected_top['dataset_id']
    ]

    #computing NDCG
    dcg = dcg_metric(predict_relevance, len(predict_relevance))
    idcg = dcg_metric(true_relevance, len(true_relevance))

    if (idcg != 0):

        ndcg = dcg / idcg

    else:

        #no positively judged dataset for this query: count it and score it 0
        count = count + 1

        ndcg = 0

    result_ndcg.append(ndcg)

In [None]:
# Average of the per-query NDCG values collected in `result_ndcg`.
np.mean(result_ndcg)