In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import TweetTokenizer
# Tweet-aware tokenizer from NLTK (not referenced in the cells visible here).
tknzr = TweetTokenizer()
# Shared Elasticsearch client used by the index-settings and search cells below.
# No host is passed, so the client library's default connection target applies.
# Requests use a timeout of 30 (presumably seconds -- confirm) and timed-out
# requests are retried, with max_retries=10.
indexing_dataset_search = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
import tensorflow as tf
import warnings
# Blanket 'ignore' filter: suppresses all Python warnings from here on.
warnings.filterwarnings('ignore')

In [None]:
# Switch the default similarity of the 'dataset_search' index to BM25.
# The index is closed before the settings call and reopened afterwards. The
# reopen sits in a `finally` so that a failed settings update does not leave
# the index closed, which would break the search cells further down.
indexing_dataset_search.indices.close(index='dataset_search')
try:
    indexing_dataset_search.indices.put_settings(
        index='dataset_search',
        body={"index": {"similarity": {"default": {"type": "BM25"}}}},
    )
finally:
    indexing_dataset_search.indices.open(index='dataset_search')

In [None]:
def elasticsearch_search(query, size=20):
    """Run a full-text query against the 'dataset_search' index.

    The query text is matched against the ``dataset_title`` and
    ``dataset_description`` fields with a ``multi_match`` query of type
    ``most_fields``.

    Args:
        query: Query text to search for.
        size: Maximum number of hits to request. Defaults to 20, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The response of ``indexing_dataset_search.search``. Only the
        ``dataset_id`` and ``dataset_title`` source fields are requested for
        each hit.
    """
    search_body = {
        "_source": ["dataset_id", "dataset_title"],
        "from": 0,
        "size": size,
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": ["dataset_title", "dataset_description"],
            }
        },
    }

    return indexing_dataset_search.search(index="dataset_search", body=search_body)

In [None]:
def return_data_index(query):
    """Return the dataset ids of the search hits for ``query``, in hit order.

    Args:
        query: Query text, forwarded unchanged to ``elasticsearch_search``.

    Returns:
        A list with the ``dataset_id`` source field of every hit, in the order
        the hits appear in the search response.
    """
    result_index = elasticsearch_search(query)

    # Only the id is needed here; the title is deliberately not read so a hit
    # without a 'dataset_title' field cannot raise a KeyError.
    return [hit['_source']['dataset_id'] for hit in result_index['hits']['hits']]

In [None]:
# Load the training relevance judgments (qrels): space-separated rows with the
# three columns query_id, dataset_id and relevance.
# header=None + names=...: the column names are supplied here, so the first
# line of the file is kept as a judgment instead of being consumed as a header.
# NOTE(review): assumes the qrels file itself has no header line -- confirm.
relevance_judgments = pd.read_csv(
    '../data/data_search_2_e_train_qrels.txt',
    delimiter=" ",
    header=None,
    names=["query_id", "dataset_id", "relevance"],
)

In [None]:
def get_relevance(query_id, dataset_id):
    """Look up the judged relevance of ``dataset_id`` for ``query_id``.

    Reads the module-level ``relevance_judgments`` frame.

    Returns:
        0 when no judgment row exists for the (query, dataset) pair; otherwise
        the ``relevance`` label of the first matching row, with every "L"
        removed, converted to ``int``.
    """
    is_match = (
        (relevance_judgments['query_id'] == query_id)
        & (relevance_judgments['dataset_id'] == dataset_id)
    )
    matching_rows = relevance_judgments.loc[is_match]

    if not matching_rows.empty:
        label = matching_rows['relevance'].values[0]
        return int(label.replace("L", ""))

    return 0

In [None]:
def dcg_metric(relevance, k):
    """Discounted cumulative gain of a ranked list of relevance grades.

    Computes ``sum(relevance[i-1] / log2(i + 1) for i in 1..k)``.

    Args:
        relevance: Indexable sequence of relevance grades in rank order
            (position 0 is rank 1).
        k: Rank cutoff. A cutoff larger than ``len(relevance)`` is clamped to
            the list length (ranks that do not exist contribute nothing)
            instead of raising ``IndexError``; a non-positive cutoff yields 0.

    Returns:
        The DCG value; 0 when no rank is evaluated.
    """
    # Clamp so the loop never indexes past the end of `relevance`.
    cutoff = max(0, min(k, len(relevance)))

    dcg = 0

    for rank in range(1, cutoff + 1):

        dcg = dcg + (relevance[rank - 1] / np.log2(rank + 1))

    return dcg

In [None]:
# Training queries; the evaluation loop further down reads the 'query_id' and
# 'query_description' columns of this frame.
train_queries = pd.read_csv('../data/train_queries.csv')

In [None]:
# Preview five random training queries. The fixed random_state makes the
# displayed rows identical on every Restart & Run All.
train_queries.sample(5, random_state=42)

In [None]:
# Evaluate the search backend on the training queries with NDCG@NDCG_CUTOFF.
#
# Outputs (module-level):
#   result_ndcg  - one NDCG value per training query
#   count        - number of queries whose ideal DCG is 0
#   hard_queries - [query_id, query_description] of every query scoring 0
#
# DCG and IDCG are both truncated at the same rank cutoff: scoring DCG over
# more ranks than IDCG can push the ratio above 1.
NDCG_CUTOFF = 10

result_ndcg = []
count = 0
hard_queries = []

for _, query_row in tqdm(train_queries.iterrows(), total=len(train_queries)):

    query_id = query_row['query_id']
    query_description = query_row['query_description']

    # Relevance of each of the top-k search results (0 when there is no judgment).
    result_index = return_data_index(query_description)[:NDCG_CUTOFF]

    predict_relevance = [
        get_relevance(query_id, dataset_id) for dataset_id in result_index
    ]

    # Ideal ranking: the k highest judged grades for this query. The labels are
    # parsed to int before sorting so the order is numeric, not lexicographic.
    relevance_by_query = relevance_judgments.loc[relevance_judgments['query_id'] == query_id]

    true_relevance = sorted(
        (int(label.replace("L", "")) for label in relevance_by_query['relevance']),
        reverse=True,
    )[:NDCG_CUTOFF]

    # Computing metrics.
    dcg = dcg_metric(predict_relevance, len(predict_relevance))
    idcg = dcg_metric(true_relevance, len(true_relevance))

    if idcg != 0:

        ndcg = dcg / idcg

    else:

        # No positive judgment for this query: NDCG is undefined, recorded as 0.
        count = count + 1

        ndcg = 0

    result_ndcg.append(ndcg)

    if ndcg == 0:

        hard_queries.append([query_id, query_description])

In [None]:
# Mean of the per-query NDCG values collected in result_ndcg.
np.mean(result_ndcg)