In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
import tensorflow as tf
import warnings

from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance; create_embedding() uses it to split text into tokens.
tknzr = TweetTokenizer()
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries loaded below.
warnings.filterwarnings('ignore')
from sklearn.metrics import pairwise_distances
from heapq import nsmallest

from transformers import TFBertModel, BertTokenizer, TFBertMainLayer, BertConfig

In [None]:
# Queries to rank. NOTE(review): the variable is called `train_queries`, but the
# file loaded here is the *test* query set (final_test_queries.csv).
# The ranking loop reads the `query_id` and `query_description` columns.
train_queries = pd.read_csv('../data/final_test_queries.csv')

In [None]:
# Sanity check of the loaded query table (columns, dtypes, non-null counts).
train_queries.info()

In [None]:
# Candidate datasets per query. The ranking loop filters this table on
# `label_index` == query id and reads `dataset_id`, `dataset_title` and
# `dataset_description` from the matching rows.
fixed_index = pd.read_csv('../data/fixed_test_set_index_top100_test_queries.csv', delimiter=',')

In [None]:
# Sanity check of the loaded candidate table (columns, dtypes, non-null counts).
fixed_index.info()

In [None]:
# Fail fast when no GPU is visible.
# The check uses list_physical_devices() instead of tf.test.gpu_device_name():
# the latter instantiates the devices, which can reserve GPU memory before the
# memory-growth option (set in a later cell) has been applied. Listing physical
# devices does not initialise them.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if not gpu_devices:
    raise SystemError('GPU not found')
# Name of the first visible GPU, kept in `device_name` as before.
device_name = gpu_devices[0].name
print('found GPU at {}'.format(device_name))

In [None]:
# GPUs visible to TensorFlow; used by the next cell to configure memory growth.
physical_devices = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all upfront.
# The option is applied to every listed GPU, not just the first one: TensorFlow
# rejects configurations where memory growth differs between GPUs, so setting it
# on device 0 only breaks on multi-GPU machines.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [None]:
# Pretrained cased BERT weights and the matching tokenizer.
# NOTE(review): in the cells visible here only `bert_tokenizer` is used;
# `bert_model` is loaded but never referenced — confirm it is still needed.
bert_model = TFBertModel.from_pretrained("bert-base-cased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Passed as `max_length` to bert_tokenizer.encode_plus() in the ranking loop,
# i.e. the length of every encoded query/dataset pair.
MAX_TOKENS = 200

In [None]:
# Trained Keras ranking model; its predict() is called once per query in the
# ranking loop with three BERT input arrays and four FastText embedding arrays.
ranking_model = tf.keras.models.load_model("../../PhD/learning_to_rank_models/bert/bert_full_model_plus_attention_TM_01_0.9845.h5")

In [None]:
# Pretrained Facebook FastText vectors; create_embedding() looks tokens up in `.wv`.
FASTTEXT_MODEL_PATH = '../../PhD/pre_trained_models/cc.en.300.bin'

# `FastText.load_fasttext_format` is deprecated in favour of
# `gensim.models.fasttext.load_facebook_model` and is not available in newer
# gensim releases. Prefer the replacement when this gensim version provides it
# and fall back to the original call otherwise.
_load_facebook_model = getattr(gs.models.fasttext, 'load_facebook_model', None)
if _load_facebook_model is not None:
    embedding_model = _load_facebook_model(FASTTEXT_MODEL_PATH)
else:
    embedding_model = gs.models.FastText.load_fasttext_format(FASTTEXT_MODEL_PATH)

In [None]:
# Fixed number of rows (tokens) in every embedding matrix.
MAX_PAD = 55


def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array along its first axis up to ``MAX_PAD`` rows.

    Parameters
    ----------
    X_DIM : int
        Number of rows currently in ``value``; ``MAX_PAD - X_DIM`` zero rows
        are appended after them.
    value : numpy.ndarray
        2-D array to pad. The second axis is left untouched.

    Returns
    -------
    numpy.ndarray
        A new array with the extra zero rows appended.
    """
    missing_rows = MAX_PAD - X_DIM
    # Pad only at the end of axis 0; no padding on axis 1.
    pad_widths = ((0, missing_rows), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding(value):
    """Embed ``value`` as a matrix of FastText word vectors with ``MAX_PAD`` rows.

    The text is tokenised with the module-level ``tknzr``, cut to at most
    ``MAX_PAD`` tokens, looked up in ``embedding_model.wv`` and zero-padded by
    ``sequence_padding`` so that every result has exactly ``MAX_PAD`` rows.

    Parameters
    ----------
    value : object
        Text to embed; it is converted with ``str`` before tokenising.

    Returns
    -------
    numpy.ndarray
        One row per token followed by zero rows, ``MAX_PAD`` rows in total.
        Text that yields no tokens gives an all-zero matrix.
    """
    # Slicing is a no-op for short texts and truncates long ones.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # With no tokens the vector lookup would receive an empty list and has
        # nothing to stack, so return a matrix consisting only of padding.
        # NOTE(review): float32 is assumed to match the dtype of the FastText
        # vectors — confirm.
        return np.zeros((MAX_PAD, embedding_model.wv.vector_size), dtype=np.float32)

    embedding = embedding_model.wv[tokens]

    # When exactly MAX_PAD tokens remain this pads by zero rows, i.e. the
    # values are returned unchanged.
    return sequence_padding(embedding.shape[0], embedding)

In [None]:
# Number of top-scoring datasets kept per query.
TOP_K = 50

# One [query_id, 0, dataset_id, rank, score] record per retrieved dataset;
# the next cell prints these records.
final_result_run = []

# Not filled in by this cell; kept so that any code expecting the name still finds it.
result_ndcg = []

# `total` lets tqdm show real progress: iterrows() is a generator without a length.
for _, query_row in tqdm(train_queries.iterrows(), total=len(train_queries)):

    # Current query values.
    query_id = query_row['query_id']
    query_description = str(query_row['query_description'])

    # Candidate datasets listed for this query.
    candidates = fixed_index.loc[fixed_index['label_index'] == query_id]
    if candidates.empty:
        # Nothing to rank; skip instead of handing empty arrays to the model.
        continue

    # BERT inputs, one entry per candidate.
    query_dataset_ids = []
    query_dataset_mask = []
    query_dataset_seg = []

    # FastText inputs, one entry per candidate.
    dataset_title = []
    dataset_description = []

    for candidate_title, candidate_description in zip(candidates['dataset_title'],
                                                      candidates['dataset_description']):

        dataset_title_text = str(candidate_title)
        dataset_description_text = str(candidate_description)

        # Encode (query, dataset title + description) as one BERT sentence pair.
        encoded_pair = bert_tokenizer.encode_plus(
            query_description,
            dataset_title_text + " " + dataset_description_text,
            max_length=MAX_TOKENS,
            add_special_tokens=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_attention_mask=True,
        )

        query_dataset_ids.append(encoded_pair['input_ids'])
        query_dataset_mask.append(encoded_pair['attention_mask'])
        query_dataset_seg.append(encoded_pair['token_type_ids'])

        dataset_title.append(create_embedding(dataset_title_text))
        dataset_description.append(create_embedding(dataset_description_text))

    # BERT arrays.
    query_dataset_ids = np.array(query_dataset_ids)
    query_dataset_mask = np.array(query_dataset_mask)
    query_dataset_seg = np.array(query_dataset_seg)

    # The query embedding is identical for every candidate, so compute it once
    # and repeat it along a new leading axis instead of re-embedding per row.
    query_embedding = create_embedding(query_description)
    query_title = np.repeat(query_embedding[np.newaxis, ...], len(candidates), axis=0)

    dataset_title = np.array(dataset_title)
    dataset_description = np.array(dataset_description)

    # NOTE(review): the query embedding is supplied for two consecutive inputs,
    # exactly as before — confirm this matches the model's input signature.
    predictions = ranking_model.predict([
        query_dataset_ids,
        query_dataset_mask,
        query_dataset_seg,
        query_title,
        query_title,
        dataset_title,
        dataset_description,
    ])

    # First output value of each candidate is its ranking score. Cast to
    # float64 so the scores are stored and printed as plain Python floats.
    scores = np.asarray(predictions)[:, 0].astype(np.float64)

    # Highest score first, then keep the best TOP_K candidates.
    ranked_top = (
        pd.DataFrame({
            'dataset_id': candidates['dataset_id'].values,
            'dataset_title': candidates['dataset_title'].values,
            'dataset_ranking': scores,
        })
        .sort_values('dataset_ranking', ascending=False)
        .head(TOP_K)
    )

    # Ranks are 1-based and restart for every query.
    for rank, (dataset_id, dataset_score) in enumerate(
            zip(ranked_top['dataset_id'], ranked_top['dataset_ranking']), start=1):
        final_result_run.append([query_id, 0, dataset_id, rank, dataset_score])

In [None]:
# Print one space-separated line per record: the five stored fields
# (query id, constant 0, dataset id, rank, score) followed by the run tag.
# Every field is converted with str(), so a numeric query id no longer raises
# TypeError on string concatenation; output for string ids is unchanged.
for record in final_result_run:
    fields = [str(field) for field in record[:5]]
    print(" ".join(fields + ["NYUCIN-E-1"]))