In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
import random
import gensim as gs
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer

In [None]:
# Ask TensorFlow which GPU it can see and stop the notebook immediately
# unless it is the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('found GPU at {}'.format(device_name))
else:
    raise SystemError('GPU not found')

In [None]:
# List the GPU devices TensorFlow reports; the first entry is configured in the next cell.
physical_devices = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Enable memory growth on the first GPU in physical_devices.
# NOTE(review): TensorFlow documents that memory growth must be set before the GPU is
# initialized - confirm that the earlier tf.test.gpu_device_name() call does not already
# initialize it when the notebook is run top to bottom on a fresh kernel.
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [None]:
# Encoder weights are loaded from "fine_tuning_bert"; the tokenizer is loaded from the
# stock "bert-base-cased" checkpoint.
# NOTE(review): presumably "fine_tuning_bert" was fine-tuned from bert-base-cased so the
# vocabulary matches - confirm.
bert_model = TFBertModel.from_pretrained("fine_tuning_bert")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Passed as max_length to the tokenizer: every title/description pair is padded to this length.
MAX_TOKENS = 100

In [None]:
# Load the test articles and the table catalogue. keep_default_na=False keeps empty
# cells as empty strings instead of turning them into NaN.
articles = pd.read_csv(
    '../../dataset/data_articles_test.csv',
    delimiter=',',
    keep_default_na=False,
)

tables = pd.read_csv(
    '../../dataset/distinct_tables_allsignals.csv',
    delimiter=',',
    keep_default_na=False,
)
# Keep only the tables that actually have a page title.
tables = tables[tables.table_page_title != ""]

## Encode articles and tables using the encoder model

In [None]:
# Tokenize every article (title + meta description) into fixed-length BERT inputs.
# Four parallel lists are filled in row order: page ids, token ids, attention masks
# and token-type (segment) ids.
articles_ids = []
input_ids_article_title = []
input_masks_article_title = []
input_token_id_article_title = []

# iterrows() is a generator without a length, so pass total= to get a real progress bar.
for i, row in tqdm(articles.iterrows(), total=len(articles)):

    # str() guards against non-string cells, matching how the description is handled.
    article_page_title = str(row['page_title'])
    article_page_description = str(row['meta_description'])

    # Encode title and description as one BERT sentence pair, padded to MAX_TOKENS.
    # NOTE(review): pad_to_max_length is the legacy spelling of padding='max_length';
    # confirm which form the installed transformers version expects.
    return_tokenizer1 = bert_tokenizer.encode_plus(
        article_page_title,
        article_page_description,
        max_length=MAX_TOKENS,
        add_special_tokens=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_attention_mask=True,
    )

    # Keep the id and the three encoded fields aligned by position.
    articles_ids.append(row['page_id'])

    input_ids_article_title.append(return_tokenizer1['input_ids'])
    input_masks_article_title.append(return_tokenizer1['attention_mask'])
    input_token_id_article_title.append(return_tokenizer1['token_type_ids'])

In [None]:
# Turn the three per-article token lists into NumPy arrays in one step.
input_ids_article_title, input_masks_article_title, input_token_id_article_title = (
    np.array(encoded_rows)
    for encoded_rows in (
        input_ids_article_title,
        input_masks_article_title,
        input_token_id_article_title,
    )
)

In [None]:
input_token_id_article_title.shape

In [None]:
# Encode the articles 100 rows at a time (the same batch size used for the tables)
# instead of pushing the whole test set through BERT in a single forward pass.
# Element [1] of each model output is kept as the article vector.
article_batches = [
    np.asarray(bert_model([
        input_ids_article_title[start:start + 100],
        input_masks_article_title[start:start + 100],
        input_token_id_article_title[start:start + 100],
    ])[1])
    for start in range(0, len(input_ids_article_title), 100)
]

# Join the batches on the row axis; with no articles, fall back to an empty (0, 768) matrix.
articles_vector = np.concatenate(article_batches, axis=0) if article_batches else np.empty((0, 768))

In [None]:
articles_vector.shape

In [None]:
# Tokenize every table (page title + page summary) into fixed-length BERT inputs.
# Four parallel lists are filled in row order: table ids, token ids, attention masks
# and token-type (segment) ids.
tables_ids = []
input_ids_tables_title = []
input_masks_tables_title = []
input_token_id_tables_title = []

# iterrows() is a generator without a length, so pass total= to get a real progress bar.
for i, row in tqdm(tables.iterrows(), total=len(tables)):

    # str() guards against non-string cells, matching how the summary is handled.
    table_title = str(row['table_page_title'])
    table_page_description = str(row['table_page_summary'])

    # Encode title and summary as one BERT sentence pair, padded to MAX_TOKENS.
    # NOTE(review): pad_to_max_length is the legacy spelling of padding='max_length';
    # confirm which form the installed transformers version expects.
    return_tokenizer1 = bert_tokenizer.encode_plus(
        table_title,
        table_page_description,
        max_length=MAX_TOKENS,
        add_special_tokens=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_attention_mask=True,
    )

    # Keep the id and the three encoded fields aligned by position.
    tables_ids.append(row['table_id'])

    input_ids_tables_title.append(return_tokenizer1['input_ids'])
    input_masks_tables_title.append(return_tokenizer1['attention_mask'])
    input_token_id_tables_title.append(return_tokenizer1['token_type_ids'])

In [None]:
# Turn the three per-table token lists into NumPy arrays in one step.
input_ids_tables_title, input_masks_tables_title, input_token_id_tables_title = (
    np.array(encoded_rows)
    for encoded_rows in (
        input_ids_tables_title,
        input_masks_tables_title,
        input_token_id_tables_title,
    )
)

In [None]:
input_ids_tables_title.shape

In [None]:
# Encode the tables 100 rows at a time so each BERT forward pass stays small.
# Element [1] of each model output is kept as the table vectors for that batch.
table_batches = []

for first in tqdm(range(0, len(input_ids_tables_title), 100)):

    i = first + 100

    table_batches.append(np.asarray(bert_model([input_ids_tables_title[first:i],input_masks_tables_title[first:i],input_token_id_tables_title[first:i]])[1]))

# Join the batches on the row axis instead of stacking them: the last batch is shorter
# whenever the number of tables is not a multiple of 100, and a ragged list cannot be
# turned into a regular array and reshaped afterwards. The result is already
# (n_tables, 768), so the later np.array / reshape(-1, 768) steps leave it unchanged.
tables_vector = np.concatenate(table_batches, axis=0) if table_batches else np.empty((0, 768))

In [None]:
# Make sure the collected table embeddings are held in a single NumPy array.
tables_vector = np.array(tables_vector)

In [None]:
tables_vector.shape

In [None]:
# Flatten to one 768-wide row per table. -1 lets NumPy infer the row count
# (an earlier version hard-coded it as 85900).
tables_vector_final = tables_vector.reshape(-1,768)

In [None]:
tables_vector_final.shape

## Test the model on the final task

In [None]:
def getAccuracy(idRankedTables, idQueryGoal):
    """Tell whether the goal id occurs among the ranked tables.

    Parameters
    ----------
    idRankedTables : iterable of indexable rows
        Only the first element of each row is compared with the goal.
    idQueryGoal : object
        The id to look for.

    Returns
    -------
    int
        1 if some row's first element equals ``idQueryGoal``, otherwise 0.
    """
    # any() stops at the first match, like the original early exit from the loop.
    found = any(candidate[0] == idQueryGoal for candidate in idRankedTables)
    return 1 if found else 0

In [None]:
def saveAccuracy(k,accuracy):
    """Record ``accuracy`` in the module-level result list for cut-off ``k``.

    The cut-offs 1, 5, 10, 20, 50 and 100 map to the global lists
    ``AverageTop1`` ... ``AverageTop100``; any other ``k`` is ignored.
    The target list must already exist when the function is called.
    """
    # Each global is looked up lazily, so only the list that is needed has to exist.
    targets = {
        1: lambda: AverageTop1,
        5: lambda: AverageTop5,
        10: lambda: AverageTop10,
        20: lambda: AverageTop20,
        50: lambda: AverageTop50,
        100: lambda: AverageTop100,
    }

    pick = targets.get(k)
    if pick is not None:
        pick().append(accuracy)

In [None]:
# Top-K retrieval evaluation.
# For every article, all tables are ranked by cosine distance to the article vector.
# A cut-off K scores 1 when a table whose table_id equals the article's page_id lies
# within the K closest tables (ties with the K-th distance included), otherwise 0.
# saveAccuracy() appends each 0/1 result to the matching AverageTop<K> list.
AverageTop1 = []
AverageTop5 = []
AverageTop10 = []
AverageTop20 = []
AverageTop50 = []
AverageTop100 = []

topK = [1,5,10,20,50,100]

for i in tqdm(range(len(articles_vector))):

    idQueryGoal = articles_ids[i]

    # reshape(1, -1) keeps the query 2-D without hard-coding the embedding width.
    distance_vector = pairwise_distances(articles_vector[i].reshape(1, -1), tables_vector_final, metric='cosine')

    # Build the score table column-wise in one call rather than creating one dict
    # per table in a Python loop for every article.
    df_all_tables_scores = pd.DataFrame({"table_id": tables_ids, "table_score": distance_vector[0]})
    df_tables_sorting = df_all_tables_scores.sort_values('table_score')

    #compute the accuracy
    for accuracyK in topK:

        selected_top = df_tables_sorting.head(accuracyK)

        # Largest distance inside the top-K; every table at or below it is kept,
        # so tables tied with the K-th one are not cut off arbitrarily.
        min_score = selected_top['table_score'].max()
        draw_tables_socres = df_tables_sorting[df_tables_sorting['table_score'] <= min_score]
        # Column 0 is table_id; keep it 2-D because getAccuracy reads row[0].
        final_ranked_tables = draw_tables_socres.iloc[:,0:1].values

        accuracy_value = getAccuracy(final_ranked_tables,idQueryGoal)

        #save the accuracy on the list
        saveAccuracy(accuracyK,accuracy_value)

In [None]:
# Report the mean hit rate for every cut-off, rounded to four decimals, one per line.
print("\n".join(
    label + str(round(np.mean(hits), 4))
    for label, hits in (
        ("TOP@1 = ", AverageTop1),
        ("TOP@5 = ", AverageTop5),
        ("TOP@10 = ", AverageTop10),
        ("TOP@20 = ", AverageTop20),
        ("TOP@50 = ", AverageTop50),
        ("TOP@100 = ", AverageTop100),
    )
))

In [None]:
articles.head(1)

In [None]:
# Take the row labelled 10 of triplet_data and use its article fields as the anchor text.
# NOTE(review): `triplet_data` is not created in any visible cell of this notebook -
# it has to be loaded before this cell can run on a fresh kernel.
ancor_title = str(triplet_data.loc[10]['article_page_title'])
ancor_description = str(triplet_data.loc[10]['article_page_meta_description'])

In [None]:
# Tokenize the anchor title/description as one BERT sentence pair, padded to MAX_TOKENS.
return_tokenizer1 = bert_tokenizer.encode_plus(
    ancor_title,
    ancor_description,
    max_length=MAX_TOKENS,
    add_special_tokens=True,
    return_token_type_ids=True,
    pad_to_max_length=True,
    return_attention_mask=True,
)

# Wrap each encoded field in a one-element list, i.e. a batch holding a single example.
input_ids_ancor = [return_tokenizer1['input_ids']]
input_masks_ancor = [return_tokenizer1['attention_mask']]
input_token_id_ancor = [return_tokenizer1['token_type_ids']]

In [None]:
# Turn the three single-example lists into NumPy arrays in one step.
input_ids_ancor, input_masks_ancor, input_token_id_ancor = (
    np.array(encoded_rows)
    for encoded_rows in (input_ids_ancor, input_masks_ancor, input_token_id_ancor)
)

In [None]:
# Run the anchor inputs through BERT and keep element [1] of the output as the anchor vector.
# NOTE(review): [1] is presumably the pooled sentence-level output - confirm for the
# transformers version in use.
ancor_vector = np.asarray(bert_model([input_ids_ancor,input_masks_ancor,input_token_id_ancor])[1])

In [None]:
# Text of the "true" table from the same triplet_data row (label 10).
# NOTE(review): `triplet_data` is not created in any visible cell of this notebook.
ancor_true_title = str(triplet_data.loc[10]['true_table_page_title'])
ancor_true_description = str(triplet_data.loc[10]['true_table_page_summary'])

In [None]:
# Tokenize the "true" table title/summary as one BERT sentence pair, padded to MAX_TOKENS.
# The *_ancor input variables are reused here and now hold the true table's inputs.
return_tokenizer1 = bert_tokenizer.encode_plus(
    ancor_true_title,
    ancor_true_description,
    max_length=MAX_TOKENS,
    add_special_tokens=True,
    return_token_type_ids=True,
    pad_to_max_length=True,
    return_attention_mask=True,
)

# Wrap each encoded field in a one-element list, i.e. a batch holding a single example.
input_ids_ancor = [return_tokenizer1['input_ids']]
input_masks_ancor = [return_tokenizer1['attention_mask']]
input_token_id_ancor = [return_tokenizer1['token_type_ids']]

In [None]:
# Turn the three single-example lists into NumPy arrays in one step.
input_ids_ancor, input_masks_ancor, input_token_id_ancor = (
    np.array(encoded_rows)
    for encoded_rows in (input_ids_ancor, input_masks_ancor, input_token_id_ancor)
)

In [None]:
# Run the current *_ancor inputs through BERT and keep element [1] of the output as the
# vector of the "true" table.
ancor_true_vector = np.asarray(bert_model([input_ids_ancor,input_masks_ancor,input_token_id_ancor])[1])

In [None]:
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels

In [None]:
# Cosine distance between the anchor vector and the "true" table vector.
distance_ancor_true = pairwise_distances(ancor_vector, ancor_true_vector, metric='cosine')

In [None]:
distance_ancor_true

In [None]:
# Text of the "false" table from the same triplet_data row (label 10).
# NOTE(review): `triplet_data` is not created in any visible cell of this notebook.
ancor_false_title = str(triplet_data.loc[10]['false_table_page_title'])
ancor_false_description = str(triplet_data.loc[10]['false_table_page_summary'])

In [None]:
# Tokenize the "false" table title/summary as one BERT sentence pair, padded to MAX_TOKENS.
# The *_ancor input variables are reused here and now hold the false table's inputs.
return_tokenizer1 = bert_tokenizer.encode_plus(
    ancor_false_title,
    ancor_false_description,
    max_length=MAX_TOKENS,
    add_special_tokens=True,
    return_token_type_ids=True,
    pad_to_max_length=True,
    return_attention_mask=True,
)

# Wrap each encoded field in a one-element list, i.e. a batch holding a single example.
input_ids_ancor = [return_tokenizer1['input_ids']]
input_masks_ancor = [return_tokenizer1['attention_mask']]
input_token_id_ancor = [return_tokenizer1['token_type_ids']]

In [None]:
# Turn the three single-example lists into NumPy arrays in one step.
input_ids_ancor, input_masks_ancor, input_token_id_ancor = (
    np.array(encoded_rows)
    for encoded_rows in (input_ids_ancor, input_masks_ancor, input_token_id_ancor)
)

In [None]:
# Run the current *_ancor inputs through BERT and keep element [1] of the output as the
# vector of the "false" table.
ancor_false_vector = np.asarray(bert_model([input_ids_ancor,input_masks_ancor,input_token_id_ancor])[1])

In [None]:
# Cosine distance between the anchor vector and the "false" table vector.
distance_ancor_false = pairwise_distances(ancor_vector, ancor_false_vector, metric='cosine')

In [None]:
# Show both cosine distances together for comparison.
print("distance to true "+str(distance_ancor_true))
print("distance to false "+str(distance_ancor_false))

In [None]:
# Two short example strings, tokenized as a sentence pair in the next cell.
sample_text1 = "i love you"
sample_text2 = "glory finally nabs dodd league the world game sbs"

In [None]:
# Tokenize the two sample strings as one sentence pair, padded to a length of 30.
# This overwrites return_tokenizer1 from the earlier encoding cells.
return_tokenizer1 = bert_tokenizer.encode_plus(
sample_text1,
sample_text2,
max_length=30,
add_special_tokens=True,
return_token_type_ids=True,
pad_to_max_length=True,
return_attention_mask=True,
)

In [None]:
# Look at the three encoded fields of the sample pair. A stray ")" after the first
# expression used to make this cell a SyntaxError. A notebook echoes only the last
# expression of a cell, so the following cells show each field on its own.
return_tokenizer1['input_ids']
return_tokenizer1['attention_mask']
return_tokenizer1['token_type_ids']

In [None]:
return_tokenizer1['input_ids']

In [None]:
return_tokenizer1['attention_mask']

In [None]:
return_tokenizer1['token_type_ids']