In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance
from heapq import nsmallest
from sklearn.metrics import pairwise_distances
import random
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [2]:
articles = pd.read_csv('articlesDataset/articlesTestDataset', delimiter=',', header=None)
tables = pd.read_csv('articlesDataset/cleanDataTables', delimiter=',', header=None)

In [3]:
formattedArticles = articles.iloc[:,:].values
formattedTables = tables.iloc[:,:].values

In [4]:
embedding_model = Doc2Vec.load('pre-trained-models/doc2vec.bin')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
tablesByCharacteristics = []

for tables in tqdm(formattedTables):
    
    field = str(tables[1])
    
    vector_words = tknzr.tokenize(field)
    
    word_embedding = embedding_model.infer_vector(vector_words)
    
    tablesByCharacteristics.append(word_embedding)

100%|██████████| 298793/298793 [10:04<00:00, 494.43it/s]


In [6]:
corpusArticlesID = []
articlesByCharacteristics = []

for articles in tqdm(formattedArticles[0:1000]):
    
    articlePgID = articles[0]
    
    articleTitle = str(articles[1])
    
    articleText = str(articles[2])
    
    catchAll = articleTitle + ' ' + articleText
    
    corpusArticlesID.append(articlePgID)
        
    #embedding
    vector_words = tknzr.tokenize(articleText)
    
    word_embedding = embedding_model.infer_vector(vector_words)
    
    articlesByCharacteristics.append(word_embedding)

100%|██████████| 1000/1000 [06:29<00:00,  3.17it/s]


In [7]:
def getIdRankedTalbes(topK,distanceVector):

    idRankedTables = []

    for topkDistance in topK:
        
        index = np.where(distanceVector == topkDistance)
         
        indexColummun = index[0][0]
        
        idRankedTables.append(formattedTables[indexColummun][0])

    return idRankedTables

In [8]:
def getAccuracy(idRankedTables, idQueryGoal):

    accuracy = 0

    for idTable in idRankedTables:
    
        if idTable == idQueryGoal:
    
            accuracy = 1
            break;

    return accuracy

In [9]:
def saveAccuracy(k,accuracy):
    
    if k == 1:
            
        AverageTop1.append(accuracy)
        
    if k == 10:
            
        AverageTop10.append(accuracy)
        
    if k == 100:
            
        AverageTop100.append(accuracy)
        
    if k == 1000:
            
        AverageTop1000.append(accuracy)

In [10]:
AverageTop1 = []
AverageTop10 = []
AverageTop100 = []
AverageTop1000 = []

topK = [1,10,100,1000]

for i in tqdm(range(len(articlesByCharacteristics))):
    
    distanceVector = pairwise_distances(articlesByCharacteristics[i].reshape(1,300), tablesByCharacteristics, metric='cosine')
    
    idQueryGoal = int(corpusArticlesID[i])
    
    for accuracyK in topK:
        
        countTopTables = accuracyK
        
        topKRank = nsmallest(countTopTables, distanceVector[0])
    
        idRankedTables = getIdRankedTalbes(topKRank,distanceVector[0])
        
        accuracy_value = getAccuracy(idRankedTables,idQueryGoal)
        
        #save the accuracy on the list
        saveAccuracy(accuracyK,accuracy_value)

100%|██████████| 1000/1000 [2:09:14<00:00,  6.62s/it] 


In [11]:
print(str(round(np.mean(AverageTop1),4))+" (±) "+str(round(np.std(AverageTop1),4)))
print(str(round(np.mean(AverageTop10),4))+" (±) "+str(round(np.std(AverageTop10),4)))
print(str(round(np.mean(AverageTop100),4))+" (±) "+str(round(np.std(AverageTop100),4)))
print(str(round(np.mean(AverageTop1000),4))+" (±) "+str(round(np.std(AverageTop1000),4)))

0.002 (±) 0.0447
0.009 (±) 0.0944
0.034 (±) 0.1812
0.109 (±) 0.3116
