In [1]:
import pickle
import re
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance
from heapq import nsmallest
from sklearn.metrics import pairwise_distances

In the block below we are reading the raw data from articles and tables.

In [2]:
rawArticles = pickle.load(open("articlesDataset/final_article_tables.pkl","rb"))
rawTables = pickle.load(open("articlesDataset/final_dict_tables.pkl","rb"))

In the block next, we are formatting the articles data by removing links, multiples spaces and special characters. Each article has several links. Then we get each one. Finally, we read all text of each link.

In the next block, we remove empty values from articles.

In [4]:
for article in tqdm(formattedArticles):
    if (article[1] == ''):
        formattedArticles.remove(article)

 98%|█████████▊| 291676/298725 [00:47<00:01, 6117.12it/s]


In the block next, we are reading and formatting the raw table data. Each indice may have multiple tables. So, we collect each table separately. Next, we add tables in the corpus.

In [3]:
formattedTables = []

for i in tqdm(rawTables.keys()):
    
    dataTable = rawTables[i]
    
    totalTalbes = len(dataTable)
    
    for j in range(totalTalbes):
        
        keyTable = dataTable[j][0][0]
        
        keyCaption = dataTable[j][0][1]
        
        tableText = str(dataTable[j][1:])
    
        fullTextTable = re.sub('[^A-Za-z]+',' ',tableText)
        
        #removing left and right spaces
        fullTextTable = fullTextTable.lstrip()
        fullTextTable = fullTextTable.rstrip()
        
        textPlusCaption = fullTextTable +" "+keyCaption
        
        formattedTables.append([keyTable,keyCaption,fullTextTable,textPlusCaption])

100%|██████████| 90000/90000 [00:21<00:00, 4114.03it/s]


In [4]:
for tables in tqdm(formattedTables):
    if (tables[2] == ''):
        formattedTables.remove(tables)

 97%|█████████▋| 298855/308759 [00:31<00:01, 9541.27it/s] 


tf-idf method

In [5]:
corpusTables = []

for tables in formattedTables:
    
    corpusTables.append(tables[3])

In [8]:
corpusArticles = []

for articles in formattedArticles:
    
    corpusArticles.append(articles[1])

#temp1 = np.asarray(formattedArticles)
#corpusArticles = temp1[:,1]

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', analyzer='word')
tablesByCharacteristics = vectorizer.fit_transform(corpusTables)

In [48]:
articlesByCharacteristics = vectorizer.transform(corpusArticles)

In [11]:
def getTotalTablesByArticle(idArticle):

    countTopTables = 0

    for tables in formattedTables:
    
        if tables[0] == int(idArticle):
        
             countTopTables = countTopTables + 1

    return countTopTables

In [12]:
def getIdRankedTalbes(topK,distanceVector):

    idRankedTables = []

    for topkDistance in topK:
    
        #index = distanceVector.index(topkDistance)
        
        index = np.where(distanceVector == topkDistance)
         
        indexColummun = index[0][0]
        
        idRankedTables.append(formattedTables[indexColummun][0])

    return idRankedTables

In [13]:
def getAccuracy(idRankedTables, idQueryGoal):

    totalRanquedTables = len(idRankedTables)

    match = 0

    for idTable in idRankedTables:
    
        if idTable == idQueryGoal:
    
            match = match + 1

    accuracy = match / totalRanquedTables
    
    return accuracy

In [49]:
accuracy = []

#articlesByCharacteristics.shape[0])

for j in tqdm(range(100)):

    distanceVector = pairwise_distances(articlesByCharacteristics[j], tablesByCharacteristics, metric='cosine')

    idQueryGoal = int(formattedArticles[j][0])

    countTopTables = getTotalTablesByArticle(idQueryGoal)
    
    topK = nsmallest(countTopTables, distanceVector[0])

    idRankedTables = getIdRankedTalbes(topK,distanceVector[0])
    
    accuracy.append(getAccuracy(idRankedTables,idQueryGoal))


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:01<01:40,  1.01s/it][A
  2%|▏         | 2/100 [00:01<01:32,  1.06it/s][A
  3%|▎         | 3/100 [00:02<01:27,  1.11it/s][A
  4%|▍         | 4/100 [00:03<01:22,  1.16it/s][A
  5%|▌         | 5/100 [00:04<01:22,  1.15it/s][A
  6%|▌         | 6/100 [00:05<01:19,  1.18it/s][A
  7%|▋         | 7/100 [00:05<01:18,  1.19it/s][A
  8%|▊         | 8/100 [00:06<01:18,  1.17it/s][A
  9%|▉         | 9/100 [00:07<01:19,  1.15it/s][A
 10%|█         | 10/100 [00:08<01:15,  1.19it/s][A
 11%|█         | 11/100 [00:09<01:14,  1.20it/s][A
 12%|█▏        | 12/100 [00:10<01:14,  1.19it/s][A
 13%|█▎        | 13/100 [00:10<01:12,  1.20it/s][A
 14%|█▍        | 14/100 [00:11<01:10,  1.21it/s][A
 15%|█▌        | 15/100 [00:12<01:08,  1.24it/s][A
 16%|█▌        | 16/100 [00:13<01:09,  1.21it/s][A
 17%|█▋        | 17/100 [00:14<01:08,  1.20it/s][A
 18%|█▊        | 18/100 [00:15<01:09,  1.17it/s][A
 19%|█▉        | 19/100 [00:1

In [58]:
np.mean(accuracy)

0.05333333333333334

In [None]:
data1 = pd.read_csv('cleanArticles', delimiter=',', header=None)
x1 = data1.iloc[:,0].values