In [1]:
import warnings
import io
import random
import numpy as np
import mxnet as mx
import gluonnlp as nlp
from bert import data, model
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from heapq import nsmallest

In [2]:
warnings.filterwarnings('ignore')

In [3]:
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)
# change `ctx` to `mx.cpu()` if no GPU is available.
ctx = mx.cpu()
#ctx = mx.gpu(0)

In [4]:
read_file = pd.read_csv('dataset/train_dataset_1_1', delimiter=',', header=None)
train_dataset = read_file.iloc[:,:].values

In [5]:
bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
                         
                                            dataset_name='book_corpus_wiki_en_uncased',
                                             pretrained=True, ctx=ctx, use_pooler=True,
                                             use_decoder=False, use_classifier=False)

In [9]:
bert_classifier = model.classification.BERTClassifier(bert_base, num_classes=2, dropout=0.1)
# only need to initialize the classifier layer.
bert_classifier.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
bert_classifier.hybridize(static_alloc=True)

# softmax cross entropy loss for classification
loss_function = mx.gluon.loss.SoftmaxCELoss()
loss_function.hybridize(static_alloc=True)

metric = mx.metric.Accuracy()

In [10]:
# Skip the first line, which is the schema
num_discard_samples = 1
# Split fields by tabs
field_separator = nlp.data.Splitter('\t')
# Fields to select from the file
field_indices = [0, 1, 2]
data_train_raw = nlp.data.TSVDataset(filename='fine_tuning_data.tsv',
                                 field_separator=field_separator,
                                 num_discard_samples=num_discard_samples,
                                 field_indices=field_indices)

In [11]:
# Use the vocabulary from pre-trained model for tokenization
bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)

# The maximum length of an input sequence
max_len = 128

# The labels for the two classes [(0 = not similar) or  (1 = similar)]
all_labels = ["0", "1"]

# whether to transform the data as sentence pairs.
# for single sentence classification, set pair=False
# for regression task, set class_labels=None
# for inference without label available, set has_label=False
pair = True
transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_len,
                                                class_labels=all_labels,
                                                has_label=True,
                                                pad=True,
                                                pair=pair)
data_train = data_train_raw.transform(transform)

In [None]:
batch_size = 32
lr = 5e-6

# The FixedBucketSampler and the DataLoader for making the mini-batches
train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[1]) for item in data_train],
                                            batch_size=batch_size,
                                            shuffle=True)
bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler)

trainer = mx.gluon.Trainer(bert_classifier.collect_params(), 'adam',
                           {'learning_rate': lr, 'epsilon': 1e-9})

# Collect all differentiable parameters
# `grad_req == 'null'` indicates no gradients are calculated (e.g. constant parameters)
# The gradients for these params are clipped later
params = [p for p in bert_classifier.collect_params().values() if p.grad_req != 'null']
grad_clip = 1

# Training the model with only three epochs
log_interval = 4
num_epochs = 4
for epoch_id in range(num_epochs):
    metric.reset()
    step_loss = 0
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(bert_dataloader):
        with mx.autograd.record():

            # Load the data to the GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # Forward computation
            out = bert_classifier(token_ids, segment_ids, valid_length.astype('float32'))
            ls = loss_function(out, label).mean()

        # And backwards computation
        ls.backward()

        # Gradient clipping
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(params, 1)
        trainer.update(1)

        step_loss += ls.asscalar()
        metric.update([label], [out])

        # Printing vital information
        if (batch_id + 1) % (log_interval) == 0:
            print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
                         .format(epoch_id, batch_id + 1, len(bert_dataloader),
                                 step_loss / log_interval,
                                 trainer.learning_rate, metric.get()[1]))
            step_loss = 0

In [6]:
articles = pd.read_csv('dataset/test_dataset', delimiter=',', header=None)
data_articles = articles.iloc[:,:].values

In [7]:
read = pd.read_csv('dataset/cleanDataTables', delimiter=',', header=None)
data_tables = read.iloc[:,:].values

In [8]:
tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True);
transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);

In [None]:
articles_title = []
articles_id = []
article_dense_vector = []

for article_id, title, text in tqdm(data_articles):
    
    articles_id.append(article_id)
    
    articles_title.append(title)

    sample = transform(title)
    words, valid_len, segments = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]])
    seq_encoding, cls_encoding = bert_base(words, segments, valid_len)
    
    article_dense_vector.append(cls_encoding[0].asnumpy())

In [None]:
type(cls_encoding[0].asnumpy().tolist())

In [14]:
tables_title = []
tables_dense_vector = []

for current_table in tqdm(data_tables):
    
    table_title = str(current_table[1])
    
    sample = transform(table_title)
    words, valid_len, segments = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]])
    seq_encoding, cls_encoding = bert_base(words, segments, valid_len)
    
    tables_dense_vector.append(cls_encoding[0].asnumpy())

100%|██████████| 1000/1000 [03:29<00:00,  4.77it/s]


In [15]:
def get_id_ranked_tables(top_k,distance_vector):

    id_ranked_tables = []

    for current_top_k in top_k:
        
        index = np.where(distance_vector == current_top_k)
         
        index_colummun = index[0][0]
        
        id_ranked_tables.append(data_tables[index_colummun][0])

    return id_ranked_tables

In [16]:
def get_accuracy(id_ranked_tables, id_query_goal):

    accuracy = 0

    for id_table in id_ranked_tables:
    
        if id_table == id_query_goal:
    
            accuracy = 1
            
            break;

    return accuracy

In [17]:
def save_accuracy(k,accuracy):
    
    if k == 1:
            
        average_top1.append(accuracy)
        
    if k == 10:
            
        average_top10.append(accuracy)
        
    if k == 100:
            
        average_top100.append(accuracy)
        
    if k == 1000:
            
        average_top1000.append(accuracy)

In [18]:
average_top1 = []
average_top10 = []
average_top100 = []
average_top1000 = []

top_k = [1,10,100,1000]

for i in tqdm(range(len(article_dense_vector))):
    
    distance_vector = pairwise_distances(article_dense_vector[i].reshape(1,768), tables_dense_vector, metric='cosine')
    
    id_query_goal = int(articles_id[i])
    
    for accuracy_k in top_k:
        
        count_top_tables = accuracy_k
        
        top_k_rank = nsmallest(count_top_tables, distance_vector[0])
    
        id_ranked_tables = get_id_ranked_tables(top_k_rank,distance_vector[0])
        
        accuracy_value = get_accuracy(id_ranked_tables,id_query_goal)
        
        #save the accuracy on the list
        save_accuracy(accuracy_k,accuracy_value)

100%|██████████| 10/10 [00:00<00:00, 84.26it/s]


In [19]:
print(str(round(np.mean(average_top1),4))+" (±) "+str(round(np.std(average_top1),4)))
print(str(round(np.mean(average_top10),4))+" (±) "+str(round(np.std(average_top10),4)))
print(str(round(np.mean(average_top100),4))+" (±) "+str(round(np.std(average_top100),4)))
print(str(round(np.mean(average_top1000),4))+" (±) "+str(round(np.std(average_top1000),4)))

0.0 (±) 0.0
0.0 (±) 0.0
0.0 (±) 0.0
0.0 (±) 0.0


In [14]:
tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True);
transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);

In [94]:
sample1 = transform('jame and jane run')
words, valid_len, segments = mx.nd.array([sample1[0]]), mx.nd.array([sample1[1]]), mx.nd.array([sample1[2]])
seq_encoding1, cls_encoding1 = bert_base(words, segments, valid_len)

In [81]:
seq_encoding1.shape

(1, 3, 768)

In [86]:
seq_encoding1[0][0]


[-6.02772892e-01 -1.72405541e-01 -1.84681669e-01 -6.55842200e-02
  3.42442006e-01 -6.24973364e-02  2.13785559e-01  9.78442729e-02
 -1.78641543e-01 -4.26937878e-01 -1.76355153e-01  1.28406122e-01
 -3.26469213e-01  2.90050786e-02  2.21558154e-01  1.21638276e-01
 -1.41841516e-01  7.30636716e-03 -4.27612402e-02 -1.52703419e-01
 -2.39086121e-01 -2.92528085e-02 -1.37085691e-01  2.60217935e-02
  3.06844085e-01 -1.22767858e-01 -3.39478910e-01  9.59354565e-02
  5.50924763e-02  1.10648148e-01 -7.75851980e-02  3.13839436e-01
 -3.34213197e-01  2.60411561e-01 -2.96429574e-01 -7.26973638e-03
 -6.12655580e-02  1.70845047e-01 -2.64047295e-01  4.10398424e-01
  1.21218324e-01  1.64628793e-02  2.14213841e-02 -3.47434670e-01
 -5.37217893e-02 -3.10769856e-01 -2.00013947e+00  1.65347621e-01
 -1.73762351e-01 -1.00587651e-01 -2.21746918e-02 -4.72985879e-02
 -3.22356820e-02  1.42506003e-01  1.08631931e-01  2.53788233e-01
 -2.83724815e-01  5.05539358e-01  7.10337684e-02 -2.57969618e-01
  4.26791534e-02 -3.6768

In [87]:
sample2 = transform('I love you')
words, valid_len, segments = mx.nd.array([sample2[0]]), mx.nd.array([sample2[1]]), mx.nd.array([sample2[2]])
seq_encoding2, cls_encoding2 = bert_base(words, segments, valid_len)

In [88]:
seq_encoding2.shape

(1, 3, 768)

In [89]:
seq_encoding2[0][0]


[-8.53405371e-02  1.72241807e-01  4.25227992e-02 -1.76750571e-01
 -5.46884090e-02 -1.53339043e-01  6.83971494e-02  1.43541589e-01
 -8.35241824e-02 -2.51421839e-01 -2.33200133e-01 -1.30194485e-01
 -1.00137986e-01  3.78875762e-01  1.77219927e-01  9.46984440e-03
 -1.00414135e-01  1.22511484e-01  2.59887129e-02 -1.79584563e-01
 -6.09559603e-02  5.48135638e-02 -1.65936962e-01  2.52952501e-02
 -1.15975579e-02 -5.36704361e-02 -5.11292405e-02  3.11615095e-02
 -3.20551023e-02  1.44424409e-01  4.28734757e-02  1.32411718e-01
 -1.43217862e-01  3.08850616e-01 -1.23955853e-01  6.58906177e-02
  1.08224571e-01  3.92213985e-02  6.74559399e-02  1.06134176e-01
  7.04268664e-02 -4.32595238e-02  1.91902086e-01  1.66571662e-02
  3.22372019e-02 -9.54633653e-02 -1.46588135e+00 -9.01058689e-02
 -8.18717629e-02 -1.62360668e-01 -7.76566863e-02  7.61733949e-02
  5.15031368e-02  2.74288654e-01 -6.70688003e-02  3.23203117e-01
 -1.22579895e-01  4.82781559e-01  6.24115951e-02 -1.45150051e-02
  2.53108680e-01 -5.3285