In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings('ignore')
import fasttext

In [2]:
train_data = pd.read_csv('../train_data/train_triple_all_signals.csv', delimiter=',')
validation_data = pd.read_csv('../train_data/validation_triple_all_signals.csv', delimiter=',')

In [3]:
Y_train_dummy = np.empty(len(train_data))
Y_validation_dummy = np.empty(len(validation_data))

In [4]:
embedding_model = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

In [5]:
MAX_PAD = 31

def sequence_padding(X_DIM, value):
    
    value_padding = np.pad(value, ((0,MAX_PAD - X_DIM),(0,0)), 'constant')
    
    return value_padding

In [6]:
def create_embedding(value):

    value = tknzr.tokenize(str(value))
    
    if len(value) < MAX_PAD:
        
        embedding = embedding_model.wv[value]
        embedding = embedding.astype('float16')
        
        padding_embedding = sequence_padding(embedding.shape[0],embedding)
        
        return padding_embedding
        
    else:
        
        embedding = embedding_model.wv[value[0:MAX_PAD]]
        embedding = embedding.astype('float16')
        
        return embedding

In [7]:
train_article_page_title = []
train_true_table_page_title = []
train_false_table_page_title = []

for i,row in tqdm(train_data.iterrows()):
    
    train_article_page_title.append(create_embedding(row['article_page_title']))
    train_true_table_page_title.append(create_embedding(row['true_table_page_title']))
    train_false_table_page_title.append(create_embedding(row['false_table_page_title']))

84750it [00:34, 2461.49it/s]


In [8]:
train_article_page_title = np.array(train_article_page_title,dtype='float16')
train_true_table_page_title = np.array(train_true_table_page_title,dtype='float16')
train_false_table_page_title = np.array(train_false_table_page_title,dtype='float16')

In [9]:
validation_article_page_title = []
validation_true_table_page_title = []
validation_false_table_page_title = []

for i,row in tqdm(validation_data.iterrows()):
    
    validation_article_page_title.append(create_embedding(row['article_page_title']))
    validation_true_table_page_title.append(create_embedding(row['true_table_page_title']))
    validation_false_table_page_title.append(create_embedding(row['false_table_page_title']))

9417it [00:03, 2483.94it/s]


In [10]:
validation_article_page_title = np.array(validation_article_page_title,dtype='float16')
validation_true_table_page_title = np.array(validation_true_table_page_title,dtype='float16')
validation_false_table_page_title = np.array(validation_false_table_page_title,dtype='float16')

In [11]:
def triplet_loss(y_true, y_pred, alpha = 0.1):
     
    anchor = y_pred[:,0:64]
    positive = y_pred[:,64:128]
    negative = y_pred[:,128:192]
    
    # distance between the anchor and the positive
    #pos_dist = tf.keras.backend.sum(tf.keras.backend.square(anchor-positive),axis=1)
    pos_dist = tf.keras.layers.Dot(axes=1,normalize=True)([anchor, positive])

    # distance between the anchor and the negative
    #neg_dist = tf.keras.backend.sum(tf.keras.backend.square(anchor-negative),axis=1)
    neg_dist = tf.keras.layers.Dot(axes=1,normalize=True)([anchor, negative])

    # compute loss
    basic_loss = pos_dist-neg_dist+alpha
    loss = tf.keras.backend.maximum(basic_loss,0.0)
 
    return loss

In [12]:
def shared_encoder_model_title():
    
    article_page_title = tf.keras.Input(shape=(31,50), dtype='float32')
    true_table_page_title = tf.keras.Input(shape=(31,50), dtype='float32')
    false_table_page_title = tf.keras.Input(shape=(31,50), dtype='float32')

    context_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))

    #context vectors
    context_article_page_title = context_layer(article_page_title)
    context_true_table_page_title = context_layer(true_table_page_title)
    context_false_table_page_title = context_layer(false_table_page_title)

    concatenated = tf.keras.layers.Concatenate(axis=-1)([context_article_page_title, context_true_table_page_title, context_false_table_page_title])

    model = tf.keras.Model(inputs=[article_page_title,true_table_page_title,false_table_page_title],outputs=concatenated)

    return model

In [13]:
model = shared_encoder_model_title()

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 31, 50)]     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 31, 50)]     0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 31, 50)]     0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 64)           16128       input_1[0][0]                    
                                                                 input_2[0][0]                

In [15]:
model.compile(loss=triplet_loss,optimizer="adam")

In [16]:
filepath="encoder_title_{epoch:02d}_{val_loss:.4f}.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [17]:
with tf.device("/cpu:0"):

    history = model.fit([train_article_page_title,train_true_table_page_title,train_false_table_page_title], Y_train_dummy, 
          epochs=20, 
          batch_size=32,
          verbose=1,
          validation_data=([validation_article_page_title,validation_true_table_page_title,validation_false_table_page_title], Y_validation_dummy),
          callbacks=callbacks_list)

Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.10001, saving model to encoder_title_01_0.1000.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.10001 to 0.10000, saving model to encoder_title_02_0.1000.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.10000 to 0.10000, saving model to encoder_title_03_0.1000.h5
Epoch 4/20
Epoch 00004: val_loss improved from 0.10000 to 0.10000, saving model to encoder_title_04_0.1000.h5
Epoch 5/20
Epoch 00005: val_loss improved from 0.10000 to 0.10000, saving model to encoder_title_05_0.1000.h5
Epoch 6/20
Epoch 00006: val_loss improved from 0.10000 to 0.10000, saving model to encoder_title_06_0.1000.h5
Epoch 7/20
Epoch 00007: val_loss improved from 0.10000 to 0.08712, saving model to encoder_title_07_0.0871.h5
Epoch 8/20
Epoch 00008: val_loss improved from 0.08712 to 0.08590, saving model to encoder_title_08_0.0859.h5
Epoch 9/20
Epoch 00009: val_loss improved from 0.08590 to 0.08489, saving model to encoder_title_09_0.0849.h5
Epoch 10/20
Ep