In [None]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings('ignore')

In [None]:
# Abort early unless TensorFlow reports the first GPU under its canonical name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'found GPU at {device_name}')
else:
    raise SystemError('GPU not found')

In [None]:
# List the GPUs visible to TensorFlow; the next cell configures their memory behaviour.
physical_devices = tf.config.list_physical_devices('GPU')

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU rather than only index 0: memory
# growth has to be configured consistently across GPUs, and iterating also
# avoids an IndexError when the device list is empty.
# NOTE(review): this must run before TensorFlow initialises the GPUs — confirm
# the earlier gpu_device_name() check does not already do that.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Load the training and validation triples (article, true table, false table) from CSV.
train_data = pd.read_csv(
    '../train_data/train_triple_all_signals.csv',
    sep=',',
)
validation_data = pd.read_csv(
    '../train_data/validation_triple_all_signals.csv',
    sep=',',
)

In [None]:
# Keras' fit() expects a target array, but triplet_loss never reads y_true, so
# these are placeholders with one entry per row. np.zeros is used instead of
# np.empty so the placeholders are deterministic and can never contain the
# NaN/inf garbage that uninitialised memory may hold.
Y_train_dummy = np.zeros(len(train_data))
Y_validation_dummy = np.zeros(len(validation_data))

In [None]:
# Load the pre-trained FastText model from disk; create_embedding() looks up
# token vectors through its `.wv` attribute.
# NOTE(review): the file name suggests 50-dimensional vectors, which would match
# the (161, 50) model input shape — confirm.
embedding_model = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

In [None]:
# Fixed number of token rows every embedded sequence is padded to.
MAX_PAD = 161


def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array along its first axis up to ``MAX_PAD`` rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows in ``value``; must not exceed ``MAX_PAD``.
    value : array-like, 2-D
        Matrix to pad. Rows are appended at the end; columns are untouched.

    Returns
    -------
    numpy.ndarray
        Array with ``MAX_PAD - X_DIM`` extra zero rows appended.
    """
    missing_rows = MAX_PAD - X_DIM
    # (before, after) pad widths per axis: only trailing rows are added.
    pad_widths = ((0, missing_rows), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding(value):
    """Tokenize ``value`` and embed it as a fixed-size float16 matrix.

    The text is tokenized with the module-level TweetTokenizer, truncated to at
    most ``MAX_PAD`` tokens, looked up in the FastText model and zero-padded at
    the end so the result always has ``MAX_PAD`` rows.

    Parameters
    ----------
    value : object
        Text to embed; it is converted with ``str()`` first, so non-string
        values (e.g. NaN) are embedded as their string form.

    Returns
    -------
    numpy.ndarray
        float16 array of shape ``(MAX_PAD, vector_size)``.
    """
    # Truncate up front so a single code path handles short and long texts.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # An empty token list cannot be looked up in the model (there are no
        # per-token vectors to stack), so return an all-padding matrix instead.
        return np.zeros((MAX_PAD, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    if embedding.shape[0] < MAX_PAD:
        return sequence_padding(embedding.shape[0], embedding)

    return embedding

In [None]:
# Display the first training row to inspect the available columns.
train_data.head(1)

In [None]:
# Embedded training inputs: one create_embedding() matrix per row, for the
# article (anchor), the matching table (positive) and the non-matching table (negative).
train_article_page_title = []
train_true_table_page_title = []
train_false_table_page_title = []

# iterrows() is a generator without a length, so the total is passed explicitly;
# otherwise tqdm can only show a running count, not a progress bar or ETA.
for i, row in tqdm(train_data.iterrows(), total=len(train_data)):

    # Merge title, description/summary and keywords into one "catch-all" text per page.
    article_catch_all = " ".join(str(row[column]) for column in (
        'article_page_title', 'article_page_meta_description', 'article_page_keywords'))
    true_table_catch_all = " ".join(str(row[column]) for column in (
        'true_table_page_title', 'true_table_page_summary', 'true_table_page_keywords'))
    false_table_catch_all = " ".join(str(row[column]) for column in (
        'false_table_page_title', 'false_table_page_summary', 'false_table_page_keywords'))

    train_article_page_title.append(create_embedding(article_catch_all))
    train_true_table_page_title.append(create_embedding(true_table_catch_all))
    train_false_table_page_title.append(create_embedding(false_table_catch_all))

In [None]:
# Stack the per-row matrices into single float16 arrays, one input at a time so
# each Python list can be released before the next array is built.
train_article_page_title = np.array(train_article_page_title, dtype=np.float16)
train_true_table_page_title = np.array(train_true_table_page_title, dtype=np.float16)
train_false_table_page_title = np.array(train_false_table_page_title, dtype=np.float16)

In [None]:
# Embedded validation inputs: one create_embedding() matrix per row, for the
# article (anchor), the matching table (positive) and the non-matching table (negative).
validation_article_page_title = []
validation_true_table_page_title = []
validation_false_table_page_title = []

# iterrows() is a generator without a length, so the total is passed explicitly;
# otherwise tqdm can only show a running count, not a progress bar or ETA.
for i, row in tqdm(validation_data.iterrows(), total=len(validation_data)):

    # Merge title, description/summary and keywords into one "catch-all" text per page.
    article_catch_all = " ".join(str(row[column]) for column in (
        'article_page_title', 'article_page_meta_description', 'article_page_keywords'))
    true_table_catch_all = " ".join(str(row[column]) for column in (
        'true_table_page_title', 'true_table_page_summary', 'true_table_page_keywords'))
    false_table_catch_all = " ".join(str(row[column]) for column in (
        'false_table_page_title', 'false_table_page_summary', 'false_table_page_keywords'))

    validation_article_page_title.append(create_embedding(article_catch_all))
    validation_true_table_page_title.append(create_embedding(true_table_catch_all))
    validation_false_table_page_title.append(create_embedding(false_table_catch_all))

In [None]:
# Stack the per-row matrices into single float16 arrays, one input at a time so
# each Python list can be released before the next array is built.
validation_article_page_title = np.array(validation_article_page_title, dtype=np.float16)
validation_true_table_page_title = np.array(validation_true_table_page_title, dtype=np.float16)
validation_false_table_page_title = np.array(validation_false_table_page_title, dtype=np.float16)

In [None]:
def triplet_loss(y_true, y_pred, alpha = 0.5):
    """Cosine-distance triplet (hinge) loss over concatenated encodings.

    ``y_pred`` is expected to hold three equally wide encodings side by side:
    ``[anchor | positive | negative]``. The loss pushes the anchor to be closer
    (in cosine distance) to the positive than to the negative by at least
    ``alpha``.

    Parameters
    ----------
    y_true : tensor
        Ignored; present only because Keras passes targets to every loss.
    y_pred : tensor, shape (batch, 3 * dim)
        Concatenated anchor, positive and negative encodings.
    alpha : float, default 0.5
        Margin required between the positive and negative distances.

    Returns
    -------
    tensor
        ``max(pos_dist - neg_dist + alpha, 0)`` for each sample.
    """
    # Derive the width of one encoding from the prediction itself instead of
    # hard-coding it, so the loss keeps working if the encoder size changes.
    dim = y_pred.shape[-1] // 3

    anchor = y_pred[:, 0:dim]
    positive = y_pred[:, dim:2 * dim]
    negative = y_pred[:, 2 * dim:3 * dim]

    # With normalize=True the dot product is the cosine similarity of the rows.
    pos_sim = tf.keras.layers.Dot(axes=1,normalize=True)([anchor, positive])
    neg_sim = tf.keras.layers.Dot(axes=1,normalize=True)([anchor, negative])

    # Cosine distance = 1 - cosine similarity.
    pos_dist = 1 - pos_sim
    neg_dist = 1 - neg_sim

    # Hinge: only penalise triplets that violate the margin.
    basic_loss = pos_dist - neg_dist + alpha
    loss = tf.keras.backend.maximum(basic_loss,0.0)

    return loss

In [None]:
def shared_encoder_model_title(max_len=None, embedding_dim=50, units=512):
    """Build the triplet encoder: one shared BiGRU applied to three inputs.

    The same bidirectional GRU (shared weights) encodes the article, the
    matching table and the non-matching table. The three encodings are
    concatenated along the last axis so a single loss function can slice them
    apart again; the loss must split the output into the same three parts.

    Parameters
    ----------
    max_len : int, optional
        Number of token rows per input. Defaults to ``MAX_PAD`` so the model
        always matches the padding used when the embeddings were built.
    embedding_dim : int, default 50
        Width of each token vector.
    units : int, default 512
        Hidden units of the GRU in each direction.

    Returns
    -------
    tf.keras.Model
        Uncompiled model with inputs ``[article, true_table, false_table]`` and
        the concatenated encodings as its single output.
    """
    if max_len is None:
        max_len = MAX_PAD

    input_shape = (max_len, embedding_dim)

    article_page_title = tf.keras.Input(shape=input_shape, dtype='float32')
    true_table_page_title = tf.keras.Input(shape=input_shape, dtype='float32')
    false_table_page_title = tf.keras.Input(shape=input_shape, dtype='float32')

    # A single layer instance is reused so all three inputs share weights.
    context_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units))

    # context vectors
    context_article_page_title = context_layer(article_page_title)
    context_true_table_page_title = context_layer(true_table_page_title)
    context_false_table_page_title = context_layer(false_table_page_title)

    concatenated = tf.keras.layers.Concatenate(axis=-1)([context_article_page_title, context_true_table_page_title, context_false_table_page_title])

    model = tf.keras.Model(inputs=[article_page_title,true_table_page_title,false_table_page_title],outputs=concatenated)

    return model

In [None]:
# Instantiate the shared-encoder triplet model (uncompiled at this point).
model = shared_encoder_model_title()

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train with Adam against the custom triplet loss defined above.
model.compile(
    optimizer="adam",
    loss=triplet_loss,
)

In [None]:
# Checkpoint file name template; Keras fills in the epoch number and validation loss.
filepath="encoder_cosine_all_signals_{epoch:02d}_{val_loss:.4f}.h5"

# Save the model only when the validation loss improves (lower is better).
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Fit on the (article, true table, false table) triples. The targets are
# placeholders because the triplet loss is computed from the predictions alone.
history = model.fit(
    x=[train_article_page_title, train_true_table_page_title, train_false_table_page_title],
    y=Y_train_dummy,
    batch_size=32,
    epochs=20,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(
        [validation_article_page_title, validation_true_table_page_title, validation_false_table_page_title],
        Y_validation_dummy,
    ),
)