In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings('ignore')
import fasttext

In [2]:
train_data = pd.read_csv('../train_data/train_true_false_pairs_encoders.csv', delimiter=',')

In [4]:
embedding_model = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

In [5]:
MAX_PAD = 31

def sequence_padding(X_DIM, value):
    
    value_padding = np.pad(value, ((0,MAX_PAD - X_DIM),(0,0)), 'constant')
    
    return value_padding

In [6]:
def create_embedding(value):

    value = tknzr.tokenize(str(value))
    
    if len(value) < MAX_PAD:
        
        embedding = embedding_model.wv[value]
        embedding = embedding.astype('float16')
        
        padding_embedding = sequence_padding(embedding.shape[0],embedding)
        
        return padding_embedding
        
    else:
        
        embedding = embedding_model.wv[value[0:MAX_PAD]]
        embedding = embedding.astype('float16')
        
        return embedding

In [9]:
article_page_title = []
true_table_page_title = []
false_table_page_title = []

for i,row in tqdm(train_data.iterrows()):
    
    article_page_title.append(create_embedding(row['article_page_title']))
    true_table_page_title.append(create_embedding(row['true_table_page_title']))
    false_table_page_title.append(create_embedding(row['false_table_page_title']))
   

94167it [00:37, 2537.55it/s]


In [10]:
article_page_title = np.array(article_page_title,dtype='float16')
true_table_page_title = np.array(true_table_page_title,dtype='float16')
false_table_page_title = np.array(false_table_page_title,dtype='float16')

In [19]:
def encoder_model_title():
    
    article_page_title = tf.keras.Input(shape=(31,50), dtype='float32')
    true_table_page_title = tf.keras.Input(shape=(31,50), dtype='float32')
    false_table_page_title = tf.keras.Input(shape=(31,50), dtype='float32')
    
    context_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))
    
    #context vectors
    context_article_page_title = context_layer(article_page_title)
    context_true_table_page_title = context_layer(true_table_page_title)
    context_false_table_page_title = context_layer(false_table_page_title)
    
    
    #similarity article to true
    similaity_article_true_table = tf.keras.layers.Dot(axes=1,normalize=True)([context_article_page_title, context_true_table_page_title])
    #similarity article to false
    similaity_article_false_table = tf.keras.layers.Dot(axes=1,normalize=True)([context_article_page_title, context_false_table_page_title])
    
    model = tf.keras.Model(inputs=[article_page_title,true_table_page_title,false_table_page_title],outputs=similaity_article_false_table)
    
    return model

In [20]:
model = encoder_model_title()

In [21]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 31, 50)]     0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 31, 50)]     0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 64)           16128       input_10[0][0]                   
                                                                 input_12[0][0]                   
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 31, 50)]     0                                      

In [None]:
model.compile(loss="binary_crossentropy",optimizer="adam", metrics=['accuracy'])

In [None]:
filepath="best_model_title_affinity_{epoch:02d}_{val_accuracy:.4f}.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_accuracy', verbose=2, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
history = model.fit([train_article_title, train_table_title], train_label, 
          epochs=20, 
          batch_size=32,
          verbose=2,
          validation_data=([validation_article_title, validation_table_title], validation_label),
          callbacks=callbacks_list)