In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import gensim as gs
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score

In [2]:
# Training pairs. The file carries no header row (header=None), so columns are
# addressed by position; the whole frame is handed on as a plain NumPy array.
read_file = pd.read_csv('dataset/new_train_dataset_1_5', sep=',', header=None)
train_dataset = read_file.values

In [3]:
# Validation pairs, read the same way as the training file: no header row,
# comma-separated, converted to a plain NumPy array.
read_file = pd.read_csv('dataset/new_validation_dataset_1_5', sep=',', header=None)
validation_dataset = read_file.values

In [17]:
def sequence_padding(X_DIM, value, max_len=9):
    """Zero-pad (or truncate) a 2-D array along axis 0 to exactly ``max_len`` rows.

    Parameters
    ----------
    X_DIM : int
        Number of rows currently in ``value`` (callers pass ``value.shape[0]``).
    value : array-like, 2-D
        Matrix with one row per token; the second axis is left untouched.
    max_len : int, default 9
        Number of rows in the result. The default keeps the behaviour of the
        original two-argument call.

    Returns
    -------
    numpy.ndarray
        Array with ``max_len`` rows. Missing rows are appended at the end and
        filled with zeros; surplus rows are dropped from the end.
    """
    missing_rows = max_len - X_DIM

    if missing_rows < 0:
        # np.pad rejects negative widths, so an over-long input is truncated
        # to its leading rows instead of raising.
        return np.asarray(value)[:max_len].copy()

    return np.pad(value, ((0, missing_rows), (0, 0)), 'constant')

In [5]:
# Load the pre-trained Facebook fastText binary.
# FastText.load_fasttext_format() is deprecated (and no longer available in
# gensim 4.x); load_facebook_model() is its replacement and still returns a
# full model whose `.wv` attribute provides the word vectors used below.
embedding_model = gs.models.fasttext.load_facebook_model('pre_trained_models/cc.en.300.bin')

  """Entry point for launching an IPython kernel.
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [18]:
left_train = []
rigth_train = []
label = []

# Every row unpacks to (articleID, articleTitle, tableID, tableTitle, match).
for articleID, articleTitle, tableID, tableTitle, match in tqdm(train_dataset):

    # The article title feeds the left branch, the table title the right one;
    # both go through exactly the same tokenise -> embed -> pad pipeline.
    for title, bucket in ((articleTitle, left_train), (tableTitle, rigth_train)):

        # Keep at most the first 9 tokens (slicing a shorter list changes nothing).
        tokens = tknzr.tokenize(str(title))[:9]
        vectors = embedding_model.wv[tokens]

        # Zero-pad up to 9 rows; a title that already has 9 rows gets no padding.
        bucket.append(sequence_padding(vectors.shape[0], vectors))

    label.append(match)

100%|██████████| 316183/316183 [00:54<00:00, 5812.17it/s]


In [19]:
# Turn the three accumulated Python lists into NumPy arrays in one step.
left_train, rigth_train, label = map(np.array, (left_train, rigth_train, label))

In [20]:
left_validation = []
rigth_validation = []
label_validation = []

# Every row unpacks to (articleID, articleTitle, tableID, tableTitle, match).
for articleID, articleTitle, tableID, tableTitle, match in tqdm(validation_dataset):

    # The article title feeds the left branch, the table title the right one;
    # both go through exactly the same tokenise -> embed -> pad pipeline.
    for title, bucket in ((articleTitle, left_validation), (tableTitle, rigth_validation)):

        # Keep at most the first 9 tokens (slicing a shorter list changes nothing).
        tokens = tknzr.tokenize(str(title))[:9]
        vectors = embedding_model.wv[tokens]

        # Zero-pad up to 9 rows; a title that already has 9 rows gets no padding.
        bucket.append(sequence_padding(vectors.shape[0], vectors))

    label_validation.append(match)

100%|██████████| 16623/16623 [00:02<00:00, 5989.98it/s]


In [21]:
# Turn the three accumulated Python lists into NumPy arrays in one step.
left_validation, rigth_validation, label_validation = map(
    np.array, (left_validation, rigth_validation, label_validation)
)

In [23]:
def siamese_model(input_shape):
    """Build a two-input Keras model that scores a pair of sequences.

    Both inputs are passed through the *same* encoder instance (three stacked
    GRU + BatchNormalization stages, flattened and projected to 64 sigmoid
    units). The two encodings are compared with a normalized dot product and
    the result is fed to a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, without the batch dimension.

    Returns
    -------
    tf.keras.Model
        Uncompiled model taking ``[left, right]`` and producing one value per pair.
    """
    left = tf.keras.Input(input_shape)
    right = tf.keras.Input(input_shape)

    encoder = tf.keras.Sequential()

    # Three recurrent stages of growing width; only the first one declares the
    # input shape. Each stage returns the full sequence and is batch-normalized.
    for position, units in enumerate((16, 32, 64)):
        first_stage_kwargs = {'input_shape': input_shape} if position == 0 else {}
        encoder.add(
            tf.keras.layers.GRU(
                units,
                activation='relu',
                dropout=0.2,
                return_sequences=True,
                **first_stage_kwargs
            )
        )
        encoder.add(tf.keras.layers.BatchNormalization())

    encoder.add(tf.keras.layers.Flatten())
    encoder.add(tf.keras.layers.Dense(64, activation='sigmoid'))

    # Calling the same encoder object on both inputs makes the branches share weights.
    left_encoded = encoder(left)
    right_encoded = encoder(right)

    # normalize=True L2-normalizes both vectors first, so this is a cosine similarity.
    similarity = tf.keras.layers.dot([left_encoded, right_encoded], axes=1, normalize=True)

    prediction = tf.keras.layers.Dense(1, activation='sigmoid')(similarity)

    return tf.keras.Model(inputs=[left, right], outputs=prediction)

In [24]:
# 9 rows per title (the padded/truncated token count) x 300 values per token
# (presumably the fastText vector size -- confirm against the loaded model).
model = siamese_model(input_shape=(9, 300))

In [25]:
# Binary target (match / no match): cross-entropy loss, Adam optimizer,
# accuracy reported alongside the loss.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [26]:
# Fit on the training pairs and evaluate on the validation pairs after each epoch.
history = model.fit(
    x=[left_train, rigth_train],
    y=label,
    batch_size=32,
    epochs=10,
    validation_data=([left_validation, rigth_validation], label_validation),
)

Train on 316183 samples, validate on 16623 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Persist the trained network to a single HDF5 file in the working directory.
model.save(filepath='model_siamese_gru.h5')

In [None]:
# # summarize history for accuracy
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

In [None]:
# # summarize history for loss
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'validation'], loc='upper left')
# plt.show()

In [None]:
# y_prob = model.predict([left_test,rigth_test])

In [None]:
# y_prob[y_prob >= 0.5] = 1

In [None]:
# y_prob[y_prob < 0.5] = 0

In [None]:
# accuracy_score(label_test, y_prob)