In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import gensim as gs
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
# Single tokenizer instance shared by every cell that splits titles into tokens.
tknzr = TweetTokenizer()

In [3]:
# Training pairs come from a header-less CSV, so columns are positional only.
read_file = pd.read_csv('articlesDataset/trainDatasetTitle', sep=',', header=None)
# Work on the raw ndarray so rows can be unpacked directly in the loops below.
train_dataset = read_file.values

In [4]:
# Validation pairs use the same header-less CSV layout as the training file.
read_file = pd.read_csv('articlesDataset/validationDatasetTitle', sep=',', header=None)
# Work on the raw ndarray so rows can be unpacked directly in the loops below.
validation_dataset = read_file.values

In [5]:
def sequence_padding(X_DIM, value, max_len=12):
    """Zero-pad (or truncate) a 2-D sequence to exactly ``max_len`` rows.

    Args:
        X_DIM: Number of rows currently held in ``value``.
        value: 2-D array-like of shape ``(X_DIM, features)``.
        max_len: Target number of rows. Defaults to 12, the length that was
            previously hard-coded, so existing two-argument calls are
            unaffected.

    Returns:
        A 2-D array with ``max_len`` rows. Short input gets zero rows
        appended at the end; input longer than ``max_len`` is cut down to
        its first ``max_len`` rows.
    """
    if X_DIM > max_len:
        # np.pad rejects negative pad widths, so over-long input is truncated
        # instead of raising.
        return np.asarray(value)[:max_len]

    # Pad only along the row axis; the feature axis is left untouched.
    value_padding = np.pad(value, ((0, max_len - X_DIM), (0, 0)), 'constant')

    return value_padding

In [6]:
corpus = []
max_length = []

def create_corpus():
    """Tokenize both titles of every training row.

    Fills two module-level lists in row order (title1 before title2):
    ``corpus`` receives one token list per title, and ``max_length``
    receives the token count of each title (despite its name it holds
    every length, not a maximum).
    """
    for rowID, title1, title2, match in train_dataset:
        for title in (title1, title2):
            # str() guards against non-string cells coming out of the CSV.
            tokens = tknzr.tokenize(str(title))
            corpus.append(tokens)
            max_length.append(len(tokens))

create_corpus()

In [7]:
# Standard deviation of the per-title token counts collected by create_corpus().
np.std(max_length)

6.385340074577648

In [None]:
# Load pre-trained word vectors from a word2vec-format text file
# (presumably 300-dimensional, judging by the file name and the (12, 300)
# model input used later - TODO confirm).
# NOTE(review): the following cells call build_vocab()/train() on this object,
# which looks like it belongs to the commented-out FastText variant below -
# confirm which loader is the intended one.
embedding_model = gs.models.KeyedVectors.load_word2vec_format('pre-trained-models/crawl-300d-2M.vec')
#embedding_model = gs.models.FastText(size=300, window=3, min_count=1)

#embedding_model = gs.models.FastText.load_fasttext_format('pre-trained-models/crawl-300d-2M.vec')

In [None]:
# build_vocab() exists only on trainable models (e.g. the commented-out
# FastText variant). Vectors loaded with load_word2vec_format are lookup-only,
# so the step is skipped for them instead of raising AttributeError.
if hasattr(embedding_model, 'build_vocab'):
    embedding_model.build_vocab(sentences=corpus)

In [None]:
# train() exists only on trainable models (e.g. the commented-out FastText
# variant). Vectors loaded with load_word2vec_format cannot be trained
# further, so the step is skipped for them instead of raising AttributeError.
if hasattr(embedding_model, 'train'):
    embedding_model.train(sentences=corpus, total_examples=len(corpus), epochs=100)

In [None]:
left_train = []
rigth_train = []
label = []

# Trainable gensim models keep their vectors under ``.wv``; a bare
# KeyedVectors object is itself the lookup table, so fall back to the object
# when the attribute is missing.
word_vectors = getattr(embedding_model, 'wv', embedding_model)

for rowID, title1, title2, match in tqdm(train_dataset):

    # Both titles of a pair go through the same steps; only the target list differs.
    for title, side in ((title1, left_train), (title2, rigth_train)):

        # Keep at most the first 12 tokens: 12 is the row count that
        # sequence_padding() pads to and that the model input expects.
        tokens = tknzr.tokenize(str(title))[:12]

        # Tokens missing from the vocabulary stay as zero rows (the same value
        # used for padding) instead of aborting the whole loop with KeyError.
        embedding = np.zeros((len(tokens), word_vectors.vector_size), dtype=np.float32)
        for position, token in enumerate(tokens):
            if token in word_vectors:
                embedding[position] = word_vectors[token]

        # Titles shorter than 12 tokens are zero-padded; full-length ones pass
        # through with a pad width of 0.
        side.append(sequence_padding(embedding.shape[0], embedding))

    label.append(match)

In [None]:
# Turn the per-row Python lists into ndarrays before handing them to Keras.
left_train, rigth_train, label = map(np.array, (left_train, rigth_train, label))

In [None]:
left_validation = []
rigth_validation = []
label_validation = []

# Trainable gensim models keep their vectors under ``.wv``; a bare
# KeyedVectors object is itself the lookup table, so fall back to the object
# when the attribute is missing.
word_vectors = getattr(embedding_model, 'wv', embedding_model)

for rowID, title1, title2, match in tqdm(validation_dataset):

    # Both titles of a pair go through the same steps; only the target list differs.
    for title, side in ((title1, left_validation), (title2, rigth_validation)):

        # Keep at most the first 12 tokens: 12 is the row count that
        # sequence_padding() pads to and that the model input expects.
        tokens = tknzr.tokenize(str(title))[:12]

        # Tokens missing from the vocabulary stay as zero rows (the same value
        # used for padding) instead of aborting the whole loop with KeyError.
        embedding = np.zeros((len(tokens), word_vectors.vector_size), dtype=np.float32)
        for position, token in enumerate(tokens):
            if token in word_vectors:
                embedding[position] = word_vectors[token]

        # Titles shorter than 12 tokens are zero-padded; full-length ones pass
        # through with a pad width of 0.
        side.append(sequence_padding(embedding.shape[0], embedding))

    label_validation.append(match)

In [None]:
# Turn the per-row Python lists into ndarrays before handing them to Keras.
left_validation, rigth_validation, label_validation = map(
    np.array, (left_validation, rigth_validation, label_validation)
)

In [None]:
def siamese_model(input_shape):
    """Build a siamese network that scores a pair of sequences.

    Both inputs are passed through one shared Conv1D encoder. The
    element-wise absolute difference of the two encodings feeds a single
    sigmoid unit.

    Args:
        input_shape: Shape of one input sample; forwarded to
            ``tf.keras.Input`` and to the first Conv1D layer.

    Returns:
        An uncompiled ``tf.keras.Model`` that takes ``[left, right]`` and
        outputs one sigmoid activation per pair.
    """
    layers = tf.keras.layers

    left = tf.keras.Input(input_shape)
    right = tf.keras.Input(input_shape)

    # Shared encoder: four Conv1D stages with batch normalization, one
    # max-pooling step after the last convolution, then a dense projection.
    encoder = tf.keras.Sequential([
        layers.Conv1D(16, kernel_size=3, activation='relu', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.Conv1D(32, kernel_size=3, activation='relu'),
        layers.BatchNormalization(),
        layers.Conv1D(64, kernel_size=3, activation='relu'),
        layers.BatchNormalization(),
        layers.Conv1D(128, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(),
        layers.BatchNormalization(),
        layers.Flatten(),
        layers.Dense(512, activation='sigmoid'),
    ])

    # Calling the same Sequential twice is what ties the weights of both branches.
    left_encoded = encoder(left)
    right_encoded = encoder(right)

    # Element-wise |left - right| of the two encodings.
    abs_difference = layers.Lambda(lambda pair: tf.keras.backend.abs(pair[0] - pair[1]))
    distance = abs_difference([left_encoded, right_encoded])

    prediction = layers.Dense(1, activation='sigmoid')(distance)

    return tf.keras.Model(inputs=[left, right], outputs=prediction)

In [None]:
# 12 rows per title (the pad/truncate length used when embedding the titles)
# by 300 features per token - assumes 300-d word vectors; TODO confirm.
model = siamese_model((12,300))

In [None]:
# Binary cross-entropy matches the single sigmoid output built in siamese_model().
model.compile(loss="binary_crossentropy",optimizer="adam", metrics=['accuracy'])

In [None]:
# Fit on the training pairs; the validation pairs are passed along so Keras
# records validation metrics in the returned history.
history = model.fit(
    x=[left_train, rigth_train],
    y=label,
    batch_size=100,
    epochs=100,
    validation_data=([left_validation, rigth_validation], label_validation),
)

In [None]:
# Show which metric names were recorded; the plotting cells index this dict by key.
history.history.keys()

In [None]:
# summarize history for accuracy
fig, ax = plt.subplots()
# Training curve first, validation second, matching the legend order below.
for series in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[series])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
fig, ax = plt.subplots()
# Training curve first, validation second, matching the legend order below.
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()