In [32]:
#IMPORTING NECESSARY LIBRARIES
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [33]:
# LOADING THE DATASET
df = pd.read_csv('preprocessed.csv')
labels = df['is_duplicate']
corpus = df.loc[:,['question1_non_ascii','question2_non_ascii']]
# CREATE TRAIN/VAL/TEST SETS
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.3, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=42)


In [34]:
# DEFINE PROCESSING DATA FUNCTION FOR FASTER TENSORFLOW LOADING
# WE ARE CREATING SIAMESE NETWORK. SIAMESE NETWORK NEED TWO DATA INPUTS.
# WE ARE CREATING SENTENCE PAIR FOR THE MODEL TO LEARN
def process_data(dataFrame, labels):
    # GETTING THE FIRST SENTENCE
    first_sentence = tf.gather(dataFrame, 0)
    # GETTING THE SECOND SENTENCE
    second_sentence = tf.gather(dataFrame, 1)
    # label = tf.convert_to_tensor(dataFrame['is_duplicate'])
    # RETURN THE PAIR WITH THE LABEL
    return (first_sentence, second_sentence), labels

In [36]:
batch = 512
# CREATING DATA PIPELINE FOR SIAMESE NETWORK FOR FASTER TRAINING IN TRAINING SET, VALIDATION SET, TEST SET
X_train_tf = tf.data.Dataset.from_tensor_slices((X_train, y_train))
X_train_tf = (X_train_tf
              .map(process_data, num_parallel_calls=tf.data.AUTOTUNE)
              .cache()
              .batch(batch)
              .prefetch(tf.data.AUTOTUNE))
X_val_tf = tf.data.Dataset.from_tensor_slices((X_val, y_val))
X_val_tf = (X_val_tf
            .map(process_data, num_parallel_calls=tf.data.AUTOTUNE)
            .cache()
            .batch(batch)
            .prefetch(tf.data.AUTOTUNE))

X_test_tf = tf.data.Dataset.from_tensor_slices((X_test, y_test))
X_test_tf = (X_test_tf
            .map(process_data, num_parallel_calls=tf.data.AUTOTUNE)
             .cache()
             .batch(batch)
             .prefetch(tf.data.AUTOTUNE))


In [37]:
# GETTING THE BERT PROCESS FUNCTION
bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
# GETTING THE BERT ENCODER FUNCTION FOR TEXT FEATURE EXTRACTION
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [38]:
# CREATE BERT EXTRACTOR LAYER
def bert_extractor(input_layer):
    preprocessed = bert_preprocess(input_layer)
    return bert_encoder(preprocessed)
# CREATE COSINE SIMILARITY DISTANCE CALCULATION LAYER
class cosine_dist_layer(tf.keras.layers.Layer):
    def __init__(self):
        super(cosine_dist_layer, self).__init__()

    def call(self, inputs):
        featsA, featsB = inputs
        A = tf.math.l2_normalize(featsA, axis = 1)
        B = tf.math.l2_normalize(featsB, axis = 1)
        return tf.math.maximum(tf.reduce_sum(tf.math.multiply(A, B), keepdims=True, axis=1), tf.keras.backend.epsilon())


In [39]:
# INITIALIZE ALL INPUTS FOR SIAMESE NETWORK
first_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="first_sentence")
second_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="second_sentence")
# EXTRACTING TEXT FEATURES FROM BERT ENCODER
bert_extr_first_sent = bert_extractor(first_input)
bert_extr_second_sent = bert_extractor(first_input)
# SETTING THE COSINE SIMILARITY FOR CALCULATING THE SIMILARITY BETWEEN TWO TEXT FEATURES
cosine_dist = cosine_dist_layer()

distLayer = cosine_dist([bert_extr_first_sent['pooled_output'], bert_extr_second_sent['pooled_output']])
# INITIALIZE THE OUTPUT TO LEARN THE DISTANCE VALUE
output = tf.keras.layers.Dense(1, activation="sigmoid")(distLayer)
# CREATE SIAMESE NETWORK
siamese_model = tf.keras.Model(inputs=[first_input, second_input], outputs=output)
# SET THE LOSS, OPTIMIIZER, AND METRICS
siamese_model.compile(loss="binary_crossentropy", optimizer="adam", metrics="accuracy")
# SET CHECKPOINT FOR SAVING THE DL MODEL.
checkpoint_filepath = '/home/thanhle/Downloads/feature_engineering_class_project/checkpoints/contrasitive_Model' \
                          '.hdf5'

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)

In [40]:
# GET THE LAYERS IN THE SIAMESE MODEL
siamese_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 first_sentence (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_mask': (Non  0           ['first_sentence[0][0]',         
                                e, 128),                          'first_sentence[0][0]']         
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [41]:
# TRAIN THE SIAMESE MODEL
siamese_model.fit(X_train_tf, validation_data=X_val_tf, epochs=2, callbacks=[model_checkpoint_callback])

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f29ca3b1070>

In [43]:
# GET THE MODEL PREDICTION
model_prediction_result = siamese_model.predict(X_test_tf)



In [54]:
# BECAUSE THE OUTPUT IS USING SIGMOID => THE PREDICTION IS IN THE RANGE FROM 0 TO 1  SO WE SET A THRESHOLD IF ANYTHING > 0.5 IT WILL BE LABELLED TO 1 AND <= 0.5 WILL BE LABELLED TO 0
prediction = (model_prediction_result > 0.5).astype(int)

In [77]:
# COUNTING ALL CORRECT PREDICTION
corrected = prediction == y_test.to_numpy().transpose()

In [81]:
# GETTING THE ACCURACY (CORRECTED / TOTAL SAMPLES)
np.count_nonzero(corrected[0]) / len(corrected[0])

0.6307961216278609