In [1]:
# Author: Zhengxiang (Jack) Wang 
# Date: 2022-01-19
# GitHub: https://github.com/jaaack-wang 

## Get TensorFlow

In case you have not installed TensorFlow, uncomment and run the following cell.

In [2]:
# # Requires the latest pip
# !pip3 install --upgrade pip
# !pip3 install tensorflow

## Get Data

In case you have not run `1 - get_data.ipynb`, uncomment and run the following cell.

In [3]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
# import get_data

# get_data.get_quora_data()

## Preprocess and numericalize text data

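The following cell loads and numericalizes the data (see `2 - preprocess_data.ipynb` for details). It must be run here, since the rest of this notebook relies on the variables it defines.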

In [4]:
# Import the helpers explicitly so it is clear which names come from utils.py
# (build_batches is used by the batching cell further down).
from utils import (
    TextVectorizer,
    build_batches,
    encode_dataset,
    gather_text,
    load_dataset,
    tokenize,
)

# ---- load dataset ----
train_set, dev_set, test_set = load_dataset(['train.txt', 'dev.txt', 'test.txt'])

# ---- numericalize the datasets ----
# The vocabulary is built from the train set only; the dev and test sets
# are encoded with that same encoder.
V = TextVectorizer(tokenize)
text = gather_text(train_set)  # collect the texts from the train set
V.build_vocab(text)  # build the vocab_to_idx / idx_to_vocab dictionaries and the text_encoder
train_set_encoded = list(encode_dataset(train_set, encoder=V.text_encoder))  # encode the train set
dev_set_encoded = list(encode_dataset(dev_set, encoder=V.text_encoder))  # encode the dev set for validation
test_set_encoded = list(encode_dataset(test_set, encoder=V.text_encoder))  # encode the test set for evaluation

Two vocabulary dictionaries have been built!
Please call [1mX.vocab_to_idx | X.idx_to_vocab[0m to find out more where [X] stands for the name you used for this TextVectorizer class.


### A note

There are multiple ways to train a model with TensorFlow, but the easiest one is to use the `fit` method. When calling `fit`, the `inputs` and `targets` (labels) from the train set are passed as separate arguments, whereas the `inputs` and `targets` from the dev set are passed together as a list or tuple (via `validation_data`). Neither the `inputs` nor the `targets` should be batched beforehand, because `fit` has a built-in `batch_size` parameter that creates mini batches for us. Nevertheless, to stay consistent with the other tutorials, this tutorial still uses the `build_batches` function that we will build together later in other tutorials. `build_batches` normalizes the text sequence length, which TensorFlow's `fit` method does not do for us. In the cell below, we therefore build large batches with `build_batches` and pass only the first batch of each split to `fit` and `evaluate`; `fit` then creates the actual mini batches from it.

A better way to preprocess and numericalize text data with packages from the TensorFlow ecosystem will be introduced separately, just as I intend to do for the other two deep learning frameworks.

In [5]:
# ---- build batches for the train, dev and test sets ----
# build_batches is used here to normalize the sequence length within a batch;
# the (split, batch_size) pairs are processed in train -> dev -> test order.
train_set_batched, dev_set_batched, test_set_batched = (
    build_batches(encoded_split, batch_size=size, include_seq_len=False)
    for encoded_split, size in (
        (train_set_encoded, 3000),
        (dev_set_encoded, 1000),
        (test_set_encoded, 1000),
    )
)

# Only the FIRST batch of every split is kept, i.e. at most 3000 train,
# 1000 dev and 1000 test examples are used in the rest of the notebook.
(
    (train_X1, train_X2, train_Y),
    (dev_X1, dev_X2, dev_Y),
    (test_X1, test_X2, test_Y),
) = (batches[0] for batches in (train_set_batched, dev_set_batched, test_set_batched))

## Training and evaluating models 

### BoW (Bag of Words) model

#### Training

In [6]:
from tf_models.BoW import BoW
from tensorflow import keras

In [7]:
model = BoW(len(V.vocab_to_idx), 1)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit([train_X1, train_X2], train_Y, epochs=5, batch_size=64, validation_data=([dev_X1, dev_X2], dev_Y))

2022-01-19 23:47:39.949779: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-19 23:47:40.685039: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 6.15 s, sys: 999 ms, total: 7.15 s
Wall time: 3.21 s


<tensorflow.python.keras.callbacks.History at 0x7fe6135389a0>

#### Evaluation on the test set

In [8]:
# Score the trained BoW model on the held-out test batch; the result is [loss, accuracy].
model.evaluate(x=[test_X1, test_X2], y=test_Y)



[0.6093695759773254, 0.6679999828338623]

### CNN (Convolutional Neural Network) model

#### Training

In [9]:
from tf_models.CNN import CNN

model = CNN(len(V.vocab_to_idx), 1)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit([train_X1, train_X2], train_Y, epochs=5, batch_size=64, validation_data=([dev_X1, dev_X2], dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 28.3 s, sys: 7.88 s, total: 36.2 s
Wall time: 7.77 s


<tensorflow.python.keras.callbacks.History at 0x7fe5f671ff70>

#### Evaluation on the test set

In [10]:
# Score the trained CNN model on the held-out test batch; the result is [loss, accuracy].
model.evaluate(x=[test_X1, test_X2], y=test_Y)



[0.9565356969833374, 0.6190000176429749]

## RNN (Recurrent neural network) models

### Simple RNN model

#### Training

In [11]:
from tf_models.S_RNN import SimpleRNN

model = SimpleRNN(len(V.vocab_to_idx), 1, bidirectional=False)

model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit([train_X1, train_X2], train_Y, epochs=5, batch_size=64, validation_data=([dev_X1, dev_X2], dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 33.2 s, sys: 9.99 s, total: 43.2 s
Wall time: 10.4 s


<tensorflow.python.keras.callbacks.History at 0x7fe5f68e4130>

#### Evaluation on the test set

In [12]:
# Score the trained simple RNN model on the held-out test batch; the result is [loss, accuracy].
model.evaluate(x=[test_X1, test_X2], y=test_Y)



[1.221795916557312, 0.6190000176429749]

### GRU (Gated recurrent units) model 

#### Training

In [13]:
from tf_models.GRU import GRU

model = GRU(len(V.vocab_to_idx), 1, bidirectional=False)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit([train_X1, train_X2], train_Y, epochs=5, batch_size=64, validation_data=([dev_X1, dev_X2], dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 15s, sys: 19.8 s, total: 1min 34s
Wall time: 22.4 s


<tensorflow.python.keras.callbacks.History at 0x7fe5f7b10580>

#### Evaluation on the test set

In [14]:
# Score the trained GRU model on the held-out test batch; the result is [loss, accuracy].
model.evaluate(x=[test_X1, test_X2], y=test_Y)



[1.3175928592681885, 0.6110000014305115]

### LSTM (Long short-term memory) model

#### Training

In [15]:
from tf_models.LSTM import LSTM

model = LSTM(len(V.vocab_to_idx), 1, bidirectional=False)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit([train_X1, train_X2], train_Y, epochs=5, batch_size=64, validation_data=([dev_X1, dev_X2], dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 33s, sys: 26.3 s, total: 1min 59s
Wall time: 28.2 s


<tensorflow.python.keras.callbacks.History at 0x7fe6004d1670>

#### Evaluation on the test set

In [16]:
# Score the trained LSTM model on the held-out test batch; the result is [loss, accuracy].
model.evaluate(x=[test_X1, test_X2], y=test_Y)



[1.0836219787597656, 0.6570000052452087]