In [1]:
# Author: Zhengxiang (Jack) Wang 
# Date: 2022-01-19
# GitHub: https://github.com/jaaack-wang 

## Get TensorFlow

If you have not installed TensorFlow yet, uncomment and run the following cell.

In [2]:
# Optional setup: uncomment the two commands below to install TensorFlow.
# (Inside Jupyter, `%pip install ...` installs into the running kernel's environment.)
# # Requires the latest pip
# !pip3 install --upgrade pip
# !pip3 install tensorflow

## Preprocess and numericalize text data

In [3]:
from utils import *
import jieba  # ---> tokenizer for Chinese

# ---- load the three dataset splits ----
train, dev, test = load_dataset(['train.tsv', 'dev.tsv', 'test.tsv'])

# ---- build the vocabulary from the train split only ----
V = TextVectorizer(jieba.lcut)  # jieba.lcut is handed over as the tokenizer
text = gather_text(train)       # texts collected from the train set
V.build_vocab(text)             # builds the vocab_to_idx / idx_to_vocab dictionaries

# ---- encode every split with the same vectorizer ----
# train -> fitting, dev -> validation, test -> final evaluation/prediction
train_encoded, dev_encoded, test_encoded = [
    list(encode_dataset(split, encoder=V)) for split in (train, dev, test)
]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w9/d_nplhzj4qx35xxlgljgdtjh0000gn/T/jieba.cache
Loading model cost 0.705 seconds.
Prefix dict has been built successfully.


Two vocabulary dictionaries have been built!
Please call [1mX.vocab_to_idx | X.idx_to_vocab[0m to find out more where [X] stands for the name you used for this TextVectorizer class.


### A note

There are multiple ways to train a model with TensorFlow, but the easiest one is to use the `fit` method. When calling `fit`, the `inputs` and `targets` (labels) from the train set are passed as separate arguments, whereas the `inputs` and `targets` from the dev set are passed together inside a list or tuple. Neither the `inputs` nor the `targets` should be batched beforehand, because the built-in `batch_size` parameter creates the mini batches for us. Nevertheless, to stay consistent with the other tutorials, this tutorial still uses the `build_batches` function that we will build together later in other tutorials. Here, `build_batches` helps normalize the text sequence length, which TensorFlow's `fit` method does not do.

A better way to preprocess and numericalize text data with packages from the TensorFlow ecosystem will be introduced separately, just as I intended to do for the other two deep learning frameworks.

In [4]:
# ---- pack each split into ONE batch of (inputs, targets) ----
# Keras' `fit`/`evaluate` create their own mini batches, so `build_batches` is
# used here only to normalize the sequence length. Each split is packed into a
# single batch sized to the split itself: with a fixed size such as 10000,
# every example past the first batch would be silently dropped, because only
# batch [0] is unpacked below.
MAX_SEQ_LEN = 128  # sequence length every example is normalized to

train_batched = build_batches(train_encoded, batch_size=len(train_encoded),
                              max_seq_len=MAX_SEQ_LEN, include_seq_len=False)

dev_batched = build_batches(dev_encoded, batch_size=len(dev_encoded),
                            max_seq_len=MAX_SEQ_LEN, include_seq_len=False)

test_batched = build_batches(test_encoded, batch_size=len(test_encoded),
                             max_seq_len=MAX_SEQ_LEN, include_seq_len=False)

# NOTE(review): assumes build_batches returns an indexable sequence of
# (inputs, targets) pairs and keeps a batch whose size equals the split size.
train_X, train_Y = train_batched[0]
dev_X, dev_Y = dev_batched[0]
test_X, test_Y = test_batched[0]

## Training and evaluating models

In [5]:
from tensorflow import keras

## BoW (Bag of Words)

In [6]:
from tf_models.BoW import BoW


model = BoW(len(V.vocab_to_idx), 1)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit(train_X, train_Y, epochs=5, batch_size=64, validation_data=(dev_X, dev_Y))

2022-01-19 23:56:48.788658: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-19 23:56:49.143742: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 17.4 s, sys: 2.64 s, total: 20 s
Wall time: 6.3 s


<tensorflow.python.keras.callbacks.History at 0x7fcc1ed45970>

In [7]:
# Score the trained BoW model on the test split; the result is [loss, accuracy].
model.evaluate(x=test_X, y=test_Y)



[0.28718623518943787, 0.8700000047683716]

## CNN (Convolutional Neural Network)

In [8]:
from tf_models.CNN import CNN

In [9]:
model = CNN(len(V.vocab_to_idx), 1)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit(train_X, train_Y, epochs=5, batch_size=64, validation_data=(dev_X, dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 49.6 s, sys: 14.5 s, total: 1min 4s
Wall time: 14 s


<tensorflow.python.keras.callbacks.History at 0x7fcc204f7d00>

In [10]:
# Score the trained CNN model on the test split; the result is [loss, accuracy].
model.evaluate(x=test_X, y=test_Y)



[0.2617317736148834, 0.8949999809265137]

## RNN (Recurrent Neural Network)

## SimpleRNN

In [11]:
from tf_models.S_RNN import SimpleRNN

In [12]:
model = SimpleRNN(len(V.vocab_to_idx), 1, bidirectional=False)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit(train_X, train_Y, epochs=5, batch_size=64, validation_data=(dev_X, dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 58.4 s, sys: 18 s, total: 1min 16s
Wall time: 19.9 s


<tensorflow.python.keras.callbacks.History at 0x7fcc1fd531c0>

In [13]:
# Score the trained SimpleRNN model on the test split; the result is [loss, accuracy].
model.evaluate(x=test_X, y=test_Y)



[0.4580153822898865, 0.8050000071525574]

## LSTM (Long Short-Term Memory)

In [14]:
from tf_models.LSTM import LSTM

In [15]:
model = LSTM(len(V.vocab_to_idx), 1, bidirectional=False)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit(train_X, train_Y, epochs=5, batch_size=64, validation_data=(dev_X, dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 2min 13s, sys: 40.7 s, total: 2min 54s
Wall time: 46.1 s


<tensorflow.python.keras.callbacks.History at 0x7fcc0b7adb80>

In [16]:
# Score the trained LSTM model on the test split; the result is [loss, accuracy].
model.evaluate(x=test_X, y=test_Y)



[0.544813334941864, 0.8683333396911621]

## GRU (Gated Recurrent Unit)

In [17]:
from tf_models.GRU import GRU

In [18]:
model = GRU(len(V.vocab_to_idx), 1, bidirectional=False)
model.compile(optimizer=keras.optimizers.Adam(5e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=[["accuracy"]])

%time model.fit(train_X, train_Y, epochs=5, batch_size=64, validation_data=(dev_X, dev_Y))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 49s, sys: 34.4 s, total: 2min 23s
Wall time: 37.5 s


<tensorflow.python.keras.callbacks.History at 0x7fcc0dc31730>

In [19]:
# Score the trained GRU model on the test split; the result is [loss, accuracy].
model.evaluate(x=test_X, y=test_Y)



[0.3990863561630249, 0.8608333468437195]