In [1]:
# Author: Zhengxiang (Jack) Wang 
# Date: 2022-01-19
# GitHub: https://github.com/jaaack-wang 

## Get paddle

In [2]:
# !python3 -m pip install paddlepaddle
# #pip3 install paddlepaddle

## Preprocess and numericalize text data

In [3]:
from utils import *
import jieba  # ---> tokenizer for Chinese

# ---- load the three dataset splits ----
train, dev, test = load_dataset(['train.tsv', 'dev.tsv', 'test.tsv'])

# ---- fit the vectorizer on the training text only ----
V = TextVectorizer(jieba.lcut)  # jieba.lcut is the tokenizer used for Chinese text
text = gather_text(train)  # collect the texts of the train set
V.build_vocab(text)  # builds the vocab_to_idx mapping and the text encoder

# ---- encode every split with the fitted vectorizer ----
# Order matters only for readability: train (fitting), dev (validation),
# test (final evaluation).
train_encoded, dev_encoded, test_encoded = (
    list(encode_dataset(split, encoder=V)) for split in (train, dev, test)
)

# ---- build mini batches for the train, dev and test sets ----
# include_seq_len=False: these batches are the ones fed to the BoW and CNN
# models below.
train_batched, dev_batched, test_batched = (
    build_batches(encoded, batch_size=64,
                  max_seq_len=128, include_seq_len=False)
    for encoded in (train_encoded, dev_encoded, test_encoded)
)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w9/d_nplhzj4qx35xxlgljgdtjh0000gn/T/jieba.cache
Loading model cost 0.995 seconds.
Prefix dict has been built successfully.


Two vocabulary dictionaries have been built!
Please call [1mX.vocab_to_idx | X.idx_to_vocab[0m to find out more where [X] stands for the name you used for this TextVectorizer class.


## Training and evaluating models

In [4]:
import paddle 

def get_model(model, learning_rate=5e-4):
    """Wrap a network in ``paddle.Model`` and configure it for training.

    Args:
        model: The network to wrap; it is passed straight to ``paddle.Model``.
        learning_rate: Learning rate handed to the Adam optimizer. Defaults
            to 5e-4, the value that was previously hard-coded here.

    Returns:
        The ``paddle.Model`` wrapper, already prepared with an Adam optimizer,
        a cross-entropy loss and an accuracy metric.
    """
    # Keep the wrapper under its own name rather than rebinding the `model`
    # argument, so the raw network and its wrapper are never confused.
    wrapped = paddle.Model(model)
    optimizer = paddle.optimizer.Adam(
        parameters=wrapped.parameters(), learning_rate=learning_rate)
    criterion = paddle.nn.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()
    wrapped.prepare(optimizer, criterion, metric)
    return wrapped

## BoW (Bag of Words)

In [5]:
from paddle_models.BoW import BoW

In [6]:
model = BoW(len(V.vocab_to_idx), 2)
model = get_model(model)
%time model.fit(train_batched, dev_batched, epochs=5, verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5
step 10/63 [===>..........................] - loss: 0.6363 - acc: 0.6109 - ETA: 0s - 15ms/step

  return (isinstance(seq, collections.Sequence) and


Eval begin...
Eval samples: 1200
Epoch 2/5
Eval begin...
Eval samples: 1200
Epoch 3/5
Eval begin...
Eval samples: 1200
Epoch 4/5
Eval begin...
Eval samples: 1200
Epoch 5/5
Eval begin...
Eval samples: 1200
CPU times: user 3.82 s, sys: 94.7 ms, total: 3.91 s
Wall time: 3.98 s


In [7]:
# Evaluate the trained BoW model on the test batches (reports loss and accuracy).
model.evaluate(test_batched)

Eval begin...
step 10/19 - loss: 0.2706 - acc: 0.8922 - 3ms/step
step 19/19 - loss: 0.3059 - acc: 0.8775 - 3ms/step
Eval samples: 1200


{'loss': [0.30594957], 'acc': 0.8775}

## CNN (Convolutional Neural Network)

In [8]:
from paddle_models.CNN import CNN

In [9]:
model = CNN(len(V.vocab_to_idx), 2)
model = get_model(model)
%time model.fit(train_batched, dev_batched, epochs=5, verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5
Eval begin...
Eval samples: 1200
Epoch 2/5
Eval begin...
Eval samples: 1200
Epoch 3/5
Eval begin...
Eval samples: 1200
Epoch 4/5
Eval begin...
Eval samples: 1200
Epoch 5/5
Eval begin...
Eval samples: 1200
CPU times: user 57.1 s, sys: 683 ms, total: 57.8 s
Wall time: 58.7 s


In [10]:
# Evaluate the trained CNN model on the test batches (reports loss and accuracy).
model.evaluate(test_batched)

Eval begin...
step 10/19 - loss: 0.3241 - acc: 0.9078 - 61ms/step
step 19/19 - loss: 0.3437 - acc: 0.8892 - 59ms/step
Eval samples: 1200


{'loss': [0.34374633], 'acc': 0.8891666666666667}

## RNN (Recurrent neural network)

Because the RNN models also take the sequence length as an input, we need to rebuild the mini-batches for the train, dev, and test sets with `include_seq_len=True`. The already-encoded datasets are reused as they are.

In [11]:
# ---- rebuild mini batches for the train, dev and test sets ----
# Same batch size and max length as before, but with include_seq_len=True.
# NOTE: this rebinds train_batched / dev_batched / test_batched, so the
# earlier batches built with include_seq_len=False are no longer available
# after this cell runs.
train_batched, dev_batched, test_batched = (
    build_batches(encoded, batch_size=64,
                  max_seq_len=128, include_seq_len=True)
    for encoded in (train_encoded, dev_encoded, test_encoded)
)

## SimpleRNN

In [12]:
from paddle_models.S_RNN import SimpleRNN

In [13]:
model = SimpleRNN(len(V.vocab_to_idx), 2, bidirectional=True)
model = get_model(model)
%time model.fit(train_batched, dev_batched, epochs=5, verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5
Eval begin...
Eval samples: 1200
Epoch 2/5
Eval begin...
Eval samples: 1200
Epoch 3/5
Eval begin...
Eval samples: 1200
Epoch 4/5
Eval begin...
Eval samples: 1200
Epoch 5/5
Eval begin...
Eval samples: 1200
CPU times: user 31.3 s, sys: 555 ms, total: 31.8 s
Wall time: 32.3 s


In [14]:
# Evaluate the trained SimpleRNN model on the test batches (reports loss and accuracy).
model.evaluate(test_batched)

Eval begin...
step 10/19 - loss: 0.5750 - acc: 0.7734 - 41ms/step
step 19/19 - loss: 0.4688 - acc: 0.7792 - 41ms/step
Eval samples: 1200


{'loss': [0.46877885], 'acc': 0.7791666666666667}

## LSTM (Long short-term memory)

In [15]:
from paddle_models.LSTM import LSTM

In [16]:
model = LSTM(len(V.vocab_to_idx), 2, bidirectional=True)
model = get_model(model)
%time model.fit(train_batched, dev_batched, epochs=5, verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5
Eval begin...
Eval samples: 1200
Epoch 2/5
Eval begin...
Eval samples: 1200
Epoch 3/5
Eval begin...
Eval samples: 1200
Epoch 4/5
Eval begin...
Eval samples: 1200
Epoch 5/5
Eval begin...
Eval samples: 1200
CPU times: user 1min 41s, sys: 1.3 s, total: 1min 42s
Wall time: 1min 43s


In [17]:
# Evaluate the trained LSTM model on the test batches (reports loss and accuracy).
model.evaluate(test_batched)

Eval begin...
step 10/19 - loss: 0.6178 - acc: 0.8812 - 111ms/step
step 19/19 - loss: 0.2819 - acc: 0.8875 - 109ms/step
Eval samples: 1200


{'loss': [0.2819071], 'acc': 0.8875}

## GRU (Gated Recurrent Unit)

In [18]:
from paddle_models.GRU import GRU

In [19]:
model = GRU(len(V.vocab_to_idx), 2, bidirectional=True)
model = get_model(model)
%time model.fit(train_batched, dev_batched, epochs=5, verbose=1)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5
Eval begin...
Eval samples: 1200
Epoch 2/5
Eval begin...
Eval samples: 1200
Epoch 3/5
Eval begin...
Eval samples: 1200
Epoch 4/5
Eval begin...
Eval samples: 1200
Epoch 5/5
Eval begin...
Eval samples: 1200
CPU times: user 1min 19s, sys: 1.19 s, total: 1min 20s
Wall time: 1min 21s


In [20]:
# Evaluate the trained GRU model on the test batches (reports loss and accuracy).
model.evaluate(test_batched)

Eval begin...
step 10/19 - loss: 0.6577 - acc: 0.8719 - 96ms/step
step 19/19 - loss: 0.2988 - acc: 0.8700 - 93ms/step
Eval samples: 1200


{'loss': [0.2987765], 'acc': 0.87}