In [1]:
# Author: Zhengxiang (Jack) Wang 
# Date: 2022-01-19
# GitHub: https://github.com/jaaack-wang 

## Get PyTorch

In case you have not installed PyTorch, uncomment and run the following cell.

In [2]:
# Uncomment the line below to install PyTorch (inside Jupyter, `%pip install ...` targets the kernel's own environment).
# !pip3 install torch torchvision

## Preprocess and numericalize text data

In [3]:
# Import only the helpers this notebook actually uses (instead of
# `from utils import *`) so the origin of every name is explicit.
from utils import (
    TextVectorizer,
    build_batches,
    encode_dataset,
    gather_text,
    load_dataset,
)
import jieba  # ---> tokenizer for Chinese

# ---- load dataset ----
train, dev, test = load_dataset(['train.tsv', 'dev.tsv', 'test.tsv'])

# ---- numericalize the datasets ----
V = TextVectorizer(jieba.lcut)  # vectorizer that tokenizes with jieba
text = gather_text(train)  # collect the texts from the train set only
V.build_vocab(text)  # build the vocab_to_idx / idx_to_vocab mappings from those texts

train_encoded = list(encode_dataset(train, encoder=V))  # encode the train set
dev_encoded = list(encode_dataset(dev, encoder=V))  # encode the dev set for validation
test_encoded = list(encode_dataset(test, encoder=V))  # encode the test set for prediction

# ---- build mini batches for the train, dev and test sets ----
# One shared set of batching options keeps the three splits consistent.
# include_seq_len=False because the BoW and CNN cells below are run
# without sequence lengths.
batch_options = dict(batch_size=64, max_seq_len=128, include_seq_len=False)

train_batched = build_batches(train_encoded, **batch_options)
dev_batched = build_batches(dev_encoded, **batch_options)
test_batched = build_batches(test_encoded, **batch_options)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w9/d_nplhzj4qx35xxlgljgdtjh0000gn/T/jieba.cache
Loading model cost 1.043 seconds.
Prefix dict has been built successfully.


Two vocabulary dictionaries have been built!
Please call [1mX.vocab_to_idx | X.idx_to_vocab[0m to find out more where [X] stands for the name you used for this TextVectorizer class.


### Convert numpy arrays into tensors

It turns out that PyTorch models do not accept NumPy arrays during model training. The problem seems to be that an attribute of `torch.Tensor` is named differently in `numpy.ndarray`; `paddle`, by contrast, does not run into this problem. 

To maintain consistency, this tutorial does not change the functions we will build together in the later tutorials. A better way to preprocess and numericalize text data with packages from the PyTorch ecosystem will be introduced separately, just as I intend to do for the other two deep learning frameworks.

Likewise, `PyTorchUtils` is a wrapper class I wrote just to get this quick start going, and it will also be introduced later. Although this is not the best practice for using PyTorch, you will find it useful for appreciating the very nuanced differences between different deep learning frameworks.

In [4]:
from pytorch_utils import to_tensor

# Convert every batched split to tensors, in train -> dev -> test order.
# NOTE: this rebinds the three *_batched names, so the cell that builds the
# batches has to be re-run before running this cell a second time.
train_batched, dev_batched, test_batched = [
    to_tensor(batches)
    for batches in (train_batched, dev_batched, test_batched)
]

## Training and evaluating models 

In [5]:
from pytorch_utils import PyTorchUtils
import torch.optim as optim
import torch.nn as nn

### BoW (Bag of Words) 

In [6]:
from pytorch_models.BoW import BoW


model = BoW(len(V.vocab_to_idx), 2)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
PT = PyTorchUtils(model, optimizer, criterion, include_seq_len=False)
%time PT.train(train_batched, dev_batched, epochs=5)

Epoch 1/5 {'Train loss': '0.83154', 'Train accu': '33.28'}
Validation... {'Dev loss': '0.75056', 'Dev accu': '45.07'}

Epoch 2/5 {'Train loss': '0.63411', 'Train accu': '51.41'}
Validation... {'Dev loss': '0.70763', 'Dev accu': '51.34'}

Epoch 3/5 {'Train loss': '0.55664', 'Train accu': '60.86'}
Validation... {'Dev loss': '0.69626', 'Dev accu': '56.61'}

Epoch 4/5 {'Train loss': '0.49809', 'Train accu': '68.38'}
Validation... {'Dev loss': '0.69232', 'Dev accu': '60.88'}

Epoch 5/5 {'Train loss': '0.44770', 'Train accu': '73.71'}
Validation... {'Dev loss': '0.69498', 'Dev accu': '63.65'}

CPU times: user 10.4 s, sys: 1.3 s, total: 11.7 s
Wall time: 5.1 s


In [7]:
# Evaluate the model currently wrapped by PT (the BoW model when cells are run in order) on the test batches.
PT.evaluate(test_batched)

{'Test loss': '0.64784', 'Test accu': '65.68'}

### CNN (Convolutional Neural Network) 

In [8]:
from pytorch_models.CNN import CNN


model = CNN(len(V.vocab_to_idx), 2)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
PT = PyTorchUtils(model, optimizer, criterion, include_seq_len=False)
%time PT.train(train_batched, dev_batched, epochs=5)

Epoch 1/5 {'Train loss': '0.67456', 'Train accu': '46.83'}
Validation... {'Dev loss': '0.65148', 'Dev accu': '53.65'}

Epoch 2/5 {'Train loss': '0.55648', 'Train accu': '71.35'}
Validation... {'Dev loss': '0.52256', 'Dev accu': '72.64'}

Epoch 3/5 {'Train loss': '0.40462', 'Train accu': '82.96'}
Validation... {'Dev loss': '0.43706', 'Dev accu': '79.55'}

Epoch 4/5 {'Train loss': '0.25230', 'Train accu': '92.81'}
Validation... {'Dev loss': '0.39443', 'Dev accu': '82.43'}

Epoch 5/5 {'Train loss': '0.13678', 'Train accu': '97.42'}
Validation... {'Dev loss': '0.46483', 'Dev accu': '78.40'}

CPU times: user 40.1 s, sys: 3.53 s, total: 43.6 s
Wall time: 32.7 s


In [9]:
# Evaluate the model currently wrapped by PT (the CNN model when cells are run in order) on the test batches.
PT.evaluate(test_batched)

{'Test loss': '0.43934', 'Test accu': '80.07'}

## RNN (Recurrent neural network) 

As the RNN models also take the sequence length as an input, we need to rebuild the mini batches for the train set, dev set, and test set from the already encoded data, this time with `include_seq_len=True`. 

In [10]:
# ---- rebuild mini batches (with sequence lengths) for the train, dev and test sets ----
# Same batch_size / max_seq_len as the earlier batching cell; only
# include_seq_len differs.
seq_batch_options = dict(batch_size=64, max_seq_len=128, include_seq_len=True)

# First batch all three encoded splits ...
train_batched, dev_batched, test_batched = [
    build_batches(encoded, **seq_batch_options)
    for encoded in (train_encoded, dev_encoded, test_encoded)
]

# ... then convert each of them to tensors.
# NOTE: this rebinds train/dev/test_batched, so the earlier BoW / CNN cells
# can no longer be re-run without rebuilding their batches first.
train_batched, dev_batched, test_batched = [
    to_tensor(batches)
    for batches in (train_batched, dev_batched, test_batched)
]

### Simple RNN

In [11]:
from pytorch_models.S_RNN import SimpleRNN


model = SimpleRNN(len(V.vocab_to_idx), 2, bidirectional=False)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
PT = PyTorchUtils(model, optimizer, criterion, include_seq_len=True)
%time PT.train(train_batched, dev_batched, epochs=5)

Epoch 1/5 {'Train loss': '0.69202', 'Train accu': '45.34'}
Validation... {'Dev loss': '0.69197', 'Dev accu': '49.12'}

Epoch 2/5 {'Train loss': '0.68519', 'Train accu': '51.04'}
Validation... {'Dev loss': '0.69384', 'Dev accu': '50.85'}

Epoch 3/5 {'Train loss': '0.67558', 'Train accu': '53.00'}
Validation... {'Dev loss': '0.66694', 'Dev accu': '61.35'}

Epoch 4/5 {'Train loss': '0.67112', 'Train accu': '54.17'}
Validation... {'Dev loss': '0.71120', 'Dev accu': '50.55'}

Epoch 5/5 {'Train loss': '0.66367', 'Train accu': '54.56'}
Validation... {'Dev loss': '0.69764', 'Dev accu': '49.67'}

CPU times: user 1min 53s, sys: 8.53 s, total: 2min 2s
Wall time: 58.4 s


In [12]:
# Evaluate the model currently wrapped by PT (the SimpleRNN model when cells are run in order) on the test batches.
PT.evaluate(test_batched)

{'Test loss': '0.68780', 'Test accu': '51.75'}

### LSTM (Long short-term memory)

In [13]:
from pytorch_models.LSTM import LSTM


model = LSTM(len(V.vocab_to_idx), 2, bidirectional=False)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
PT = PyTorchUtils(model, optimizer, criterion, include_seq_len=True)
%time PT.train(train_batched, dev_batched, epochs=5)

Epoch 1/5 {'Train loss': '0.69281', 'Train accu': '42.83'}
Validation... {'Dev loss': '0.69248', 'Dev accu': '49.70'}

Epoch 2/5 {'Train loss': '0.68699', 'Train accu': '53.17'}
Validation... {'Dev loss': '0.69406', 'Dev accu': '50.69'}

Epoch 3/5 {'Train loss': '0.67005', 'Train accu': '52.60'}
Validation... {'Dev loss': '0.68168', 'Dev accu': '56.88'}

Epoch 4/5 {'Train loss': '0.63780', 'Train accu': '60.52'}
Validation... {'Dev loss': '0.64299', 'Dev accu': '64.20'}

Epoch 5/5 {'Train loss': '0.61705', 'Train accu': '62.87'}
Validation... {'Dev loss': '0.64244', 'Dev accu': '63.82'}

CPU times: user 8min 11s, sys: 2min 9s, total: 10min 20s
Wall time: 5min 2s


In [14]:
# Evaluate the model currently wrapped by PT (the LSTM model when cells are run in order) on the test batches.
PT.evaluate(test_batched)

{'Test loss': '0.64369', 'Test accu': '61.57'}

### GRU (Gated recurrent units)  

In [15]:
from pytorch_models.GRU import GRU


model = GRU(len(V.vocab_to_idx), 2, bidirectional=False)
optimizer = optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.BCEWithLogitsLoss()
PT = PyTorchUtils(model, optimizer, criterion, include_seq_len=True)
%time PT.train(train_batched, dev_batched, epochs=5)

Epoch 1/5 {'Train loss': '0.69216', 'Train accu': '42.46'}
Validation... {'Dev loss': '0.69208', 'Dev accu': '50.27'}

Epoch 2/5 {'Train loss': '0.68622', 'Train accu': '53.30'}
Validation... {'Dev loss': '0.69203', 'Dev accu': '50.27'}

Epoch 3/5 {'Train loss': '0.66835', 'Train accu': '56.89'}
Validation... {'Dev loss': '0.64390', 'Dev accu': '64.17'}

Epoch 4/5 {'Train loss': '0.62423', 'Train accu': '62.10'}
Validation... {'Dev loss': '0.64492', 'Dev accu': '63.65'}

Epoch 5/5 {'Train loss': '0.60530', 'Train accu': '63.14'}
Validation... {'Dev loss': '0.64059', 'Dev accu': '64.09'}

CPU times: user 6min 4s, sys: 59.4 s, total: 7min 4s
Wall time: 3min 11s


In [16]:
# Evaluate the model currently wrapped by PT (the GRU model when cells are run in order) on the test batches.
PT.evaluate(test_batched)

{'Test loss': '0.64588', 'Test accu': '61.35'}