* What is the standard working process with NLP chinese corpus?
* What is the difference between BERT chinese and baidu ERNIE?
* How does tokenizer work and the relation with embedding layer?

In [18]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import transformers
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from keras.preprocessing import sequence, text
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import pandas as pd
import os
import shutil

In [2]:
import sys
sys.path.append('./ERNIE')
from ernie.tokenizing_ernie import ErnieTokenizer

In [3]:
def generate_bert_ids(tokenizer, texts, labels, output_length=128):
    bert_ids = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=output_length)
    input_ids = np.asarray(bert_ids['input_ids'])
    attention_mask = np.asarray(bert_ids['attention_mask'])
    return input_ids, attention_mask, np.array(labels), bert_ids

In [4]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = roc_curve(target, predictions)
    roc_auc = auc(fpr, tpr)
    return roc_auc

## 1 data processing

In [5]:
df = pd.read_csv('data/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv')
df.head(3)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。


In [6]:
df.shape

(7766, 2)

In [7]:
df.label.value_counts()

1    5322
0    2444
Name: label, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], np.array(df['label']), test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2021)

In [9]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(4348,)
(1088,)
(2330,)


## 2 BERT Chinese
###  tokenizer

In [10]:
# configuration
max_length = 128  # Maximum length of input sentence to the model.
max_features = 20000  # Only consider the top 20k words
batch_size = 32
epochs = 3
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='auto', restore_best_weights=True)]

In [11]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese", model_max_length=5000)
bert_model = TFBertForSequenceClassification.from_pretrained('/Users/wegzheng/Downloads/bert-base-chinses',num_labels=2)
# bert_model = transformers.TFBertModel.from_pretrained("/Users/wegzheng/Downloads/bert-base-chinses")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at /Users/wegzheng/Downloads/bert-base-chinses and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# vocabulary sieze
bert_tokenizer.vocab_size

21128

In [15]:
# transforming texts to ids
# %%time
input_ids_tr, att_mask_tr, label_tr, tr_dict = generate_bert_ids(bert_tokenizer, X_train.astype('str').to_list(), y_train)
input_ids_va, att_mask_va, label_va, va_dict = generate_bert_ids(bert_tokenizer, X_val.astype('str').to_list(), y_val)
input_ids_ts, att_mask_ts, label_ts, ts_dict = generate_bert_ids(bert_tokenizer, X_test.astype('str').to_list(), y_test)

In [16]:
tr_dict.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [139]:
print(len(label_tr))
print(len(label_va))
print(len(label_ts))

4348
1088
2330


### 2.1 GRU model

In [20]:
model = tf.keras.Sequential()
model.add(layers.Embedding(max_features, 128))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 128)         0         
_________________________________________________________________
gru_1 (GRU)                  (None, 300)               387000    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 2,947,301
Trainable params: 2,947,301
Non-trainable params: 0
_________________________________________________________________


In [25]:
%%time
model.fit(input_ids_tr, label_tr, 
          batch_size=32, 
          epochs=100, 
          validation_data=(input_ids_va, label_va),
         callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
CPU times: user 7min 19s, sys: 2min 49s, total: 10min 8s
Wall time: 1min 41s


<tensorflow.python.keras.callbacks.History at 0x1a8f2b310>

In [26]:
%time preds = model.predict(input_ids_ts)

CPU times: user 9.81 s, sys: 3.77 s, total: 13.6 s
Wall time: 2.25 s


In [27]:
pred_labels = [1 if i >=0.5 else 0 for i in pd.DataFrame(preds)[0].to_list()]
print(accuracy_score(label_ts, pred_labels))
print(roc_auc(label_ts, pred_labels))

0.688412017167382
0.614281247734358


### 2.2 self-defined BI-LSTM model

In [95]:
# Input for variable-length sequences of integers
inputs = tf.keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
bidirectional_9 (Bidirection (None, None, 128)         98816     
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 2,757,761
Trainable params: 2,757,761
Non-trainable params: 0
_________________________________________________________________


In [96]:
%%time
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(input_ids_tr, label_tr, 
          batch_size=32, 
          epochs=100, 
          validation_data=(input_ids_va, label_va),
         callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
CPU times: user 5min, sys: 1min 59s, total: 7min
Wall time: 1min 6s


<tensorflow.python.keras.callbacks.History at 0x1abaccc10>

In [105]:
%time preds = model.predict(input_ids_ts)

CPU times: user 5.37 s, sys: 2.48 s, total: 7.84 s
Wall time: 1.45 s


In [124]:
pred_labels = [1 if i >=0.5 else 0 for i in pd.DataFrame(preds)[0].to_list()]

In [126]:
accuracy_score(label_ts, pred_labels)

0.8523605150214593

In [132]:
roc_auc(label_ts, pred_labels)

0.8295590753424658

### 2.3 BERT chinese

In [150]:
bert_model.trainable = True
print('\nBert Model',bert_model.summary())

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-04)
bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

# loss = losses.BinaryCrossentropy(from_logits=True)
# metric = tf.keras.metrics.BinaryAccuracy('accuracy')
# optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
# bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  102267648 
_________________________________________________________________
dropout_334 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 102,269,186
Trainable params: 102,269,186
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [151]:
%%time
history = bert_model.fit([input_ids_tr,att_mask_tr],label_tr,
                       batch_size=32, 
                       epochs=3, 
                       validation_data=([input_ids_va,att_mask_va],label_va),
                       callbacks=callbacks)

Epoch 1/3


  # not be used in advertising or publicity pertaining to distribution






















Epoch 2/3
Epoch 3/3
CPU times: user 9h 15min 43s, sys: 2h 26min 23s, total: 11h 42min 7s
Wall time: 1h 21min 5s


In [152]:
%time preds = bert_model.predict([input_ids_ts,att_mask_ts],batch_size=64)









CPU times: user 28min 36s, sys: 9min 5s, total: 37min 41s
Wall time: 4min 48s


In [153]:
pred_arr = tf.nn.softmax(preds[0], axis=-1)
pred_labels = tf.argmax(pred_arr, axis=1).numpy()
accuracy_score(label_ts, pred_labels,)

0.9072961373390558

In [154]:
roc_auc(label_ts, pred_labels)

0.8976139225368956

In [155]:
# bert_model.save_pretrained('model/finetuned/bert-base-chinese/')

## 3 baidu ERNIE

In [156]:
tokenizer_baidu = ErnieTokenizer.from_pretrained('ernie-1.0')

2021-03-22 17:06:57,384 - INFO - get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz


In [160]:
X_train.astype('str').to_list()[:3]

['房间很大，服务也很好，地理位置比较好，但宾馆门前停车很不方便，比较窄。',
 '酒店非常的好，位子也还不错。楼上的餐厅可以看到很美的上海夜景非常喜欢，价格太高啦',
 '本想订汉庭，地理位置不错的酒店，从汽车西站叫车过去15元，离文昌阁几步之遥，10分钟可到有名的福满楼，金鹰国际购物中心、时代广场近在咫尺。个园、何园、瘦西湖步行20分钟内均可到达，三轮的话5元没问题。大堂宽敞，房间设施符合三星要求，床铺整洁。只是卫生间略显成旧，用的是浴缸，不过坐厕、龙头都是美标的，水也不是想象中那么小、凉，总体还算不错。10元的早餐内容较少，不过咱也不图这个对吧！哈哈，下次去还订那儿。补充点评2008年5月13日：本想订汉庭，预定已满订了红杉树。地理位置不错的酒店，从汽车西站叫车过去15元，离文昌阁几步之遥，10分钟可到有名的福满楼，金鹰国际购物中心、时代广场近在咫尺。个园、何园、瘦西湖步行20分钟内均可到达，三轮的话5元没问题。大堂宽敞，房间设施符合三星要求，床铺整洁。只是卫生间略显成旧，用的是浴缸，不过坐厕、龙头都是美标的，水也不是想象中那么小、凉，总体还算不错。10元的早餐内容较少，不过咱也不图这个对吧！哈哈，下次去还订那儿。']

In [164]:
tokenizer_baidu.encode(X_train.astype('str').to_list()[0])

(array([    1,   458,   143,   321,    19,     4,   231,   112,   105,
          321,   170,     4,    31,    38,   144,   521,   277,   420,
          170,     4,   255,  1468,   774,   232,   152,  1025,   320,
          321,    16,    58,   518,     4,   277,   420,  2325, 12043,
            2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [166]:
def generate_ernie_ids(tokenizer, texts, labels, max_len):
    assert isinstance(texts, list) == True
    ids = []
    for i in range(len(texts)):
        text = texts[i]
        text_id, _ = tokenizer.encode(text) # ErnieTokenizer 会自动添加ERNIE所需要的特殊token，如[CLS], [SEP]
        text_id = text_id[:max_len]
        text_id = np.pad(text_id, [0, max_len-len(text_id)], mode='constant')
        label = labels[i]
        ids.append(text_id)
    return np.array(ids), np.array(labels)

In [170]:
%%time
ernie_ids_tr, label_tr = generate_ernie_ids(tokenizer_baidu, X_train.astype('str').to_list(), label_tr, max_len=128)
ernie_ids_va, label_va = generate_ernie_ids(tokenizer_baidu, X_val.astype('str').to_list(), label_va, max_len=128)
ernie_ids_ts, label_ts = generate_ernie_ids(tokenizer_baidu, X_test.astype('str').to_list(), label_ts, max_len=128)

CPU times: user 2.82 s, sys: 14 ms, total: 2.84 s
Wall time: 2.84 s


### 3.1 finetune with bi-lstm

In [177]:
%%time
model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
model.fit(ernie_ids_tr, label_tr, 
          batch_size=32, 
          epochs=100, 
          validation_data=(ernie_ids_va, label_va),
         callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
CPU times: user 5min 6s, sys: 2min 3s, total: 7min 9s
Wall time: 1min 8s


<tensorflow.python.keras.callbacks.History at 0x1af6a2ed0>

In [178]:
%time preds = model.predict(ernie_ids_ts)

CPU times: user 6.99 s, sys: 2.92 s, total: 9.9 s
Wall time: 2.6 s


In [179]:
pred_labels = [1 if i >=0.5 else 0 for i in pd.DataFrame(preds)[0].to_list()]
print(accuracy_score(label_ts, pred_labels))
print(roc_auc(label_ts, pred_labels))

0.863519313304721
0.8446244618696187


### 3.2 ernie on distilBERT

* ernie_ids_tr & att_mask_tr as input

In [183]:
distil_model = transformers.TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_354']
You should probably TRAIN this model on a down-stream task to be able to use 

In [191]:
input_distil_tr = [ernie_ids_tr, att_mask_tr]
input_distil_va = [ernie_ids_va, att_mask_va]
input_distil_ts = [ernie_ids_ts, att_mask_ts]

In [185]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-06)
distil_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

In [187]:
%%time
history = distil_model.fit(input_distil_tr,label_tr,
                       batch_size=64, 
                       epochs=3, 
                       validation_data=(input_distil_va,label_va),
                       callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 4h 22min 38s, sys: 1h 4min 27s, total: 5h 27min 5s
Wall time: 37min 1s


In [190]:
input_distil_ts

[array([[   1,  458,  143, ...,    0,    0,    0],
        [   1,  661,  737, ...,    0,    0,    0],
        [   1,   89,  313, ...,  597,   33, 1100],
        ...,
        [   1,  520,   15, ...,    4,   39,  232],
        [   1,   31,   38, ...,    0,    0,    0],
        [   1,  836,   97, ...,    0,    0,    0]]),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 0, 0]], dtype=int32)]

In [195]:
%time preds = distil_model.predict(input_distil_ts,batch_size=64)









CPU times: user 13min 52s, sys: 3min 38s, total: 17min 30s
Wall time: 2min 13s


In [196]:
pred_arr = tf.nn.softmax(preds[0], axis=-1)
pred_labels = tf.argmax(pred_arr, axis=1).numpy()
print(accuracy_score(label_ts, pred_labels,))
print(roc_auc(label_ts, pred_labels))

0.8545064377682403
0.8358126626254221


### 3.3 ernie on single bi-LSTM

In [225]:
# Input for variable-length sequences of integers
inputs = tf.keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64,dropout=0.3, recurrent_dropout=0.3, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model_lstm = tf.keras.Model(inputs, outputs)
model_lstm.summary()

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_12 (Embedding)     (None, None, 128)         2560000   
_________________________________________________________________
bidirectional_21 (Bidirectio (None, None, 128)         98816     
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 128)               98816     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 129       
Total params: 2,757,761
Trainable params: 2,757,761
Non-trainable params: 0
_________________________________________________________________


In [226]:
%%time
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-04)
# model_lstm.compile(loss=loss,optimizer=optimizer,metrics=[metric])

opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_lstm.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

model_lstm.fit(ernie_ids_tr, label_tr, 
          batch_size=32, 
          epochs=100, 
          validation_data=(ernie_ids_va, label_va),
         callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
CPU times: user 13min 53s, sys: 5min 10s, total: 19min 4s
Wall time: 2min 47s


<tensorflow.python.keras.callbacks.History at 0x1bb403090>

In [227]:
%time preds = model_lstm.predict(ernie_ids_ts)

CPU times: user 8.51 s, sys: 3.2 s, total: 11.7 s
Wall time: 2.67 s


In [228]:
pred_labels = [1 if i >=0.5 else 0 for i in pd.DataFrame(preds)[0].to_list()]
print(accuracy_score(label_ts, pred_labels))
print(roc_auc(label_ts, pred_labels))

0.8592274678111588
0.8351891154022439
