In [29]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.metrics import accuracy_score
import tensorflow as tf
import pandas as pd
import os
import shutil

In [4]:
def generate_bert_ids(tokenizer, texts, labels, output_length=128):
    bert_ids = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=output_length)
    input_ids = np.asarray(bert_ids['input_ids'])
    attention_mask = np.asarray(bert_ids['attention_mask'])
    return input_ids, attention_mask, np.array(labels)

In [5]:
df = pd.read_csv('data/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv')
df.head(3)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。


In [6]:
df_output = df.sample(frac=1, random_state=2021).reset_index(drop=True)
df_output.head(3)

Unnamed: 0,label,review
0,0,我第一次入住如家，实际情况与现实相差太远，屋子里味道很大，打开窗户换气又很吵，卫生不敢恭维！...
1,0,"给朋友定了该酒店,价格比以前高出了很多.这且不说.转天朋友结帐,前台因为染了一块毛巾和床单的..."
2,1,"环境不错,地点也很好!下次还会入住!"


In [7]:
df_output.shape

(7766, 2)

In [8]:
# split train:dev:test as 8:1:1
train_df = df_output.iloc[:5000]
valid_df = df_output.iloc[5000:6500]
test_df = df_output.iloc[6500:]

train_df.to_csv('data/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/cleaned/train.csv', sep=',', index=False)
valid_df.to_csv('data/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/cleaned/valid.csv', sep=',', index=False)
test_df.to_csv('data/ChineseNlpCorpus/datasets/ChnSentiCorp_htl_all/cleaned/test.csv', sep=',', index=False)

## 方法1 jieba分词

In [9]:
import jieba
import codecs
import re

In [10]:
# download from https://github.com/stopwords-iso/stopwords-zh
stopwords = [ w.strip() for w in codecs.open('data/stopwords-zh/stopwords-zh.txt', 'r', encoding='utf-8') ]

In [11]:
# text cleaning
def clearTxt(line):
    if line != '':
        line = line.strip()
        #remove english and numbers
        line = re.sub("[a-zA-Z0-9]","",line)
         
        #去除文本中的中文符号和英文符号
        line = re.sub("[\s+\.\!\/_,$%^*(+\"\'；：“”．]+|[+——！，。？?、~@#￥%……&*（）]+", "", line)
        return line
    else:
        return 'Empyt Line'
 
# text cut, remove stopwords
def sent2word(line):
     
    segList = jieba.cut(line, cut_all=False)
    segSentence = ''
    for word in segList:
        if word != '\t' and ( word not in stopwords ):
            segSentence += ( word + " " )
    return segSentence.strip()

In [12]:
test = jieba.cut(clearTxt(df['review'][0]))
sentence = ''
for w in test:
    if w != '\t' and (w not in stopwords):
        sentence += (w + " ")      

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/rb/2_frt6zn7qz8ybmdwqwmjd100000gn/T/jieba.cache
Loading model cost 0.723 seconds.
Prefix dict has been built successfully.


In [13]:
print(df['review'][0])
print(' ')
print(sentence)

距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.
 
距离 川沙 公路 较近 公交 指示 蔡陆线 非常 麻烦 建议 路线 房间 较为简单 


## 1 BERT 

In [14]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese", model_max_length=5000)
# bert_tokenizer.save_pretrained("/Users/wegzheng/Downloads/bert-base-chinses")

In [17]:
bert_model = TFBertForSequenceClassification.from_pretrained('/Users/wegzheng/Downloads/bert-base-chinses',num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at /Users/wegzheng/Downloads/bert-base-chinses and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
train_df.head(3)

Unnamed: 0,label,review
0,0,我第一次入住如家，实际情况与现实相差太远，屋子里味道很大，打开窗户换气又很吵，卫生不敢恭维！...
1,0,"给朋友定了该酒店,价格比以前高出了很多.这且不说.转天朋友结帐,前台因为染了一块毛巾和床单的..."
2,1,"环境不错,地点也很好!下次还会入住!"


In [19]:
[i for i in train_df['review'][:5].astype('str')]

['我第一次入住如家，实际情况与现实相差太远，屋子里味道很大，打开窗户换气又很吵，卫生不敢恭维！入住时间不长就搬走。而且位置也很偏，不知道为何在这里选址！总之不会再住！楼下涮肉一般！',
 '给朋友定了该酒店,价格比以前高出了很多.这且不说.转天朋友结帐,前台因为染了一块毛巾和床单的说法要求赔偿110元.朋友很不开心,说就是喝你们冰箱可乐时候喷出来一些.而且110买你5套也够了.结果我要求前台把经理找来,一个小伙子说他就是.朋友说你们四星这么做有点过火.后来我要求那个小伙子带我去房间看一下有没有污染,到什么程度.于是他就一起去了,床单的一大片我表示怀疑是否是朋友染的.于是在我拉掉床单的一刹那,令人惊讶的事情发生了,在下面棉单上面,竟然有一女人例假的血迹.当然就想吐,那血还好象是近期的.朋友一看就急了,在这床上睡一晚是多么让人恶心.我相信酒店也完全能检查出来有,根本没有换,或者简单洗完就铺上了.所以,请大家今后住这个酒店一定好好检查检查.(酒店最终解决是半价,但是远远无法消除这种失望,决心不再入住).补充点评2007年2月3日：如果床单上发现有血迹,这在国外,甚至国内的4,5星酒店绝对是无法容忍的事情.酒店不是简单的打着折就能摆平的事情.',
 '环境不错,地点也很好!下次还会入住!',
 '大堂门口外管理停车的保安不尊重客人，一辆市府领导的车停在对面，那一长条的车位就不准停车。更重要的事出言不逊，“这地方是你社会车辆能停的吗？”真让我在美国同事（懂汉语）面前觉得丢中国人的脸！',
 '1.希望房间的改造加快进度,改造完房间不错,没有改造好的房间的确是三星的感觉.2.早餐很有特色,很不错.']

In [167]:
%%time 
input_ids_tr, att_mask_tr, label_tr = generate_bert_ids(bert_tokenizer, train_df['review'].astype('str').to_list(), train_df['label'].values)
input_ids_va, att_mask_va, label_va = generate_bert_ids(bert_tokenizer, valid_df['review'].astype('str').to_list(), valid_df['label'].values)
input_ids_ts, att_mask_ts, label_ts = generate_bert_ids(bert_tokenizer, test_df['review'].astype('str').to_list(), test_df['label'].values)

CPU times: user 6.46 s, sys: 26.3 ms, total: 6.48 s
Wall time: 6.49 s


In [204]:
print("lenth of data: {}".format(len(input_ids_tr)))
print("lenth of data: {}".format(len(input_ids_va)))
print("lenth of data: {}".format(len(input_ids_ts)))

lenth of data: 5000
lenth of data: 1500
lenth of data: 1266


In [205]:
bert_tokenizer.vocab_size

21128

In [206]:
input_ids_tr[0]

array([ 101, 2769, 5018,  671, 3613, 1057,  857, 1963, 2157, 8024, 2141,
       7354, 2658, 1105,  680, 4385, 2141, 4685, 2345, 1922, 6823, 8024,
       2238, 2094, 7027, 1456, 6887, 2523, 1920, 8024, 2802, 2458, 4970,
       2787, 2940, 3698, 1348, 2523, 1427, 8024, 1310, 4495,  679, 3140,
       2621, 5335, 8013, 1057,  857, 3198, 7313,  679, 7270, 2218, 3021,
       6624,  511, 5445,  684,  855, 5390,  738, 2523,  974, 8024,  679,
       4761, 6887,  711,  862, 1762, 6821, 7027, 6848, 1770, 8013, 2600,
        722,  679,  833, 1086,  857, 8013, 3517,  678, 3888, 5489,  671,
       5663, 8013,  102,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [207]:
print("Training ids length is {}".format(len(input_ids_tr[0])))

Training ids length is 128


In [208]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-04)
bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

In [209]:
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, mode='auto', restore_best_weights=True)]

In [210]:
%%time
history = bert_model.fit([input_ids_tr,att_mask_tr],label_tr,
                       batch_size=32, 
                       epochs=3, 
                       validation_data=([input_ids_va,att_mask_va],label_va),
                       callbacks=callbacks)

Epoch 1/3


  # not be used in advertising or publicity pertaining to distribution






















Epoch 2/3
Epoch 3/3
CPU times: user 12h 32min 2s, sys: 3h 54min 11s, total: 16h 26min 13s
Wall time: 3h 15min 11s


In [211]:
%time preds = bert_model.predict([input_ids_ts,att_mask_ts],batch_size=64)









CPU times: user 19min 25s, sys: 5min 43s, total: 25min 9s
Wall time: 3min 12s


In [212]:
pred_arr = tf.nn.softmax(preds[0], axis=-1)
pred_labels = tf.argmax(pred_arr, axis=1).numpy()
accuracy_score(label_ts, pred_labels,)

0.9115323854660348

## 2 raw keras

In [145]:
from tensorflow.keras import layers
from tensorflow.keras import losses
from keras.preprocessing import sequence, text
import matplotlib.pyplot as plt

In [146]:
# # using keras tokenizer here
# token = text.Tokenizer(num_words=20000) # or use None)
# max_len = 1500

# train_texts = train_df['review'].astype('str').to_list()
# val_texts = valid_df['review'].astype('str').to_list()
# test_texts = test_df['review'].astype('str').to_list()

# token.fit_on_texts(list(train_texts) + list(val_texts) + list(test_texts))
# xtrain_seq = token.texts_to_sequences(train_texts)
# xvalid_seq = token.texts_to_sequences(val_texts)
# xtest_seq = token.texts_to_sequences(test_texts)

In [156]:
word_index = token.word_index
print("token index length is {}".format(len(word_index)))

token index length is 25316


In [171]:
#zero pad the sequences
xtrain_pad = sequence.pad_sequences(input_ids_tr, maxlen=128)
xvalid_pad = sequence.pad_sequences(input_ids_va, maxlen=128)
xtest_pad = sequence.pad_sequences(input_ids_ts, maxlen=128)

In [172]:
xtest_pad.shape

(1266, 128)

In [173]:
xtrain_pad[3]

array([ 101, 1920, 1828, 7305, 1366, 1912, 5052, 4415,  977, 6756, 4638,
        924, 2128,  679, 2203, 7028, 2145,  782, 8024,  671, 6775, 2356,
       2424, 7566, 2193, 4638, 6756,  977, 1762, 2190, 7481, 8024, 6929,
        671, 7270, 3340, 4638, 6756,  855, 2218,  679, 1114,  977, 6756,
        511, 3291, 7028, 6206, 4638,  752, 1139, 6241,  679, 6849, 8024,
        100, 6821, 1765, 3175, 3221,  872, 4852,  833, 6756, 6775, 5543,
        977, 4638, 1408, 8043,  100, 4696, 6375, 2769, 1762, 5401, 1744,
       1398,  752, 8020, 2743, 3727, 6427, 8021, 7481, 1184, 6230, 2533,
        696,  704, 1744,  782, 4638, 5567, 8013,  102,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [187]:
# create model structure
model_keras = tf.keras.Sequential([
    layers.Embedding(len(word_index) + 1, 32),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)
])
model_keras.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 32)          810144    
_________________________________________________________________
dropout_62 (Dropout)         (None, None, 32)          0         
_________________________________________________________________
global_average_pooling1d_12  (None, 32)                0         
_________________________________________________________________
dropout_63 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
Total params: 810,177
Trainable params: 810,177
Non-trainable params: 0
_________________________________________________________________


In [188]:
model_keras.compile(loss=losses.BinaryCrossentropy(from_logits=True),
             optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
             metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.5))

In [189]:
%%time
# fit the model
epochs = 200
history = model_keras.fit(xtrain_pad, np.array(label_tr),
                       batch_size=64, 
                       epochs=epochs, 
                       validation_data=(xvalid_pad,np.array(label_va)),
                       callbacks=callbacks)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200


Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
CPU times: user 2min 33s, sys: 18.5 s, total: 2min 51s
Wall time: 39.4 s


In [190]:
preds = model_keras.predict_classes(xtest_pad)
accuracy_score(label_ts, preds)

0.8854660347551343

## 3 baidu ERNIE

### 3.1 baidu raw tutorial

In [None]:
# !python3 -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple

In [62]:
!git clone https://github.com/PaddlePaddle/ERNIE.git

Cloning into 'ERNIE'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 2034 (delta 1), reused 3 (delta 1), pack-reused 2027[K
Receiving objects: 100% (2034/2034), 88.86 MiB | 1.70 MiB/s, done.
Resolving deltas: 100% (1106/1106), done.


In [63]:
!pip install -r ERNIE/requirements.txt

Collecting pyzmq==18.0.2
  Downloading pyzmq-18.0.2-cp37-cp37m-macosx_10_9_x86_64.whl (812 kB)
[K     |████████████████████████████████| 812 kB 2.9 MB/s eta 0:00:01
[?25hCollecting six==1.11.0
  Downloading six-1.11.0-py2.py3-none-any.whl (10 kB)
Collecting sentencepiece==0.1.8
  Downloading sentencepiece-0.1.8-cp37-cp37m-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 107 kB/s eta 0:00:01
[?25hCollecting jieba==0.39
  Downloading jieba-0.39.zip (7.3 MB)
[K     |████████████████████████████████| 7.3 MB 3.8 MB/s eta 0:00:01
[?25hCollecting visualdl>=2.0.0b7
  Downloading visualdl-2.1.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 111 kB/s eta 0:00:01
[?25hCollecting pathlib2>=2.3.2
  Downloading pathlib2-2.3.5-py2.py3-none-any.whl (18 kB)
Collecting bce-python-sdk
  Downloading bce_python_sdk-0.8.59-py3-none-any.whl (192 kB)
[K     |████████████████████████████████| 192 kB 1.5 MB/s eta 0:00:01
Collecting Flask-Babel

[?25h  Downloading tqdm-4.37.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 3.1 MB/s eta 0:00:011
[?25h  Downloading tqdm-4.36.1-py2.py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.9 MB/s eta 0:00:011
[?25h  Downloading tqdm-4.36.0-py2.py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 2.6 MB/s eta 0:00:011
[?25h  Downloading tqdm-4.35.0-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 4.5 MB/s eta 0:00:011
[?25h  Downloading tqdm-4.34.0-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 12.2 MB/s eta 0:00:01
[?25h  Downloading tqdm-4.33.0-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 2.2 MB/s eta 0:00:01
[?25h  Downloading tqdm-4.32.2-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 4.0 MB/s eta 0:00:011
[?25hINFO: pip is looking at multiple versions of tqdm to determine which ve

In [64]:
import sys
sys.path.append('./ERNIE')

In [111]:
import paddle as P
import paddle.fluid as F
import paddle.fluid.layers as L
import paddle.fluid.dygraph as D

from ernie.tokenizing_ernie import ErnieTokenizer
from ernie.modeling_ernie import ErnieModelForSequenceClassification
from sklearn.metrics import f1_score

In [112]:
#设置好所有的超参数，对于ERNIE任务学习率推荐取1e-5/2e-5/5e-5, 根据显存大小调节BATCH大小, 最大句子长度不超过512.
BATCH=32
MAX_SEQLEN=128
LR=5e-5
EPOCH=5

In [67]:
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

2021-03-20 09:56:51,230 - INFO - get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz
downloading https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz: 788478KB [00:26, 29417.45KB/s]                            


In [125]:
len(tokenizer.vocab)

17964

In [74]:
train_texts[0]

'我第一次入住如家，实际情况与现实相差太远，屋子里味道很大，打开窗户换气又很吵，卫生不敢恭维！入住时间不长就搬走。而且位置也很偏，不知道为何在这里选址！总之不会再住！楼下涮肉一般！'

In [117]:
def generate_ernie_data(tokenizer, texts, labels):
    assert isinstance(texts, list) == True
    data = []
    for i in range(len(texts)):
        text = texts[i]
        text_id, _ = tokenizer.encode(text) # ErnieTokenizer 会自动添加ERNIE所需要的特殊token，如[CLS], [SEP]
        text_id = text_id[:MAX_SEQLEN]
        text_id = np.pad(text_id, [0, MAX_SEQLEN-len(text_id)], mode='constant')
        label = labels[i]
        data.append((text_id, label))
    return data

In [118]:
%%time
ernie_data_tr = generate_ernie_data(tokenizer, train_texts, label_tr)
ernie_data_va = generate_ernie_data(tokenizer, val_texts, label_va)
ernie_data_ts = generate_ernie_data(tokenizer, test_texts, label_ts)

CPU times: user 2.72 s, sys: 21.3 ms, total: 2.74 s
Wall time: 2.73 s


In [103]:
D.guard().__enter__() # 为了让Paddle进入动态图模式，需要添加这一行在最前面

ernie = ErnieModelForSequenceClassification.from_pretrained('ernie-1.0', num_labels=3)
optimizer = F.optimizer.Adam(LR, parameter_list=ernie.parameters())

2021-03-20 10:26:19,049 - INFO - get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz
2021-03-20 10:26:21,976 - INFO - loading pretrained model from /Users/wegzheng/.paddle-ernie-cache/466eabcffd6d6a83ae9cb97dd1a167bd
  log.warn('param:%s not set in pretrained model, skip' % k)


In [107]:
def get_batch_data(data, i):
    d = data[i*BATCH: (i + 1) * BATCH]
    feature, label = zip(*d)
    feature = np.stack(feature)  # 将BATCH行样本整合在一个numpy.array中
    label = np.stack(list(label))
    feature = D.to_variable(feature) # 使用to_variable将numpy.array转换为paddle tensor
    label = D.to_variable(label)
    return feature, label

In [113]:
def ernie_train(train_data, test_data):
    for i in range(EPOCH):
        np.random.shuffle(train_data) # 每个epoch都shuffle数据以获得最佳训练效果；
        #train
        for j in range(len(train_data) // BATCH):
            feature, label = get_batch_data(train_data, j)
            loss, _ = ernie(feature, labels=label) # ernie模型的返回值包含(loss, logits)；其中logits目前暂时不需要使用
            loss.backward()
            optimizer.minimize(loss)
            ernie.clear_gradients()
            if j % 2 == 0:
                print('train %d: loss %.5f' % (j, loss.numpy()))
            # evaluate
            if j % 10 == 0:
                all_pred, all_label = [], []
                with D.base._switch_tracer_mode_guard_(is_train=False): # 在这个with域内ernie不会进行梯度计算；
                    ernie.eval() # 控制模型进入eval模式，这将会关闭所有的dropout；
                    for j in range(len(test_data) // BATCH):
                        feature, label = get_batch_data(test_data, j)
                        loss, logits = ernie(feature, labels=label) 
                        all_pred.extend(L.argmax(logits, -1).numpy())
                        all_label.extend(label.numpy())
                    ernie.train()
                f1 = f1_score(all_label, all_pred, average='macro')
                print('f1 %.5f' % f1)
    return ernie

In [114]:
%time ernie_finetune = ernie_train(ernie_data_tr, ernie_data_va)

train 0: loss 0.71942
f1 0.40860
train 2: loss 1.06447
train 4: loss 0.56342
train 6: loss 0.75074
train 8: loss 0.68717
train 10: loss 0.47982
f1 0.43173
train 12: loss 0.52679
train 14: loss 0.49214
train 16: loss 0.52026
train 18: loss 0.47697
train 20: loss 0.39963


KeyboardInterrupt: 

### 3.2 self defined network

In [195]:
from keras.layers.recurrent import LSTM

In [183]:
def self_defined_model(tk_length, dim=32, nlabels=1, LR=1e-4):
    model_nn = tf.keras.Sequential([
    layers.Embedding(tk_length + 1, dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)
    ])
    
    model_nn.compile(loss=losses.BinaryCrossentropy(from_logits=True),
             optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
             metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.5))
    return model_nn

In [115]:
def generate_ernie_ids(tokenizer, texts, labels):
    assert isinstance(texts, list) == True
    ids = []
    for i in range(len(texts)):
        text = texts[i]
        text_id, _ = tokenizer.encode(text) # ErnieTokenizer 会自动添加ERNIE所需要的特殊token，如[CLS], [SEP]
        text_id = text_id[:MAX_SEQLEN]
        text_id = np.pad(text_id, [0, MAX_SEQLEN-len(text_id)], mode='constant')
        label = labels[i]
        ids.append(text_id)
    return np.array(ids), np.array(labels)

In [184]:
%%time
ernie_ids_tr, label_tr = generate_ernie_ids(tokenizer, train_texts, train_df['label'].values)
ernie_ids_va, label_va = generate_ernie_ids(tokenizer, val_texts, valid_df['label'].values)
ernie_ids_ts, label_ts = generate_ernie_ids(tokenizer, test_texts, test_df['label'].values)

CPU times: user 2.65 s, sys: 21.8 ms, total: 2.68 s
Wall time: 2.67 s


In [185]:
model_baidu = self_defined_model(tk_length=len(tokenizer.vocab), LR=1e-4)

In [186]:
model_baidu.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 32)          574880    
_________________________________________________________________
dropout_60 (Dropout)         (None, None, 32)          0         
_________________________________________________________________
global_average_pooling1d_11  (None, 32)                0         
_________________________________________________________________
dropout_61 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 574,913
Trainable params: 574,913
Non-trainable params: 0
_________________________________________________________________


In [192]:
%%time
# fit the model
epochs = 400
history = model_baidu.fit(ernie_ids_tr, label_tr,
                       batch_size=64, 
                       epochs=epochs, 
                       validation_data=(ernie_ids_va, label_va),
                       callbacks=callbacks)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200


Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
CPU times: user 1min 38s, sys: 15.6 s, total: 1min 53s
Wall time: 28.8 s


In [194]:
preds = model_baidu.predict_classes(ernie_ids_ts)
accuracy_score(label_ts, preds)

0.8862559241706162

In [197]:
def self_lstm_model(tk_length, dim=32, nlabels=1, LR=1e-4):
    model_nn = tf.keras.Sequential([
    layers.Embedding(tk_length + 1, dim),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    layers.Dropout(0.2),
    layers.Dense(1)
    ])
    
    model_nn.compile(loss=losses.BinaryCrossentropy(from_logits=True),
             optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
             metrics=tf.keras.metrics.BinaryAccuracy(threshold=0.5))
    return model_nn

In [201]:
model_lstm = self_lstm_model(tk_length=len(tokenizer.vocab), LR=3e-5)

In [202]:
%%time
# fit the model
epochs = 300
history = model_lstm.fit(ernie_ids_tr, label_tr,
                       batch_size=64, 
                       epochs=epochs, 
                       validation_data=(ernie_ids_va, label_va),
                       callbacks=callbacks)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
CPU times: user 21min 46s, sys: 10min 7s, total: 31min 53s
Wall time: 5min 10s


In [203]:
preds = model_lstm.predict_classes(ernie_ids_ts)
accuracy_score(label_ts, preds)

0.8720379146919431