In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install kashgari
!pip install tensorflow_addons==0.13.0
!pip install tensorflow==2.3.0

In [4]:
import datetime
import os
import re
import tensorflow as tf
print(tf.__version__)
import kashgari
from kashgari.tokenizers import BertTokenizer
from kashgari.embeddings import BareEmbedding, BertEmbedding
from kashgari.layers import L
from kashgari.tasks.classification.abc_model import ABCClassificationModel
from tensorflow import keras
import tensorflow.keras.backend as K
from typing import Dict, Any
import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard

pre_trained = 'roberta'
file_path = "/content/drive/MyDrive/ColabNotebooks/project_data/data.txt"
bert_path = "/content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/"

models = []

def load_data(filepath, bertpath):
  X, y = [], []
  x_train, x_test, y_train, y_test = [], [], [], []
  label_list = []
  label_counter = {}

  global embed, tokenizer
  # embed = BareEmbedding(embedding_size=128, processor=kashgari.processors.class_processor, sequence_length=50)
  embed = BertEmbedding(bertpath)
  tokenizer = BertTokenizer.load_from_vocab_file(os.path.join(bert_path, 'vocab.txt'))

  pattern = '[，、。:：；/（）()《》“”"？,.;?·…0-9A-Za-z+=-]'
  lines = open(filepath, 'r', encoding='utf-8').read().splitlines()
  for line in tqdm.tqdm(lines): 
    rows = line.split('\t')
    if len(rows) == 4:
      content = tokenizer.tokenize(re.sub(pattern, "", rows[0]))
      label = rows[1]
      X.append(content)
      y.append(label)
      if label not in label_list:
        label_list.append(label)
        label_counter[label] = 1
      else:
        label_counter[label] += 1
  x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33,shuffle=True)

  print("Project Data Summary:")
  print("Train\t", len(x_train))
  print("Test\t", len(x_test))
  print("Label\t", len(label_list))
  for key, value in label_counter.items():
    print(key + '\t', value)
  return x_train, y_train, x_test, y_test,embed

Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)

2022-05-15 21:57:07,683 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 21:57:07,685 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 21:57:07,687 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 21:57:07,690 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 21:57:07,693 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 21:57:07,695 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

2.3.0


100%|██████████| 19550/19550 [00:00<00:00, 22771.67it/s]

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472





In [9]:
class My_CNN_Model(ABCClassificationModel):
    @classmethod
    def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        return {
            'conv_0': {
                'filters': 32,
                'kernel_size': 5,
                'kernel_initializer': 'normal',
                'padding': 'valid',
                'activation': 'relu',
                'strides': 1
            },
            'concat': {
                'axis': 1
            },
            'dropout': {
                'rate': 0.5
            },
            'activation_layer': {
                'activation': 'softmax'
            },
        }

    def build_model_arc(self):
        output_dim = len(self.label_processor.vocab2idx)
        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # 1D Convolutional Layer
        layers_rcnn_seq = []
        layers_rcnn_seq.append(L.Conv1D(**config['conv_0']))

        # Max Pooling
        layers_sensor = []
        layers_sensor.append(L.GlobalMaxPooling1D())
        layer_concat = L.Concatenate(**config['concat'])

        # Two fully connected layers
        # Dropout to reduce overfitting
        layers_full_connect = []
        layers_full_connect.append(L.Dropout(**config['dropout']))
        layers_full_connect.append(L.Dense(output_dim, **config['activation_layer']))

        tensor = embed_model.output
        for layer in layers_rcnn_seq:
            tensor = layer(tensor)

        tensor_output = layers_sensor[0](tensor)

        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = keras.Model(embed_model.inputs, tensor_output)


In [12]:
# Run Model 
Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)
embed = BareEmbedding(embedding_size=128, processor=kashgari.processors.class_processor, sequence_length=50)
tf_board = TensorBoard(log_dir='tf_dir/cnn_model',
                       histogram_freq=5,
                       update_freq='batch')
K.clear_session()
model = My_CNN_Model(embed)
model.fit(Xtr, ytr, Xts, yts,callbacks=[tf_board],epochs=30,batch_size=32)
report = model.evaluate(Xts, yts)
print(report)

2022-05-15 22:10:03,804 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 22:10:03,810 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 22:10:03,815 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 22:10:03,819 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 22:10:03,825 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 22:10:03,827 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472


Preparing text vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 99253.20it/s] 
Preparing text vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 103124.73it/s]
2022-05-15 22:10:05,868 [DEBUG] kashgari - --- Build vocab dict finished, Total: 2429 ---
2022-05-15 22:10:05,870 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '的', '是', '一', '们', '我', '个']
Preparing classification label vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 718706.81it/s]
Preparing classification label vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 350462.47it/s]
Calculating sequence length: 100%|██████████| 13096/13096 [00:00<00:00, 708966.60it/s]
Calculating sequence length: 100%|██████████| 6451/6451 [00:00<00:00, 490020.38it/s]
2022-05-15 22:10:06,024 [DEBUG] kashgari - Calculated sequence length = 59
2022-05-15 22:10:06,085 [DEBUG] kashgari - Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape     

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-05-15 22:11:50,002 [DEBUG] kashgari - predict input shape (6451, 364) x: 
[[  2   8  21 ...   0   0   0]
 [  2  22  13 ...   0   0   0]
 [  2   8  20 ...   0   0   0]
 ...
 [  2  10  82 ...   0   0   0]
 [  2  24  25 ...   0   0   0]
 [  2 123 120 ...   0   0   0]]
2022-05-15 22:11:50,498 [DEBUG] kashgari - predict output shape (6451, 9)
2022-05-15 22:11:50,509 [DEBUG] kashgari - predict output argmax: [1 6 1 ... 0 0 1]


              precision    recall  f1-score   support

           1     0.5171    0.6373    0.5709      2134
           2     0.3815    0.4565    0.4157      1104
           3     0.4496    0.4444    0.4470      1305
           4     0.3874    0.2696    0.3179       549
           5     0.1795    0.0449    0.0718       156
           6     0.2558    0.1226    0.1657       359
           7     0.6192    0.5608    0.5885       403
           8     0.0000    0.0000    0.0000        72
           9     0.6440    0.4363    0.5202       369

    accuracy                         0.4697      6451
   macro avg     0.3816    0.3303    0.3442      6451
weighted avg     0.4544    0.4697    0.4550      6451

{'detail': {'1': {'precision': 0.5171102661596958, 'recall': 0.6373008434864105, 'f1-score': 0.5709487825356843, 'support': 2134}, '2': {'precision': 0.38152914458743376, 'recall': 0.45652173913043476, 'f1-score': 0.4156701030927835, 'support': 1104}, '3': {'precision': 0.4496124031007752, 'rec

In [14]:
class MY_CNN_BILSTM_Model(ABCClassificationModel):
    @classmethod
    def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        return {
            'bilstm_0': {
                'units': 64,
                'return_sequences': True
            },
            'conv_0': {
                'filters': 32,
                'kernel_size': 5,
                'kernel_initializer': 'normal',
                'padding': 'valid',
                'activation': 'relu',
                'strides': 1
            },
            'concat': {
                'axis': 1
            },
            'dropout': {
                'rate': 0.5
            },
            'activation_layer': {
                'activation': 'softmax'
            },
        }

    def build_model_arc(self):
        output_dim = len(self.label_processor.vocab2idx)
        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # One 1D Convolutional layer and one BiLSTM layer
        layers_rcnn_seq = []
        layers_rcnn_seq.append(L.Conv1D(**config['conv_0']))
        layers_rcnn_seq.append(L.Bidirectional(L.LSTM(**config['bilstm_0'])))

        # Max Pooling
        layers_sensor = []
        layers_sensor.append(L.GlobalMaxPooling1D())
        layer_concat = L.Concatenate(**config['concat'])

        # Two fully connected layers
        layers_full_connect = []
        layers_full_connect.append(L.Dropout(**config['dropout']))
        layers_full_connect.append(L.Dense(output_dim, **config['activation_layer']))

        tensor = embed_model.output
        for layer in layers_rcnn_seq:
            tensor = layer(tensor)

        tensor_output = layers_sensor[0](tensor)

        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = keras.Model(embed_model.inputs, tensor_output)


In [15]:
# Run Model 
Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)
embed = BareEmbedding(embedding_size=128, processor=kashgari.processors.class_processor, sequence_length=50)
tf_board = TensorBoard(log_dir='tf_dir/cnn_bilstm_model',
                       histogram_freq=5,
                       update_freq='batch')
K.clear_session()
model = MY_CNN_BILSTM_Model(embed)
model.fit(Xtr, ytr, Xts, yts,callbacks=[tf_board],epochs=30,batch_size=32)
report = model.evaluate(Xts, yts)
print(report)

2022-05-15 22:15:36,950 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 22:15:36,951 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 22:15:36,952 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 22:15:36,954 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 22:15:36,956 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 22:15:36,958 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472


Preparing text vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 176013.17it/s]
Preparing text vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 175209.68it/s]
2022-05-15 22:15:38,005 [DEBUG] kashgari - --- Build vocab dict finished, Total: 2429 ---
2022-05-15 22:15:38,008 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '的', '是', '一', '们', '我', '个']
Preparing classification label vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 1071924.07it/s]
Preparing classification label vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 968516.85it/s]
Calculating sequence length: 100%|██████████| 13096/13096 [00:00<00:00, 1354355.73it/s]
Calculating sequence length: 100%|██████████| 6451/6451 [00:00<00:00, 1301151.96it/s]
2022-05-15 22:15:38,099 [DEBUG] kashgari - Calculated sequence length = 59
2022-05-15 22:15:38,573 [DEBUG] kashgari - Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape  

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-05-15 22:19:02,017 [DEBUG] kashgari - predict input shape (6451, 267) x: 
[[  2  28  27 ...   0   0   0]
 [  2  22  13 ...   0   0   0]
 [  2  31 850 ...   0   0   0]
 ...
 [  2  38  76 ...   0   0   0]
 [  2  26  40 ...   0   0   0]
 [  2  10  82 ...   0   0   0]]
2022-05-15 22:19:04,624 [DEBUG] kashgari - predict output shape (6451, 9)
2022-05-15 22:19:04,633 [DEBUG] kashgari - predict output argmax: [5 0 4 ... 3 0 0]


              precision    recall  f1-score   support

           1     0.5381    0.5527    0.5453      2106
           2     0.4051    0.4262    0.4154      1166
           3     0.4178    0.4408    0.4290      1275
           4     0.2878    0.3074    0.2973       514
           5     0.2520    0.1963    0.2207       163
           6     0.2242    0.1615    0.1878       390
           7     0.6050    0.5688    0.5863       385
           8     0.0909    0.0405    0.0561        74
           9     0.5769    0.5556    0.5660       378

    accuracy                         0.4508      6451
   macro avg     0.3775    0.3611    0.3671      6451
weighted avg     0.4453    0.4508    0.4473      6451

{'detail': {'1': {'precision': 0.5381414701803051, 'recall': 0.5527065527065527, 'f1-score': 0.5453267744202389, 'support': 2106}, '2': {'precision': 0.4050529747351263, 'recall': 0.4262435677530017, 'f1-score': 0.4153781863769327, 'support': 1166}, '3': {'precision': 0.4178438661710037, 'recal

In [16]:
class MY_Double_BILSTM(ABCClassificationModel):
    @classmethod
    def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        return {
            'spatial_dropout': {
                'rate': 0.2
            },
            'bilstm_0': {
                'units': 64,
                'return_sequences': True
            },
            'bilstm_1': {
                'units': 64,
                'return_sequences': True
            },
            'conv_0': {
                'filters': 32,
                'kernel_size': 5,
                'kernel_initializer': 'normal',
                'padding': 'valid',
                'activation': 'relu',
                'strides': 1
            },
            'concat': {
                'axis': 1
            },
            'dropout': {
                'rate': 0.5
            },
            'activation_layer': {
                'activation': 'softmax'
            },
        }

    def build_model_arc(self):
        output_dim = len(self.label_processor.vocab2idx)
        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # Two BiLSTM Layers
        layers_rcnn_seq = []
        layers_rcnn_seq.append(L.Bidirectional(L.LSTM(**config['bilstm_0'])))
        layers_rcnn_seq.append(L.Bidirectional(L.LSTM(**config['bilstm_1'])))

        # Max Pooling
        layers_sensor = []
        layers_sensor.append(L.GlobalMaxPooling1D())
        # layers_sensor.append(L.GlobalAveragePooling1D())
        layer_concat = L.Concatenate(**config['concat'])

        # Two fully connected layers
        layers_full_connect = []
        layers_full_connect.append(L.Dropout(**config['dropout']))
        layers_full_connect.append(L.Dense(output_dim, **config['activation_layer']))

        tensor = embed_model.output
        for layer in layers_rcnn_seq:
            tensor = layer(tensor)

        tensor_output = layers_sensor[0](tensor)

        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = keras.Model(embed_model.inputs, tensor_output)


In [17]:
Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)
embed = BareEmbedding(embedding_size=128, processor=kashgari.processors.class_processor, sequence_length=50)
tf_board = TensorBoard(log_dir='tf_dir/double_bilstm_model',
                       histogram_freq=5,
                       update_freq='batch')
K.clear_session()
model = MY_Double_BILSTM(embed)
model.fit(Xtr, ytr, Xts, yts,callbacks=[tf_board],epochs=30,batch_size=32)
report = model.evaluate(Xts, yts)
print(report)

2022-05-15 22:26:13,627 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 22:26:13,628 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 22:26:13,631 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 22:26:13,632 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 22:26:13,634 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 22:26:13,641 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472


Preparing text vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 172205.64it/s]
Preparing text vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 187850.73it/s]
2022-05-15 22:26:14,697 [DEBUG] kashgari - --- Build vocab dict finished, Total: 2429 ---
2022-05-15 22:26:14,701 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '的', '是', '一', '们', '我', '个']
Preparing classification label vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 1072971.02it/s]
Preparing classification label vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 1211328.97it/s]
Calculating sequence length: 100%|██████████| 13096/13096 [00:00<00:00, 1114939.41it/s]
Calculating sequence length: 100%|██████████| 6451/6451 [00:00<00:00, 1179590.86it/s]
2022-05-15 22:26:14,794 [DEBUG] kashgari - Calculated sequence length = 59
2022-05-15 22:26:18,592 [DEBUG] kashgari - Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-05-15 22:33:46,782 [DEBUG] kashgari - predict input shape (6451, 364) x: 
[[  2  23  18 ...   0   0   0]
 [  2  99  35 ...   0   0   0]
 [  2  23 790 ...   0   0   0]
 ...
 [  2  85  34 ...   0   0   0]
 [  2  22   8 ...   0   0   0]
 [  2 100  14 ...   0   0   0]]
2022-05-15 22:33:52,727 [DEBUG] kashgari - predict output shape (6451, 9)
2022-05-15 22:33:52,734 [DEBUG] kashgari - predict output argmax: [4 5 0 ... 3 0 2]


              precision    recall  f1-score   support

           1     0.5125    0.5532    0.5321      2077
           2     0.4262    0.4063    0.4160      1179
           3     0.4590    0.4355    0.4469      1325
           4     0.2909    0.2903    0.2906       527
           5     0.2041    0.1399    0.1660       143
           6     0.1598    0.1436    0.1513       376
           7     0.5443    0.5801    0.5616       381
           8     0.1294    0.1528    0.1401        72
           9     0.4933    0.4987    0.4960       371

    accuracy                         0.4416      6451
   macro avg     0.3577    0.3556    0.3556      6451
weighted avg     0.4367    0.4416    0.4386      6451

{'detail': {'1': {'precision': 0.5124888492417484, 'recall': 0.5532017332691381, 'f1-score': 0.5320676082426486, 'support': 2077}, '2': {'precision': 0.4261565836298932, 'recall': 0.40627650551314676, 'f1-score': 0.415979157620495, 'support': 1179}, '3': {'precision': 0.4590294351630867, 'recal

In [18]:
class My_Double_CNN(ABCClassificationModel):
    @classmethod
    def default_hyper_parameters(cls) -> Dict[str, Dict[str, Any]]:
        return {
            'spatial_dropout': {
                'rate': 0.2
            },
            'bilstm_0': {
                'units': 64,
                'return_sequences': True
            },
            'conv_0': {
                'filters': 128,
                'kernel_size': 5,
                'kernel_initializer': 'normal',
                'padding': 'valid',
                'activation': 'relu',
                'strides': 1
            },
            'conv_1': {
                'filters': 32,
                'kernel_size': 5,
                'kernel_initializer': 'normal',
                'padding': 'valid',
                'activation': 'relu',
                'strides': 1
            },
            'concat': {
                'axis': 1
            },
            'dropout': {
                'rate': 0.5
            },
            'activation_layer': {
                'activation': 'softmax'
            },
        }

    def build_model_arc(self):
        output_dim = len(self.label_processor.vocab2idx)
        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # Two Convolution Layers
        layers_rcnn_seq = []
        layers_rcnn_seq.append(L.Conv1D(**config['conv_0']))
        layers_rcnn_seq.append(L.Conv1D(**config['conv_1']))

        # Max Pooling
        layers_sensor = []
        layers_sensor.append(L.GlobalMaxPooling1D())
        layer_concat = L.Concatenate(**config['concat'])

        # Two fully connected layers
        layers_full_connect = []
        layers_full_connect.append(L.Dropout(**config['dropout']))
        layers_full_connect.append(L.Dense(output_dim, **config['activation_layer']))

        tensor = embed_model.output
        for layer in layers_rcnn_seq:
            tensor = layer(tensor)

        tensor_output = layers_sensor[0](tensor)

        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = keras.Model(embed_model.inputs, tensor_output)


In [20]:
Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)
embed = BareEmbedding(embedding_size=128, processor=kashgari.processors.class_processor, sequence_length=50)
tf_board = TensorBoard(log_dir='tf_dir/double_cnn_model',
                       histogram_freq=5,
                       update_freq='batch')
K.clear_session()
model = My_Double_CNN(embed)
model.fit(Xtr, ytr, Xts, yts,callbacks=[tf_board],epochs=30,batch_size=32)
report = model.evaluate(Xts, yts)
print(report)

2022-05-15 22:40:36,819 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 22:40:36,822 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 22:40:36,824 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 22:40:36,825 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 22:40:36,827 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 22:40:36,829 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472


Preparing text vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 101525.05it/s]
Preparing text vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 97379.42it/s]
2022-05-15 22:40:39,231 [DEBUG] kashgari - --- Build vocab dict finished, Total: 2429 ---
2022-05-15 22:40:39,232 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '的', '是', '一', '们', '我', '个']
Preparing classification label vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 702447.76it/s]
Preparing classification label vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 522223.71it/s]
Calculating sequence length: 100%|██████████| 13096/13096 [00:00<00:00, 308858.35it/s]
Calculating sequence length: 100%|██████████| 6451/6451 [00:00<00:00, 234405.74it/s]
2022-05-15 22:40:39,415 [DEBUG] kashgari - Calculated sequence length = 59
2022-05-15 22:40:39,601 [DEBUG] kashgari - Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape      

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-05-15 22:42:31,398 [DEBUG] kashgari - predict input shape (6451, 267) x: 
[[  2 402 972 ...   0   0   0]
 [  2   8   7 ...   0   0   0]
 [  2 148  15 ...   0   0   0]
 ...
 [  2  50  35 ...   0   0   0]
 [  2 213   4 ...   0   0   0]
 [  2 141  65 ...   0   0   0]]
2022-05-15 22:42:31,906 [DEBUG] kashgari - predict output shape (6451, 9)
2022-05-15 22:42:31,912 [DEBUG] kashgari - predict output argmax: [4 2 2 ... 1 1 3]


              precision    recall  f1-score   support

           1     0.5197    0.5519    0.5353      2129
           2     0.3620    0.4700    0.4090      1100
           3     0.4167    0.4066    0.4116      1279
           4     0.3357    0.2541    0.2893       551
           5     0.2174    0.0602    0.0943       166
           6     0.1854    0.1584    0.1709       385
           7     0.5895    0.5544    0.5714       386
           8     0.0847    0.0676    0.0752        74
           9     0.6133    0.4829    0.5404       381

    accuracy                         0.4381      6451
   macro avg     0.3694    0.3340    0.3441      6451
weighted avg     0.4337    0.4381    0.4323      6451

{'detail': {'1': {'precision': 0.5196815568332597, 'recall': 0.5519023015500235, 'f1-score': 0.5353075170842826, 'support': 2129}, '2': {'precision': 0.36204481792717086, 'recall': 0.47, 'f1-score': 0.40901898734177206, 'support': 1100}, '3': {'precision': 0.4166666666666667, 'recall': 0.406567

In [21]:
# Run Double-BiLSTM model with Roberta pre-train model
Xtr, ytr, Xts, yts, bert_embedding = load_data(file_path, bert_path)
tf_board = TensorBoard(log_dir='tf_dir/double_bilstm_model',
                       histogram_freq=5,
                       update_freq='batch')
K.clear_session()
model = MY_Double_BILSTM(bert_embedding)
model.fit(Xtr, ytr, Xts, yts,callbacks=[tf_board],epochs=30,batch_size=32)
report = model.evaluate(Xts, yts)
print(report)

2022-05-15 22:48:37,370 [DEBUG] kashgari - ------------------------------------------------
2022-05-15 22:48:37,371 [DEBUG] kashgari - Loaded transformer model's vocab
2022-05-15 22:48:37,372 [DEBUG] kashgari - config_path       : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_config.json
2022-05-15 22:48:37,374 [DEBUG] kashgari - vocab_path      : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/vocab.txt
2022-05-15 22:48:37,376 [DEBUG] kashgari - checkpoint_path : /content/drive/MyDrive/ColabNotebooks/roberta_zh/resources/bert_model.ckpt
2022-05-15 22:48:37,377 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '

Project Data Summary:
Train	 13096
Test	 6451
Label	 9
9	 1095
1	 6426
7	 1185
2	 3493
3	 3854
4	 1667
8	 226
6	 1129
5	 472


Preparing text vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 167620.10it/s]
Preparing text vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 168621.15it/s]
2022-05-15 22:48:38,464 [DEBUG] kashgari - --- Build vocab dict finished, Total: 2429 ---
2022-05-15 22:48:38,464 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '的', '是', '一', '们', '我', '个']
Preparing classification label vocab dict: 100%|██████████| 13096/13096 [00:00<00:00, 1287077.47it/s]
Preparing classification label vocab dict: 100%|██████████| 6451/6451 [00:00<00:00, 1152803.68it/s]
Calculating sequence length: 100%|██████████| 13096/13096 [00:00<00:00, 515050.64it/s]
Calculating sequence length: 100%|██████████| 6451/6451 [00:00<00:00, 814713.65it/s]
2022-05-15 22:48:42,420 [DEBUG] kashgari - Calculated sequence length = 59
2022-05-15 22:48:45,666 [DEBUG] kashgari - Model: "functional_3"
__________________________________________________________________________________________________
Layer (type

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2022-05-15 23:37:34,997 [DEBUG] kashgari - predict input shape (2, 6451, 364) x: 
(array([[ 101, 3217, 1921, ...,    0,    0,    0],
       [ 101, 1071, 2141, ...,    0,    0,    0],
       [ 101, 1086, 6435, ...,    0,    0,    0],
       ...,
       [ 101,  671,  831, ...,    0,    0,    0],
       [ 101, 5018,  676, ...,    0,    0,    0],
       [ 101, 1765, 4413, ...,    0,    0,    0]], dtype=int32), array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32))
2022-05-15 23:40:37,891 [DEBUG] kashgari - predict output shape (6451, 9)
2022-05-15 23:40:37,901 [DEBUG] kashgari - predict output argmax: [2 3 3 ... 4 0 2]


              precision    recall  f1-score   support

           1     0.5411    0.6329    0.5834      2100
           2     0.4450    0.4936    0.4681      1165
           3     0.5266    0.4269    0.4715      1300
           4     0.3683    0.3612    0.3647       515
           5     0.2929    0.1986    0.2367       146
           6     0.2626    0.2114    0.2342       369
           7     0.6220    0.5050    0.5574       404
           8     0.2308    0.1364    0.1714        88
           9     0.5978    0.6044    0.6011       364

    accuracy                         0.4942      6451
   macro avg     0.4319    0.3967    0.4098      6451
weighted avg     0.4895    0.4942    0.4885      6451

{'detail': {'1': {'precision': 0.5411237785016286, 'recall': 0.6328571428571429, 'f1-score': 0.5834064969271291, 'support': 2100}, '2': {'precision': 0.44504643962848295, 'recall': 0.49356223175965663, 'f1-score': 0.468050468050468, 'support': 1165}, '3': {'precision': 0.5265654648956357, 'reca