**Mount Google Drive**

In [1]:
# Mount Google Drive so the files referenced later under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Install and import libraries**

In [2]:
!pip install tensorflow_addons
!pip install gensim==4.1.2

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/612.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/612.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m604.2/612.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.21.0 typeguard-2.13.3
Collecting gensim==4.1.2
  Downloading gensim-4.1.2.tar.gz (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow_addons.text import crf_log_likelihood, crf_decode
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Input, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import initializers, Sequential
import tensorflow.keras.optimizers as Optimizer


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



**Set up model**

In [4]:
class CRF(L.Layer):
    """Linear-chain CRF output layer built on `tensorflow_addons.text`.

    In the training phase `call` returns its input potentials unchanged so that
    `loss` can score them; otherwise it returns the one-hot encoded Viterbi
    decoding. Compile the model with `layer.loss` and `layer.accuracy`.
    """

    def __init__(self,
                 output_dim,
                 sparse_target=True,
                 **kwargs):
        """
        Args:
            output_dim (int): the number of labels to tag each temporal input.
            sparse_target (bool): when True, `y_true` is reduced with an argmax
                over its last axis before being scored (see `loss` and
                `accuracy`); when False, `y_true` is used as given.
        Input shape:
            (batch_size, sentence length, output_dim)
        Output shape:
            (batch_size, sentence length, output_dim)
        """
        super(CRF, self).__init__(**kwargs)
        self.output_dim = int(output_dim)
        self.sparse_target = sparse_target
        self.input_spec = L.InputSpec(min_ndim=3)
        self.supports_masking = False
        # Set by `call`; read later by the loss closure.
        self.sequence_lengths = None
        # Created in `build`.
        self.transitions = None

    def build(self, input_shape):
        """Validate the input shape and create the (output_dim, output_dim) transition weights.

        Raises:
            ValueError: if the last input dimension is undefined or differs from `output_dim`.
        """
        assert len(input_shape) == 3
        f_shape = tf.TensorShape(input_shape)
        input_spec = L.InputSpec(min_ndim=3, axes={-1: f_shape[-1]})

        if f_shape[-1] is None:
            raise ValueError('The last dimension of the inputs to `CRF` '
                             'should be defined. Found `None`.')
        if f_shape[-1] != self.output_dim:
            raise ValueError('The last dimension of the input shape must be equal to output'
                             ' shape. Use a linear layer if needed.')
        self.input_spec = input_spec
        self.transitions = self.add_weight(name='transitions',
                                           shape=[self.output_dim, self.output_dim],
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # Just pass the received mask from previous layer, to the next layer or
        # manipulate it if this layer changes the shape of the input
        return mask

    def call(self, inputs, sequence_lengths=None, training=None, **kwargs):
        """Return the raw potentials in training phase, one-hot Viterbi tags otherwise.

        Args:
            inputs: rank-3 potentials whose last dimension equals `output_dim`.
            sequence_lengths: optional int32 tensor of shape (batch, 1); when
                omitted every sequence is treated as spanning the full time axis.
            training: optional explicit training flag; when None the phase is
                resolved by Keras.
        """
        sequences = tf.convert_to_tensor(inputs, dtype=self.dtype)
        if sequence_lengths is not None:
            assert len(sequence_lengths.shape) == 2
            assert tf.convert_to_tensor(sequence_lengths).dtype == 'int32'
            seq_len_shape = tf.convert_to_tensor(sequence_lengths).get_shape().as_list()
            assert seq_len_shape[1] == 1
            self.sequence_lengths = K.flatten(sequence_lengths)
        else:
            self.sequence_lengths = tf.ones(tf.shape(inputs)[0], dtype=tf.int32) * (
                tf.shape(inputs)[1]
            )

        viterbi_sequence, _ = crf_decode(sequences,
                                         self.transitions,
                                         self.sequence_lengths)
        output = K.one_hot(viterbi_sequence, self.output_dim)
        # Forward the caller's `training` flag instead of silently ignoring it;
        # with training=None this still falls back to the Keras learning phase.
        return K.in_train_phase(sequences, output, training=training)

    @property
    def loss(self):
        """Loss function: mean negative CRF log-likelihood.

        The closure reads `self.sequence_lengths`, so the layer must have been
        called before the loss is evaluated.
        """
        def crf_loss(y_true, y_pred):
            y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype)
            # The returned transition params are discarded on purpose: rebinding
            # `self.transitions` here would replace the layer's weight attribute
            # with whatever object the helper hands back.
            log_likelihood, _ = crf_log_likelihood(
                y_pred,
                tf.cast(K.argmax(y_true), dtype=tf.int32) if self.sparse_target else y_true,
                self.sequence_lengths,
                transition_params=self.transitions,
            )
            return tf.reduce_mean(-log_likelihood)
        return crf_loss

    @property
    def accuracy(self):
        """Metric function: token accuracy of the Viterbi decoding of `y_pred`."""
        def viterbi_accuracy(y_true, y_pred):
            # -1e10 to avoid zero at sum(mask)
            mask = K.cast(
                K.all(K.greater(y_pred, -1e10), axis=2), K.floatx())
            shape = tf.shape(y_pred)
            # Decode over the full time axis for every sample.
            sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1])
            y_pred, _ = crf_decode(y_pred, self.transitions, sequence_lengths)
            if self.sparse_target:
                y_true = K.argmax(y_true, 2)
            y_pred = K.cast(y_pred, 'int32')
            y_true = K.cast(y_true, 'int32')
            corrects = K.cast(K.equal(y_true, y_pred), K.floatx())
            return K.sum(corrects * mask) / K.sum(mask)
        return viterbi_accuracy

    def compute_output_shape(self, input_shape):
        """Return (batch, time, output_dim) for a rank-3 input shape."""
        tf.TensorShape(input_shape).assert_has_rank(3)
        return input_shape[:2] + (self.output_dim,)

    def get_config(self):
        """Return the constructor arguments needed to re-create this layer.

        Only arguments accepted by `__init__` are included, so that
        `CRF.from_config(layer.get_config())` works. The transition matrix is a
        layer weight and is therefore persisted with the weights, not the config
        (evaluating it here also failed on an unbuilt layer).
        """
        config = {
            'output_dim': self.output_dim,
            'sparse_target': self.sparse_target,
        }
        base_config = super(CRF, self).get_config()
        return dict(base_config, **config)

**Word segmentation module (VnCoreNLP)**

In [5]:
!pip install py_vncorenlp

Collecting py_vncorenlp
  Downloading py_vncorenlp-0.1.4.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius (from py_vncorenlp)
  Downloading pyjnius-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: py_vncorenlp
  Building wheel for py_vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for py_vncorenlp: filename=py_vncorenlp-0.1.4-py3-none-any.whl size=4306 sha256=1fa23dda20a36e3a5bfc1a45ff79b81579d9f4e084207f43ed31568e83164636
  Stored in directory: /root/.cache/pip/wheels/d5/d9/bf/62632cdb007c702a0664091e92a0bb1f18a2fcecbe962d9827
Successfully built py_vncorenlp
Installing collected packages: pyjnius, py_vncorenlp
Successfully installed py_vncorenlp-0.1.4 pyjnius-1.5.0


In [12]:
!rm -r /content/models

In [None]:
import py_vncorenlp

# Download the VnCoreNLP files into /content/; the same save_dir is passed when the segmenter is loaded.
py_vncorenlp.download_model(save_dir='/content/')

In [14]:
# Load VnCoreNLP with only the "wseg" annotator; infer_ner calls rdrsegmenter.word_segment on the input text.
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/content/')

**Load tokenizers and embeddings**

In [15]:
from gensim.models import FastText
import pickle
def _load_pickle(path):
    """Unpickle and return the object stored at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


trained = True

model_fasttext = FastText.load('/content/drive/MyDrive/CRF/Data/Fasttext/model_fasttext_gensim.bin')
tokenizer = _load_pickle('/content/drive/MyDrive/CRF/tokenizer.pickle')

word_index = tokenizer.word_index
emb_mean, emb_std = -0.5, 0.5
embed_size = 100  # Size of the vector that represents one word
nb_words = len(word_index) + 1

# Start from random vectors, then overwrite the rows of words known to FastText.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i < nb_words and word in model_fasttext.wv.key_to_index:
        embedding_matrix[i] = model_fasttext.wv.get_vector(word)

tag_tokenizer = _load_pickle('/content/drive/MyDrive/CRF/tag_tokenizer.pickle')

tag_index = tag_tokenizer.word_index
tag_size = len(tag_index) + 1

In [16]:
def create_model(embeddings_matrix, vocab_size, embedding_dim, max_length):
    """Build and compile the Embedding -> BiLSTM -> Dense -> CRF tagging model.

    Args:
        embeddings_matrix: initial weights for the Embedding layer; expected to be
            (vocab_size, embedding_dim) to match the layer it initialises.
        vocab_size (int): `input_dim` of the Embedding layer.
        embedding_dim (int): `output_dim` of the Embedding layer.
        max_length (int): fixed input sequence length; also used as the LSTM unit count.

    Returns:
        The compiled Keras model, using the CRF layer's loss and accuracy.
    """
    # NOTE(review): the CRF/output width is len(tag_index) (read from the notebook
    # globals), not tag_size; it must stay as is to match the saved weights.
    num_tags = len(tag_index)
    crf = CRF(num_tags, sparse_target=True)

    # Named `input_text` so the builtin `input` is not shadowed.
    input_text = Input(shape=(max_length,), dtype='int32', name='input_text')
    # Use the matrix passed by the caller; the previous body ignored the
    # parameter and silently read the global `embedding_matrix` instead.
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                  weights=[embeddings_matrix])(input_text)
    x = Bidirectional(LSTM(units=max_length, return_sequences=True,
                           recurrent_dropout=0.01))(x)
    x = TimeDistributed(Dense(128, activation='relu', kernel_initializer='he_normal'))(x)
    x = BatchNormalization()(x)
    x = Dropout(rate=0.6)(x)
    x = Dense(num_tags, activation='relu', kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Dropout(rate=0.1)(x)
    output = crf(x)

    model_final = Model(input_text, output)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model_final.compile(optimizer=Optimizer.Adam(learning_rate=0.001), loss=crf.loss,
                        metrics=[crf.accuracy])

    return model_final

# Rebuild the architecture with the sizes computed above, then restore the saved weights.
# max_length=100 matches the maxlen=100 used when padding inputs in infer_ner.
model_ner = create_model(embedding_matrix, nb_words, embed_size, max_length = 100)
model_ner.load_weights("/content/drive/MyDrive/CRF/Data/Fasttext/best_weight.hdf5")



In [17]:
def get_tags(sequences, tag_index):
    """Turn per-token score vectors into tag names.

    Args:
        sequences: iterable of sequences, each an iterable of per-token score vectors.
        tag_index (dict): maps a class index to its tag name.

    Returns:
        list[list]: for every sequence, the tag looked up for the argmax of each
        score vector, or None when that index is not in `tag_index`.
    """
    return [
        [tag_index.get(np.argmax(scores)) for scores in token_scores]
        for token_scores in sequences
    ]

def predict(model, tag_tokenizer, sent):
    """Run the tagger and return one list of tag names per input sequence.

    Args:
        model: object exposing `predict(sent)` that returns per-token class scores.
        tag_tokenizer: object whose `word_index` maps tag name -> class index.
        sent: batch of encoded sequences passed straight to `model.predict`.

    Returns:
        list[list[str]]: tags for each sequence, cut at the first position whose
        predicted class index has no tag name (get_tags yields None there).
    """
    index_to_tag = {i: t for t, i in tag_tokenizer.word_index.items()}
    pred = model.predict(sent)
    sequence_tags = get_tags(pred, index_to_tag)

    trimmed = []
    for tags in sequence_tags:
        try:
            cut = tags.index(None)
        except ValueError:
            # list.index only raises ValueError: no unknown class, keep everything.
            cut = len(tags)
        trimmed.append(tags[:cut])
    return trimmed

def match_pair_ner(text, ner):
    """Group space-separated tokens into entity strings using their B_/I_/O tags.

    Args:
        text (str): tokens separated by single spaces; underscores inside a token
            are turned back into spaces in the output.
        ner (list[str]): one tag per token: 'O', or a 'B_'/'I_' prefix followed by
            the entity label. Tokens without a tag (when `ner` is shorter than the
            token list) are ignored instead of raising IndexError.

    Returns:
        dict[str, list[str]]: entity label -> list of entity strings in order of appearance.
    """
    entities = {}

    def flush(label, words):
        # Store the span collected so far unless it is empty or outside any entity.
        if label is not None and label != 'O':
            entities.setdefault(label, []).append(words.replace('_', ' ').strip())

    current_label, current_words = None, ''
    previous_tag = None
    # zip stops at the shorter input, so a truncated tag list cannot index out of range.
    for token, tag in zip(text.split(' '), ner):
        starts_new_span = (
            tag == 'O'
            or 'B_' in tag
            or ('I_' in tag and previous_tag == 'O')
        )
        if starts_new_span:
            flush(current_label, current_words)
            current_words = token + ' '
        else:
            current_words += token + ' '
        current_label = tag if tag == 'O' else tag[2:]
        previous_tag = tag

    flush(current_label, current_words)
    return entities

**Infer**

In [20]:
# Prefixes that may be written directly in front of a month number ("tháng" = month).
month_list = ["t", "th", "thg", "tháng"]
# Prefixes that may be written directly in front of a quarter number ("quý" = quarter).
quy_list = ["q", "quý"]
# Month numbers 1..12 and quarter numbers 1..4.
number_list = list(range(1, 13))
num_quarter_list = list(range(1, 5))

def infer_ner(text):
    """Extract named entities from raw text.

    The text is normalised ('?' and '_' removed; '/', '\\' and '.' replaced by
    spaces; lower-cased), a space is inserted between a month/quarter prefix and
    the number glued to it, the result is word-segmented, tagged by `model_ner`
    and grouped with `match_pair_ner`.

    Args:
        text (str): raw input text.

    Returns:
        dict[str, list[str]]: entity label -> entity strings, merged over every
        sentence returned by the segmenter. Empty when nothing was segmented.
    """
    text = text.replace('?','').replace('/',' ').replace("\\"," ").replace('.'," ").replace("_","").lower()
    # Plain substring replacement: the prefix is split from the number wherever it occurs.
    for month_desc in month_list:
        for num in number_list:
            text = text.replace(month_desc + str(num), month_desc + " " + str(num))
    for quy in quy_list:
        for num in num_quarter_list:
            text = text.replace(quy + str(num), quy + " " + str(num))

    sentences = rdrsegmenter.word_segment(text)
    if not sentences:
        return {}

    # NOTE(review): if the tokenizer drops tokens (e.g. no OOV token configured),
    # sequences get shorter than the sentence and tags shift — confirm its settings.
    encoded = tokenizer.texts_to_sequences(sentences)
    # truncating='post' keeps the FIRST 100 tokens so tag i still belongs to word i;
    # the default ('pre') drops leading tokens and misaligns tags with words.
    encoded = pad_sequences(encoded, maxlen=100, padding='post', truncating='post')
    tags_per_sentence = predict(model_ner, tag_tokenizer, encoded)

    entities = {}
    # Use every segmented sentence, not just the first one.
    for sentence, tags in zip(sentences, tags_per_sentence):
        if not tags:
            continue
        # Never hand match_pair_ner more words than there are tags.
        words = sentence.split(' ')[:len(tags)]
        for label, spans in match_pair_ner(' '.join(words), tags).items():
            entities.setdefault(label, []).extend(spans)
    return entities

**gradio**

In [None]:
# NOTE(review): gradio is installed unpinned; pin the version this demo was run with to keep it reproducible.
!pip install gradio

In [24]:
import gradio as gr

# Text-in / text-out demo around infer_ner.
demo = gr.Interface(fn=infer_ner, inputs="text", outputs="text")

if __name__ == "__main__":
    # share=True exposes a public gradio.live URL; debug=True keeps the cell running to show errors and logs.
    demo.launch(share=True,debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://cad34f47d6de6777ba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://cad34f47d6de6777ba.gradio.live
