<a href="https://colab.research.google.com/github/hhhhh0102/financial_software_project/blob/main/ner_model/make_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
import tensorflow as tf
import numpy as np
import os
from transformers import *
from keras.preprocessing.sequence import pad_sequences
from tokenizer import KoBertTokenizer
from sklearn.model_selection import train_test_split
from Preprocess import preprocess


class make_model:
    """Build, train, and reload a KoBERT-based token-classification (NER) model.

    Typical use: ``make_input()`` to prepare the padded arrays, then
    ``train_model()`` to fit on a Colab TPU and save the result, then
    ``load_model()`` to read the saved model back.
    """

    # File the trained Keras model is written to and reloaded from.
    MODEL_PATH = "my_model.h5"

    def __init__(self):
        """Set hyper-parameters, create the preprocessor, and clear all data slots."""
        self.max_len = 88  # fixed sequence length for padding/truncation and model inputs
        self.bs = 32  # batch size passed to fit()
        self.pr = preprocess()
        # Filled in by make_input().
        self.tr_inputs = None
        self.val_inputs = None
        self.tr_tags = None
        self.val_tags = None
        self.tr_masks = None
        self.val_masks = None
        # TPU cluster resolver; set the first time the TPU is connected.
        self.resolver = None

    def _connect_tpu(self):
        """Connect to the Colab TPU, initialize it, and remember the resolver.

        Returns:
            The TPUClusterResolver that was created (also stored on
            ``self.resolver``).

        Raises:
            KeyError: if the ``COLAB_TPU_ADDR`` environment variable is unset.
        """
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        self.resolver = resolver
        return resolver

    def make_input(self):
        """Tokenize the preprocessed corpus and build train/validation arrays.

        Runs the preprocessor, tokenizes every sentence while keeping labels
        aligned to the produced tokens, pads/truncates everything to
        ``self.max_len``, derives attention masks, and stores a 90/10
        train/validation split on the instance.
        """
        self.pr.preprocessing()
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        # Look the padding id up once instead of once per token.
        pad_id = tokenizer.convert_tokens_to_ids("[PAD]")

        token_seqs = []
        label_seqs = []
        for sent, labs in zip(self.pr.sentences, self.pr.targets):
            pair = tokenizer.tokenize_and_preserve_labels(sent, labs)
            token_seqs.append(pair[0])
            label_seqs.append(pair[1])

        input_ids = pad_sequences(
            [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_seqs],
            maxlen=self.max_len, dtype="int", value=pad_id,
            truncating="post", padding="post")

        tags = pad_sequences(
            label_seqs, maxlen=self.max_len, value=self.pr.label_dict["[PAD]"],
            padding='post', dtype='int', truncating='post')

        # 1 where there is a real token, 0 where the position holds padding.
        attention_masks = (input_ids != pad_id).astype(int)

        # Split all three arrays in a single call so that rows are guaranteed
        # to stay aligned (the result is the same permutation the seed gave
        # when inputs/tags and masks were split by two separate calls).
        (self.tr_inputs, self.val_inputs,
         self.tr_tags, self.val_tags,
         self.tr_masks, self.val_masks) = train_test_split(
            input_ids, tags, attention_masks,
            random_state=2018, test_size=0.1)

    def create_model(self):
        """Build and compile the KoBERT + softmax tagging model.

        The TPU is connected here only when no resolver exists yet, so that a
        call made from inside ``train_model()``'s strategy scope does not
        re-initialize a TPU system that is already in use.

        Returns:
            A compiled ``tf.keras.Model`` taking ``[token_ids, attention_mask]``
            and producing per-token class probabilities.
        """
        if self.resolver is None:
            self._connect_tpu()

        # One output unit per label; presumably 30 for the current label set,
        # which is the value that used to be hard-coded here.
        num_labels = len(self.pr.label_dict)
        bert = TFBertModel.from_pretrained(
            "monologg/kobert", from_pt=True, num_labels=num_labels,
            output_attentions=False, output_hidden_states=False)

        token_inputs = tf.keras.layers.Input(
            (self.max_len,), dtype=tf.int32, name='input_word_ids')  # token ids
        mask_inputs = tf.keras.layers.Input(
            (self.max_len,), dtype=tf.int32, name='input_masks')  # attention mask

        # Element 0 of the BERT output is the per-token sequence output.
        sequence_output = bert.bert([token_inputs, mask_inputs])[0]
        # shape: (batch_size, max_len, num_labels)
        tag_probs = tf.keras.layers.Dense(
            num_labels, activation='softmax')(sequence_output)

        nr_model = tf.keras.Model([token_inputs, mask_inputs], tag_probs)
        nr_model.compile(
            # `learning_rate` is the supported name; `lr` is a deprecated alias.
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.00002),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=['sparse_categorical_accuracy'])
        nr_model.summary()
        return nr_model

    def train_model(self):
        """Train the model on the Colab TPU and save it to ``MODEL_PATH``.

        Raises:
            RuntimeError: if ``make_input()`` has not populated the training
                and validation arrays yet.
            KeyError: if the ``COLAB_TPU_ADDR`` environment variable is unset.
        """
        required = (self.tr_inputs, self.tr_masks, self.tr_tags,
                    self.val_inputs, self.val_masks, self.val_tags)
        if any(arr is None for arr in required):
            raise RuntimeError("make_input() must be called before train_model()")

        strategy = tf.distribute.experimental.TPUStrategy(self._connect_tpu())
        # Model creation and training must happen inside the strategy scope
        # for the TPU to be used.
        with strategy.scope():
            # Use this instance rather than a notebook-level global.
            nr_model = self.create_model()
            nr_model.fit(
                [self.tr_inputs, self.tr_masks], self.tr_tags,
                validation_data=([self.val_inputs, self.val_masks], self.val_tags),
                epochs=20, shuffle=False, batch_size=self.bs)
            nr_model.save(self.MODEL_PATH)

    def load_model(self):
        """Load and return the Keras model previously saved to ``MODEL_PATH``."""
        return tf.keras.models.load_model(self.MODEL_PATH)


In [None]:
# Create the pipeline object; its constructor also instantiates preprocess().
model = make_model()

In [None]:
# Preprocess, tokenize, pad to max_len, and store the train/validation split on `model`.
model.make_input()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=371391.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=77779.0, style=ProgressStyle(descriptio…




In [None]:
# Train for 20 epochs on the Colab TPU and save the result to "my_model.h5".
model.train_model()

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.119.55.250:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.119.55.250:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)






INFO:tensorflow:Initializing the TPU system: grpc://10.119.55.250:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.119.55.250:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=426.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=368792146.0, style=ProgressStyle(descri…




All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 88)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 88)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 92186880    input_word_ids[0][0]             
                                                                 input_masks[0][0]                
__________________________________________________________________________________________________
dense (Dense)                   (None, 88, 30)       23070       bert[0][0]                   







Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Reload the trained Keras model from "my_model.h5".
nr_model = model.load_model()





In [None]:
 y_predicted = nr_model.predict([model.val_inputs, model.val_masks])

f_label = [i for i, j in model.pr.label_dict.items()]
model.val_tags_l = [model.pr.index_to_ner[x] for x in np.ravel(model.val_tags).astype(int).tolist()]
y_predicted_l = [model.pr.index_to_ner[x] for x in np.ravel(np.argmax(y_predicted, axis=2)).astype(int).tolist()]
f_label.remove("[PAD]")

In [None]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Per-label precision/recall/F1 on the validation tokens, restricted to f_label.
print(classification_report(y_true=model.val_tags_l, y_pred=y_predicted_l, labels=f_label))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       PER_B       0.85      0.91      0.88     10932
       DAT_B       0.89      0.86      0.88      3902
           -       0.96      0.96      0.96    128371
       ORG_B       0.86      0.87      0.87     11584
       CVL_B       0.88      0.84      0.86     14292
       NUM_B       0.94      0.93      0.94     10286
       LOC_B       0.87      0.83      0.85      5364
       EVT_B       0.80      0.83      0.82      2863
       TRM_B       0.84      0.87      0.85      6896
       TRM_I       0.56      0.46      0.50       817
       EVT_I       0.84      0.73      0.78      1411
       PER_I       0.73      0.82      0.77      1380
       CVL_I       0.61      0.47      0.53       822
       NUM_I       0.78      0.77      0.78      1408
       TIM_B       0.85      0.90      0.88       578
       TIM_I       0.94      0.88      0.91       204
       ORG_I       0.78      0.67      0.72      1164
       DAT_I       0.86    