#####################################
##    뉴스 요약 트랜스포머 모델   ##
#####################################

0. 인코딩 타입 지정, 라이브러리 임포트, GPU 확인

In [43]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from keras.callbacks import Callback, ModelCheckpoint
from keras.models import load_model
import pickle

# Ask TensorFlow which physical GPU devices it can see and print the list,
# so the run records whether a GPU is available.
gpus = tf.config.list_physical_devices('GPU')
print("Available GPUs:", gpus)

Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


1. 전처리 데이터 로드

In [44]:
# Load the pickled article token sequences.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('D:/TJ_FInal_Project/KDJ/News_Summarization/Data/pickle/sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)

# Load the pickled summary token sequences.
# NOTE(review): the name `abs` shadows the built-in abs() from here on.
with open('D:/TJ_FInal_Project/KDJ/News_Summarization/Data/pickle/abs.pkl', 'rb') as f:
    abs = pickle.load(f)

# Build (features, labels) pairs for teacher forcing: the decoder input is the
# summary without its last column, the target is the summary without its first
# column. The 2-D slicing assumes `abs` is a 2-D array — TODO confirm padding/shape.
dataset = tf.data.Dataset.from_tensor_slices((
    {
        'inputs':sentences,
        'dec_inputs':abs[:, :-1]
    },
    {
        'outputs':abs[:, 1:]
    }
))

# Print the second sample of each array as a quick visual check.
print(f'sentence : {sentences[1]}')
print(f'\nabs : {abs[1]}')

def convert_to_int16(inputs, outputs):
    """Cast every tensor in the feature and label dicts to ``tf.int16``.

    Args:
        inputs: dict mapping feature names to tensors.
        outputs: dict mapping label names to tensors.

    Returns:
        A ``(inputs, outputs)`` tuple of new dicts with the same keys and
        int16 values.
    """
    def _as_int16(tensors):
        # Rebuild the dict so the caller's mapping is not mutated.
        return {name: tf.cast(tensor, tf.int16) for name, tensor in tensors.items()}

    return _as_int16(inputs), _as_int16(outputs)

# Apply the int16 cast to every element of the dataset.
dataset = dataset.map(convert_to_int16)

# Print the dtypes of the first element's feature and label tensors.
# The dataset has not been batched yet here, so `batch` is a single example.
for batch in dataset.take(1):
    print({key: value.dtype for key, value in batch[0].items()})
    print({key: value.dtype for key, value in batch[1].items()})

sentence : [4079 1001  154  387 2215 1381   59 2765 3855   16 2507 3855  567 3855
  101 2139  501 1904   19   60 1536 1001  154  387 2215  556    8   77
 1348   91  653  537   59 2765 3855 1170   89  452   24  256   90    7
   16  550    1   51  211  942 3282 3855   40   43  820  826    2 1844
  279 1379   54 4060 3956 3980    3  567  399  536    6 2106 3855   75
  960  536  168 2437   11  673  978 3898 1758  684  387  426 3876  706
   19  228   82  106    3 3893 3888 2192 3855 1541 3855   28 3888 3890
 3899 3855 1246  376 3587    2 2588 1459   16 2636 3855   12 2731  190
   51  511  858  304    2  659    1   40   43  820  826    9   91  653
  100    4 1380    7 1413 2404  288    2 1001  154  387 2215  556    4
   59   15 1112 3413  111  256   90    7   16  550    1  474 1001    9
 2731   26 1480  101  106   22    1   78  351 1043 3493   64 1243  158
  875 3193 1704  894 2792   18  501  283   10  126  219 1101   26  101
  106   22    1  909 3789  288  220  222   90 1428   17   19  172 

2. 하이퍼파라미터 설정

In [45]:
# Load the subword tokenizer saved by the preprocessing step.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file('D:/TJ_FInal_Project/KDJ/News_Summarization/Data/문서요약 텍스트/Preprocess/tokenizer')

# Sequence lengths for articles and summaries.
# NOTE(review): presumably these match the padded lengths in the pickles — confirm.
SEN_MAX_LENGTH = 799
ABS_MAX_LENGTH = 149

# Two extra ids beyond the tokenizer vocabulary: vocab_size is used as the
# start token and vocab_size + 1 as the end token elsewhere in this file.
VOCAB_SIZE = tokenizer.vocab_size + 2
BATCH_SIZE = 128
BUFFER_SIZE = 20000

# Model size settings passed to transformer().
D_MODEL = 128
NUM_LAYERS = 2  # try changing this to 1
NUM_HEADS = 2
DFF = 256
DROPOUT = 0.3

# Alternative, larger configuration (currently disabled):
# D_MODEL = 256
# NUM_LAYERS = 2
# NUM_HEADS = 8
# DFF = 512

EPOCHS = 2000

# Input pipeline: cache the elements, shuffle with a BUFFER_SIZE buffer,
# group into batches of BATCH_SIZE, and prefetch with an autotuned buffer.
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Arguments for the learning-rate schedule: warmup length and the number of
# optimizer steps to add to the step counter.
# NOTE(review): previous_steps looks intended for resuming training — confirm.
warmup_steps = 1000
previous_steps = 0


3. 트랜스포머 모델 빌드

3-1. 포지셔널 인코딩

In [46]:
class PositionalEncoding(tf.keras.layers.Layer):
  """Layer that adds a fixed sinusoidal position table to its input.

  The table is computed once in the constructor with shape
  ``(1, position, d_model)`` and sliced to the input's sequence length on
  every call.
  """

  def __init__(self, position, d_model):
    super().__init__()
    self.pos_encoding = self.positional_encoding(position, d_model)

  def get_angles(self, position, i, d_model):
    """Return ``position / 10000 ** (2 * (i // 2) / d_model)``."""
    exponent = (2 * (i // 2)) / tf.cast(d_model, tf.float32)
    return position * (1 / tf.pow(10000, exponent))

  def positional_encoding(self, position, d_model):
    """Build the float32 table: sine on even columns, cosine on odd columns."""
    row_positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
    column_indices = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
    angle_rads = self.get_angles(
        position=row_positions, i=column_indices, d_model=d_model)

    # Fill a NumPy buffer because tensors do not support slice assignment.
    table = np.zeros(angle_rads.shape)
    table[:, 0::2] = tf.math.sin(angle_rads[:, 0::2])
    table[:, 1::2] = tf.math.cos(angle_rads[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over the batch.
    pos_encoding = tf.constant(table)[tf.newaxis, ...]

    print(pos_encoding.shape)
    return tf.cast(pos_encoding, tf.float32)

  def call(self, inputs):
    """Add the first ``seq_len`` rows of the table to ``inputs``."""
    seq_len = tf.shape(inputs)[1]
    return inputs + self.pos_encoding[:, :seq_len, :]

3-2. 패딩 마스크

In [47]:
def create_padding_mask(x):
  """Return a float mask that is 1.0 where ``x`` equals 0 and 0.0 elsewhere.

  Two singleton axes are inserted after the first axis, so a ``(batch, seq)``
  input yields a ``(batch, 1, 1, seq)`` mask.
  """
  is_padding = tf.math.equal(x, 0)
  return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

3-3. 룩-어헤드 마스크

In [48]:
def create_look_ahead_mask(x):
  """Return a mask that hides future positions and padding positions.

  The strictly upper-triangular part of a ``(seq, seq)`` matrix marks future
  positions with 1. It is combined with the padding mask of ``x`` by an
  element-wise maximum, so an entry is 1 if either mask sets it.
  """
  length = tf.shape(x)[1]
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
  visible = tf.linalg.band_part(tf.ones((length, length)), -1, 0)
  future_mask = 1 - visible
  return tf.maximum(future_mask, create_padding_mask(x))

3-4. 셀프 어텐션(스케일드 닷 프로덕트 어텐션)

In [49]:
def scaled_dot_product_attention(query, key, value, mask):
  """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

  Args:
    query, key, value: tensors whose last two axes are (length, depth).
    mask: tensor broadcastable to the attention logits, with 1 at positions
      to suppress, or None to apply no mask.

  Returns:
    A tuple ``(output, attention_weights)``.
  """
  key_depth = tf.cast(tf.shape(key)[-1], tf.float32)
  scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_depth)

  # A large negative value drives masked positions to ~0 after the softmax.
  if mask is not None:
    scores = scores + (mask * -1e9)

  weights = tf.nn.softmax(scores, axis=-1)
  return tf.matmul(weights, value), weights

3-5. 멀티헤드 어텐션

In [50]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention over a dict with 'query', 'key', 'value' and 'mask'.

  ``d_model`` must be divisible by ``num_heads``; each head works on
  ``d_model // num_heads`` features.
  """

  def __init__(self, d_model, num_heads, name="multi_head_attention"):
    super().__init__(name=name)
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Projections for query, key and value, followed by the output projection.
    self.query_dense = tf.keras.layers.Dense(units=d_model)
    self.key_dense = tf.keras.layers.Dense(units=d_model)
    self.value_dense = tf.keras.layers.Dense(units=d_model)

    self.dense = tf.keras.layers.Dense(units=d_model)

  def split_heads(self, inputs, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    per_head = tf.reshape(
        inputs, shape=(batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, inputs):
    """Project, split into heads, attend, merge the heads and project again."""
    query = inputs['query']
    key = inputs['key']
    value = inputs['value']
    mask = inputs['mask']
    batch_size = tf.shape(query)[0]

    query = self.split_heads(self.query_dense(query), batch_size)
    key = self.split_heads(self.key_dense(key), batch_size)
    value = self.split_heads(self.value_dense(value), batch_size)

    attended, _ = scaled_dot_product_attention(query, key, value, mask)

    # Back to (batch, seq, num_heads, depth), then merge the last two axes.
    attended = tf.transpose(attended, perm=[0, 2, 1, 3])
    merged = tf.reshape(attended, (batch_size, -1, self.d_model))

    return self.dense(merged)

3-6. 인코더 층

In [51]:
def encoder_layer(dff, d_model, num_heads, dropout, name="encoder_layer"):
  """Build one encoder block as a Keras functional model.

  The block is self-attention followed by a two-layer feed-forward network.
  Each sub-block is followed by dropout, a residual add and layer
  normalization.

  Args:
    dff: width of the hidden feed-forward layer.
    d_model: feature size of the block's input and output.
    num_heads: number of attention heads.
    dropout: dropout rate used after both sub-blocks.
    name: name of the returned model.

  Returns:
    A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
  """
  layers = tf.keras.layers

  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  # Self-attention sub-block.
  self_attention = MultiHeadAttention(d_model, num_heads, name="attention")({
      'query': inputs,
      'key': inputs,
      'value': inputs,
      'mask': padding_mask,
  })
  self_attention = layers.Dropout(rate=dropout)(self_attention)
  attention_out = layers.LayerNormalization(epsilon=1e-6)(
      inputs + self_attention)

  # Position-wise feed-forward sub-block.
  hidden = layers.Dense(units=dff, activation='relu')(attention_out)
  projected = layers.Dense(units=d_model)(hidden)
  projected = layers.Dropout(rate=dropout)(projected)
  block_out = layers.LayerNormalization(epsilon=1e-6)(
      attention_out + projected)

  return tf.keras.Model(
      inputs=[inputs, padding_mask], outputs=block_out, name=name)

3-7. 인코더

In [52]:
def encoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name="encoder"):
  """Build the encoder stack as a Keras functional model.

  Token ids are embedded, scaled by sqrt(d_model), given positional
  encodings and dropout, then passed through ``num_layers`` encoder layers.

  Returns:
    A ``tf.keras.Model`` taking ``[inputs, padding_mask]``.
  """
  token_ids = tf.keras.Input(shape=(None,), name="inputs")
  padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

  scale = tf.math.sqrt(tf.cast(d_model, tf.float32))
  embedded = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
  embedded = embedded * scale
  # NOTE(review): the positional table is sized by vocab_size, not by a
  # maximum sequence length; inputs longer than vocab_size would not fit.
  embedded = PositionalEncoding(vocab_size, d_model)(embedded)
  hidden = tf.keras.layers.Dropout(rate=dropout)(embedded)

  for layer_index in range(num_layers):
    block = encoder_layer(
        dff=dff,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
        name="encoder_layer_{}".format(layer_index),
    )
    hidden = block([hidden, padding_mask])

  return tf.keras.Model(
      inputs=[token_ids, padding_mask], outputs=hidden, name=name)

3-8. 디코더 층

In [53]:
def decoder_layer(dff, d_model, num_heads, dropout, name="decoder_layer"):
  """Build one decoder block as a Keras functional model.

  The block has three sub-blocks: masked self-attention, attention over the
  encoder outputs, and a two-layer feed-forward network. Each ends with a
  residual add and layer normalization. Dropout is applied after the second
  attention and after the feed-forward network, not after the first
  attention.

  Args:
    dff: width of the hidden feed-forward layer.
    d_model: feature size of the block's input and output.
    num_heads: number of attention heads.
    dropout: dropout rate.
    name: name of the returned model.

  Returns:
    A ``tf.keras.Model`` taking
    ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
  """
  layers = tf.keras.layers

  inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
  enc_outputs = tf.keras.Input(shape=(None, d_model), name="encoder_outputs")
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name="look_ahead_mask")
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  # Sub-block 1: self-attention with the look-ahead mask.
  self_attention = MultiHeadAttention(
      d_model, num_heads, name="attention_1")(inputs={
          'query': inputs,
          'key': inputs,
          'value': inputs,
          'mask': look_ahead_mask,
      })
  self_attention = layers.LayerNormalization(epsilon=1e-6)(
      self_attention + inputs)

  # Sub-block 2: queries from the decoder, keys/values from the encoder.
  cross_attention = MultiHeadAttention(
      d_model, num_heads, name="attention_2")(inputs={
          'query': self_attention,
          'key': enc_outputs,
          'value': enc_outputs,
          'mask': padding_mask,
      })
  cross_attention = layers.Dropout(rate=dropout)(cross_attention)
  cross_attention = layers.LayerNormalization(epsilon=1e-6)(
      cross_attention + self_attention)

  # Sub-block 3: position-wise feed-forward network.
  hidden = layers.Dense(units=dff, activation='relu')(cross_attention)
  projected = layers.Dense(units=d_model)(hidden)
  projected = layers.Dropout(rate=dropout)(projected)
  block_out = layers.LayerNormalization(epsilon=1e-6)(
      projected + cross_attention)

  return tf.keras.Model(
      inputs=[inputs, enc_outputs, look_ahead_mask, padding_mask],
      outputs=block_out,
      name=name)

3-9. 디코더

In [54]:
def decoder(vocab_size, num_layers, dff,
            d_model, num_heads, dropout,
            name='decoder'):
  """Build the decoder stack as a Keras functional model.

  Token ids are embedded, scaled by sqrt(d_model), given positional
  encodings and dropout, then passed through ``num_layers`` decoder layers.
  A ``BatchNormalization`` layer follows every decoder layer.

  Returns:
    A ``tf.keras.Model`` taking
    ``[inputs, encoder_outputs, look_ahead_mask, padding_mask]``.
  """
  token_ids = tf.keras.Input(shape=(None,), name='inputs')
  enc_outputs = tf.keras.Input(shape=(None, d_model), name='encoder_outputs')
  look_ahead_mask = tf.keras.Input(
      shape=(1, None, None), name='look_ahead_mask')
  padding_mask = tf.keras.Input(shape=(1, 1, None), name='padding_mask')

  scale = tf.math.sqrt(tf.cast(d_model, tf.float32))
  embedded = tf.keras.layers.Embedding(vocab_size, d_model)(token_ids)
  embedded = embedded * scale
  # NOTE(review): the positional table is sized by vocab_size, not by a
  # maximum sequence length; inputs longer than vocab_size would not fit.
  embedded = PositionalEncoding(vocab_size, d_model)(embedded)
  hidden = tf.keras.layers.Dropout(rate=dropout)(embedded)

  for layer_index in range(num_layers):
    block = decoder_layer(
        dff=dff,
        d_model=d_model,
        num_heads=num_heads,
        dropout=dropout,
        name='decoder_layer_{}'.format(layer_index),
    )
    hidden = block(
        inputs=[hidden, enc_outputs, look_ahead_mask, padding_mask])

    # Extra normalization after each decoder layer; it holds trainable
    # weights, so saved weight files depend on it being here.
    hidden = tf.keras.layers.BatchNormalization()(hidden)

  return tf.keras.Model(
      inputs=[token_ids, enc_outputs, look_ahead_mask, padding_mask],
      outputs=hidden,
      name=name)
  
# Reset Keras' global state before the model is built further down.
# NOTE(review): presumably so auto-generated layer names are reproducible
# across notebook re-runs — confirm.
tf.keras.backend.clear_session()

3-10. 트랜스포머

In [55]:
def transformer(vocab_size, num_layers, dff,
                d_model, num_heads, dropout,
                name="transformer"):
  """Assemble the full encoder-decoder model.

  The model takes two id tensors named "inputs" and "dec_inputs", derives
  the attention masks from them inside the graph, and returns logits over
  ``vocab_size`` tokens from a final dense layer named "outputs".

  Returns:
    A ``tf.keras.Model`` taking ``[inputs, dec_inputs]``.
  """
  Lambda = tf.keras.layers.Lambda

  inputs = tf.keras.Input(shape=(None,), name="inputs")
  dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

  # Padding mask over the source tokens, used inside the encoder.
  enc_padding_mask = Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='enc_padding_mask')(inputs)
  # Future-token and padding mask over the target tokens.
  look_ahead_mask = Lambda(
      create_look_ahead_mask, output_shape=(1, None, None),
      name='look_ahead_mask')(dec_inputs)
  # Padding mask over the source tokens again, for the decoder's attention
  # over the encoder outputs.
  dec_padding_mask = Lambda(
      create_padding_mask, output_shape=(1, 1, None),
      name='dec_padding_mask')(inputs)

  # The encoder and decoder stacks share the same size settings.
  stack_kwargs = dict(
      vocab_size=vocab_size, num_layers=num_layers, dff=dff,
      d_model=d_model, num_heads=num_heads, dropout=dropout)

  enc_outputs = encoder(**stack_kwargs)(inputs=[inputs, enc_padding_mask])
  dec_outputs = decoder(**stack_kwargs)(
      inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

  logits = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

  return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=logits, name=name)

3-11. 모델 구조 선언

In [56]:
# Build the transformer from the hyperparameters defined above. Each
# PositionalEncoding constructor prints its table shape, which is why two
# shape lines appear in the cell output.
model = transformer(
  vocab_size=VOCAB_SIZE,
  num_layers=NUM_LAYERS,
  dff=DFF,
  d_model=D_MODEL,
  num_heads=NUM_HEADS,
  dropout=DROPOUT
)

(1, 4081, 128)
(1, 4081, 128)


4. 커스텀 스케줄러

In [57]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warmup-then-decay learning-rate schedule.

  With ``s = step + previous_steps`` the rate is::

      d_model ** -0.5 * min(s ** -0.5, s * warmup_steps ** -1.5)

  so it rises linearly while ``s < warmup_steps`` and decays as
  ``1 / sqrt(s)`` afterwards. At ``s == 0`` the linear term is 0, so the
  returned rate is 0.

  Args:
    d_model: model feature size; larger values give a smaller rate.
    warmup_steps: number of steps over which the rate rises linearly.
    previous_steps: offset added to the optimizer step on every call.
  """

  def __init__(self, d_model, warmup_steps=1000, previous_steps=0):
    super().__init__()
    # Keep the raw value so get_config() can return it unchanged.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps
    self.previous_steps = previous_steps

  def __call__(self, step):
    step = tf.cast(step, tf.float32)
    adjusted_step = step + self.previous_steps
    arg1 = tf.math.rsqrt(adjusted_step)
    arg2 = adjusted_step * (self.warmup_steps**-1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so Keras can serialize the schedule.

    The base class raises NotImplementedError here, which would break
    saving an optimizer that uses this schedule.
    """
    return {
        'd_model': self._d_model_config,
        'warmup_steps': self.warmup_steps,
        'previous_steps': self.previous_steps,
    }
  
# Instantiate the schedule with the model size and the warmup / step-offset
# values defined in the hyperparameter cell.
learning_rate = CustomSchedule(D_MODEL, warmup_steps, previous_steps)


5. 오차 함수 및 옵티마이저

In [58]:
def loss_function(y_true, y_pred):
  """Sparse categorical cross-entropy on logits, zeroed where the label is 0.

  Args:
    y_true: label ids, reshaped to ``(-1, ABS_MAX_LENGTH - 1)``.
    y_pred: logits whose last axis is the vocabulary.

  Returns:
    A scalar: the mean of the masked per-token losses.
  """
  labels = tf.reshape(y_true, shape=(-1, ABS_MAX_LENGTH - 1))

  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')
  per_token_loss = cross_entropy(labels, y_pred)

  # Label id 0 is treated as padding and contributes zero loss.
  keep = tf.cast(tf.not_equal(labels, 0), tf.float32)

  # NOTE(review): the mean divides by all positions, masked ones included,
  # so the value shrinks as the share of padding grows.
  return tf.reduce_mean(tf.multiply(per_token_loss, keep))

# Adam optimizer driven by the custom learning-rate schedule, with
# non-default beta_2 and epsilon values.
optimizer = tf.keras.optimizers.Adam(
  learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

6. 정확도 계산 함수 및 컴파일

In [59]:
def acuuracy(y_true, y_pred):
  """Per-token sparse categorical accuracy.

  Labels are reshaped to ``(-1, ABS_MAX_LENGTH - 1)`` first. No padding
  mask is applied, so positions with label 0 are scored like any other.
  """
  labels = tf.reshape(y_true, shape=(-1, ABS_MAX_LENGTH - 1))
  return tf.keras.metrics.sparse_categorical_accuracy(labels, y_pred)

# Attach the optimizer, the masked loss and the accuracy metric to the model.
# The metric function's name is spelled `acuuracy` where it is defined.
model.compile(optimizer=optimizer, loss=loss_function, metrics=[acuuracy])

7. 검증 함수

In [60]:
# Start and end token ids as single-element lists so they can be
# concatenated with encoded id lists. They are the two ids just past the
# tokenizer's vocabulary.
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size+1]

def evaluate(sentence):
  """Greedily decode a summary for ``sentence`` with the global ``model``.

  The text is tokenized and wrapped in the start/end tokens. The decoder
  input starts as the start token alone and grows by the highest-scoring
  next token on each step. Decoding stops when the end token is predicted
  or after ``ABS_MAX_LENGTH`` steps.

  Args:
    sentence: raw input text.

  Returns:
    A 1-D int tensor of generated ids, beginning with the start token and
    never containing the end token.
  """
  source_ids = START_TOKEN + tokenizer.encode(sentence) + END_TOKEN
  sentence = tf.expand_dims(source_ids, axis=0)

  output = tf.expand_dims(START_TOKEN, 0)

  for _ in range(ABS_MAX_LENGTH):
    logits = model(inputs=[sentence, output], training=False)

    # Only the logits at the last decoder position decide the next token.
    last_step = logits[:, -1:, :]
    next_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

    if tf.equal(next_id, END_TOKEN[0]):
      break

    output = tf.concat([output, next_id], axis=-1)

  return tf.squeeze(output, axis=0)

8. 예측 함수

In [61]:
def predict(sentence):
  """Generate, print and return the summary text for ``sentence``.

  Ids at or above ``tokenizer.vocab_size`` (the start and end tokens) are
  dropped before the ids are decoded back to text.
  """
  generated_ids = evaluate(sentence)

  text_ids = [i for i in generated_ids if i < tokenizer.vocab_size]
  predicted_sentence = tokenizer.decode(text_ids)

  print(f'원문 : {sentence}')
  print(f'\n요약 : {predicted_sentence}')

  return predicted_sentence

9. 검증 데이터 준비 및 epoch마다 검증

In [62]:
# Load the preprocessed CSV and take its article and summary text columns.
# NOTE(review): `abs` is rebound here from the pickled array to a pandas
# Series, and it still shadows the built-in abs().
df = pd.read_csv('D:/TJ_FInal_Project/KDJ/News_Summarization/Data/문서요약 텍스트/Preprocess/finalPreprocess.csv')
sentence = df['sentence']
abs = df['abs']

# Two fixed (article, reference summary) pairs used for periodic spot checks.
predict_sentence_1 = sentence[100]
predict_sentence_2 = sentence[200]
predict_abs_1 = abs[100]
predict_abs_2 = abs[200]

# Loss of the most recently finished epoch; starts at +inf so the first
# epoch always counts as an improvement.
previous_loss = tf.Variable(float('inf'), trainable=False)

class EpochValidation(Callback):
    """Callback that prints sample predictions and checkpoints the weights.

    Every 10th epoch (by epoch index) it prints predictions for the two
    fixed validation articles next to their reference summaries. After every
    epoch it compares the training loss with the previous epoch's loss and
    reports the change.

    Weights are written only when the loss is lower than the best loss seen
    so far. Comparing against the previous epoch alone would let a
    worse-than-best epoch overwrite the best checkpoint (for example after a
    loss sequence 0.2 -> 0.5 -> 0.4).

    Relies on the module-level ``model``, ``previous_loss``, ``predict`` and
    the ``predict_*`` sample variables.
    """

    def __init__(self):
        super().__init__()
        # Lowest loss seen so far. It is seeded from `previous_loss` on first
        # use so the first epoch is judged against the same starting value.
        self.best_loss = None

    def on_epoch_end(self, epoch, logs=None):
        self.epochPrint(epoch, logs)

    def epochPrint(self, epoch, logs=None):
        """Print the periodic predictions, report the loss, and checkpoint.

        Args:
            epoch: zero-based epoch index supplied by Keras.
            logs: metrics dict for the epoch; may be None or lack 'loss'.
        """
        if epoch % 10 == 0:
            print("\n***************  첫번째 예측  ***************")
            predict(predict_sentence_1)
            print(f'\n정답 : {predict_abs_1}')
            print("\n***************  두번째 예측  ***************")
            predict(predict_sentence_2)
            print(f'\n정답 : {predict_abs_2}')

        # `logs` defaults to None, so guard before calling .get on it.
        current_loss = (logs or {}).get('loss')
        if current_loss is None:
            return

        last_loss = previous_loss.numpy()
        if self.best_loss is None:
            self.best_loss = float(last_loss)

        # Checkpoint only on a new best, so a partial recovery after a loss
        # spike cannot replace better weights.
        if current_loss < self.best_loss:
            model.save_weights('D:/TJ_FInal_Project/KDJ/News_Summarization/Model/transformer.h5')
            self.best_loss = float(current_loss)

        if current_loss < last_loss:
            print(f'\n손실 값 감소! 이전 손실: {last_loss}, 현재 손실: {current_loss}')
        else:
            print(f'\n손실 값 증가 또는 동일')

        previous_loss.assign(current_loss)

10. 모델 불러오기 및 검증

In [63]:
import sys
import re

# Restore previously trained weights into the model built above.
# NOTE(review): the numbers in the file name look like epoch/accuracy/loss
# of that checkpoint — confirm.
model.load_weights('D:/TJ_FInal_Project/KDJ/News_Summarization/Model/transformer(202_0.89_0.22).h5')

def regex_column(columnList):
  """Clean one text value before it is tokenized.

  Non-string values (for example NaN) become an empty string. For strings,
  the steps run in this order:

  1. remove e-mail-like tokens,
  2. remove newline characters without inserting a space,
  3. remove bracketed spans ``[...]``, ``{...}`` and ``(...)``,
  4. replace every character outside Hangul syllables, ASCII letters and
     digits, CJK ideographs, whitespace and ``. , ! ? ' " ~`` with a space,
  5. collapse whitespace runs to one space and strip both ends.

  Args:
    columnList: the raw value; despite the name, a single string is expected.

  Returns:
    The cleaned string.
  """
  if not isinstance(columnList, str):
    return ''

  # Steps 1-2: e-mails go first, then newlines are dropped.
  text = re.sub(r'\S+@\S+\.\S+', '', columnList)
  text = ''.join(text.split('\n'))

  # Steps 3-5 as (pattern, replacement) pairs applied in order.
  substitutions = (
      (r'\[.*?\]|\{.*?\}|\(.*?\)', ''),
      (r'[^가-힣a-zA-Z0-9\u4e00-\u9fff\s.,!?\'\"~]', ' '),
      (r'\s+', ' '),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)

  return text.strip()

# Interactive loop: read a multi-line article from stdin, clean it and print
# the model's summary. There is no exit condition; stop it with an interrupt.
while True:
  print("\n뉴스 기사를 입력하세요. 입력을 마치려면 enter을 입력하세요.")

  # iter(input, "") keeps calling input() until it returns an empty line;
  # the collected lines are joined back into one string.
  news_article = "\n".join(iter(input, ""))

  regex_articel = regex_column(news_article)
  # Echo the raw article, then summarize the cleaned version.
  print(news_article)
  print("\n***************  결과  ***************")
  predict(regex_articel)


뉴스 기사를 입력하세요. 입력을 마치려면 enter을 입력하세요.
연말정산서류가 회사로갔고 회사에서 연말정산쪽?으로 보내서  전화했더니 5월에 수정신고하라고 연락이올수도있다고하는데  등본만누락됬는데 만약 5월에 수정신고할때 처음부터 다시 다~~해야하나요  아니면 등본만가져가서 수정신청을해야하는건가요?ㅠㅠ  너무너무 어려워서 연말정산 너무어려워요,    ㅠㅠㅠㅠ 등본만내도될까요?ㅠㅠ

***************  결과  ***************
원문 : 연말정산서류가 회사로갔고 회사에서 연말정산쪽?으로 보내서 전화했더니 5월에 수정신고하라고 연락이올수도있다고하는데 등본만누락됬는데 만약 5월에 수정신고할때 처음부터 다시 다~~해야하나요 아니면 등본만가져가서 수정신청을해야하는건가요? 너무너무 어려워서 연말정산 너무어려워요, 등본만내도될까요?

요약 : 연말정산쪽 으로 보내서 전화했더니 5월에 수정신고할때 처음부터 다시 다~해야하나요 아니면 등본만가져가서 수정신청을해야하는건가요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 등본만내도될까요, 

뉴스 기사를 입력하세요. 입력을 마치려면 enter을 입력하세요.
안녕하세요~  나이스 세무법인 부산지사 차영현 세무사입니다.     등본 제출하지 않으셨으면 본인만 공제 받으실 거구요.  등본을 제출하지 않았다 해서 세무서에서 연락은 오지 않고 개인적으로 수정신고를 하시면 됩니다.     등본상 부양가족중 기본공제 대상자가 있을 경우에는 5월 종합소득세 신고기간에 홈택스나 가까운 세무서에 등본 가지고 가시면 수정신고 가능합니다.     제출한 자료는 입력이 다 되어있기 때문에 처음부터 다시 하실필요는 없습니다.     만족할 만한 답변 되셨길 바랍니다.  항상 행복하세요~^^

***************  결과  ***************
원문 : 안녕하세요~ 나이스 세무법인 부산지사 차영현 세무사입니다. 등본 제출하

KeyboardInterrupt: 