# 데이터

In [1]:
# 라이브러리 불러오기
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

import re
from collections import Counter
import sentencepiece as spm
from konlpy.tag import Okt
from konlpy.tag import Mecab
import csv 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
import tensorflow_addons as tfa
from itertools import combinations
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix
)

from keras.utils.vis_utils import plot_model

In [2]:
# 한글 폰트에 문제가 생겼을 때

# 한글 폰트 설치
!apt-get update -qq
!apt-get install -qq fonts-nanum

# 설치한 폰트를 matplotlib에서 사용할 수 있도록 설정
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt

# 나눔 폰트 경로 설정
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'

# 폰트 매니저에 폰트 추가
fm.fontManager.addfont(font_path)
plt.rc('font', family='NanumGothic')  # 폰트 설정

## 데이터 업로드

In [3]:
# zip 파일 해제
%%timeit
path = 'TL_01_KAKAO.zip'
output_dir = "aihub_dataset"
os.system("unzip "+path+" -d "+output_dir)

Archive:  TL_01_KAKAO.zip
  inflating: aihub_dataset/KAKAO_898_15.json  
  inflating: aihub_dataset/KAKAO_898_16.json  
  inflating: aihub_dataset/KAKAO_898_17.json  
  inflating: aihub_dataset/KAKAO_898_18.json  
  inflating: aihub_dataset/KAKAO_898_19.json  
  inflating: aihub_dataset/KAKAO_898_20.json  
  inflating: aihub_dataset/KAKAO_899_01.json  
  inflating: aihub_dataset/KAKAO_899_02.json  
  inflating: aihub_dataset/KAKAO_899_03.json  
  inflating: aihub_dataset/KAKAO_899_04.json  
  inflating: aihub_dataset/KAKAO_899_05.json  
  inflating: aihub_dataset/KAKAO_899_06.json  
  inflating: aihub_dataset/KAKAO_899_07.json  
  inflating: aihub_dataset/KAKAO_899_08.json  
  inflating: aihub_dataset/KAKAO_899_09.json  
  inflating: aihub_dataset/KAKAO_899_10.json  
  inflating: aihub_dataset/KAKAO_899_11.json  
  inflating: aihub_dataset/KAKAO_899_12.json  
  inflating: aihub_dataset/KAKAO_899_13.json  
  inflating: aihub_dataset/KAKAO_899_14.json  
  inflating: aihub_dataset/KAKAO_8

  inflating: aihub_dataset/KAKAO_933_08.json  
  inflating: aihub_dataset/KAKAO_933_09.json  
  inflating: aihub_dataset/KAKAO_933_10.json  
  inflating: aihub_dataset/KAKAO_933_11.json  
  inflating: aihub_dataset/KAKAO_933_12.json  
  inflating: aihub_dataset/KAKAO_933_13.json  
  inflating: aihub_dataset/KAKAO_933_14.json  
  inflating: aihub_dataset/KAKAO_933_15.json  
  inflating: aihub_dataset/KAKAO_933_16.json  
  inflating: aihub_dataset/KAKAO_933_17.json  
  inflating: aihub_dataset/KAKAO_933_18.json  
  inflating: aihub_dataset/KAKAO_933_19.json  
  inflating: aihub_dataset/KAKAO_933_20.json  
  inflating: aihub_dataset/KAKAO_934_01.json  
  inflating: aihub_dataset/KAKAO_934_02.json  
  inflating: aihub_dataset/KAKAO_934_03.json  
  inflating: aihub_dataset/KAKAO_934_04.json  
  inflating: aihub_dataset/KAKAO_934_05.json  
  inflating: aihub_dataset/KAKAO_934_06.json  
  inflating: aihub_dataset/KAKAO_934_07.json  
  inflating: aihub_dataset/KAKAO_934_08.json  
  inflating: 

  inflating: aihub_dataset/KAKAO_982_05.json  
  inflating: aihub_dataset/KAKAO_982_06.json  
  inflating: aihub_dataset/KAKAO_982_07.json  
  inflating: aihub_dataset/KAKAO_982_08.json  
  inflating: aihub_dataset/KAKAO_982_09.json  
  inflating: aihub_dataset/KAKAO_982_10.json  
  inflating: aihub_dataset/KAKAO_982_11.json  
  inflating: aihub_dataset/KAKAO_982_12.json  
  inflating: aihub_dataset/KAKAO_982_13.json  
  inflating: aihub_dataset/KAKAO_982_14.json  
  inflating: aihub_dataset/KAKAO_982_15.json  
  inflating: aihub_dataset/KAKAO_982_16.json  
  inflating: aihub_dataset/KAKAO_982_17.json  
  inflating: aihub_dataset/KAKAO_982_18.json  
  inflating: aihub_dataset/KAKAO_982_19.json  
  inflating: aihub_dataset/KAKAO_982_20.json  
  inflating: aihub_dataset/KAKAO_983_01.json  
  inflating: aihub_dataset/KAKAO_983_02.json  
  inflating: aihub_dataset/KAKAO_983_03.json  
  inflating: aihub_dataset/KAKAO_983_04.json  
  inflating: aihub_dataset/KAKAO_983_05.json  
  inflating: 

  inflating: aihub_dataset/KAKAO_1030_16.json  
  inflating: aihub_dataset/KAKAO_1030_17.json  
  inflating: aihub_dataset/KAKAO_1030_18.json  
  inflating: aihub_dataset/KAKAO_1030_19.json  
  inflating: aihub_dataset/KAKAO_1030_20.json  
  inflating: aihub_dataset/KAKAO_1031_01.json  
  inflating: aihub_dataset/KAKAO_1031_02.json  
  inflating: aihub_dataset/KAKAO_1031_03.json  
  inflating: aihub_dataset/KAKAO_1031_04.json  
  inflating: aihub_dataset/KAKAO_1031_05.json  
  inflating: aihub_dataset/KAKAO_1031_06.json  
  inflating: aihub_dataset/KAKAO_1031_07.json  
  inflating: aihub_dataset/KAKAO_1031_08.json  
  inflating: aihub_dataset/KAKAO_1031_09.json  
  inflating: aihub_dataset/KAKAO_1031_10.json  
  inflating: aihub_dataset/KAKAO_1031_11.json  
  inflating: aihub_dataset/KAKAO_1031_12.json  
  inflating: aihub_dataset/KAKAO_1031_13.json  
  inflating: aihub_dataset/KAKAO_1031_14.json  
  inflating: aihub_dataset/KAKAO_1031_15.json  
  inflating: aihub_dataset/KAKAO_1031_16

  inflating: aihub_dataset/KAKAO_1079_04.json  
  inflating: aihub_dataset/KAKAO_1079_05.json  
  inflating: aihub_dataset/KAKAO_1079_06.json  
  inflating: aihub_dataset/KAKAO_1079_07.json  
  inflating: aihub_dataset/KAKAO_1079_08.json  
  inflating: aihub_dataset/KAKAO_1079_09.json  
  inflating: aihub_dataset/KAKAO_1079_10.json  
  inflating: aihub_dataset/KAKAO_1079_11.json  
  inflating: aihub_dataset/KAKAO_1079_12.json  
  inflating: aihub_dataset/KAKAO_1079_13.json  
  inflating: aihub_dataset/KAKAO_1079_14.json  
  inflating: aihub_dataset/KAKAO_1079_15.json  
  inflating: aihub_dataset/KAKAO_1079_16.json  
  inflating: aihub_dataset/KAKAO_1079_17.json  
  inflating: aihub_dataset/KAKAO_1079_18.json  
  inflating: aihub_dataset/KAKAO_1079_19.json  
  inflating: aihub_dataset/KAKAO_1079_20.json  
  inflating: aihub_dataset/KAKAO_1080_01.json  
  inflating: aihub_dataset/KAKAO_1080_02.json  
  inflating: aihub_dataset/KAKAO_1080_03.json  
  inflating: aihub_dataset/KAKAO_1080_04

  inflating: aihub_dataset/KAKAO_1127_09.json  
  inflating: aihub_dataset/KAKAO_1127_10.json  
  inflating: aihub_dataset/KAKAO_1127_11.json  
  inflating: aihub_dataset/KAKAO_1127_12.json  
  inflating: aihub_dataset/KAKAO_1127_13.json  
  inflating: aihub_dataset/KAKAO_1127_14.json  
  inflating: aihub_dataset/KAKAO_1127_15.json  
  inflating: aihub_dataset/KAKAO_1127_16.json  
  inflating: aihub_dataset/KAKAO_1127_17.json  
  inflating: aihub_dataset/KAKAO_1127_18.json  
  inflating: aihub_dataset/KAKAO_1127_19.json  
  inflating: aihub_dataset/KAKAO_1127_20.json  
  inflating: aihub_dataset/KAKAO_1128_01.json  
  inflating: aihub_dataset/KAKAO_1128_02.json  
  inflating: aihub_dataset/KAKAO_1128_03.json  
  inflating: aihub_dataset/KAKAO_1128_04.json  
  inflating: aihub_dataset/KAKAO_1128_05.json  
  inflating: aihub_dataset/KAKAO_1128_06.json  
  inflating: aihub_dataset/KAKAO_1128_07.json  
  inflating: aihub_dataset/KAKAO_1128_08.json  
  inflating: aihub_dataset/KAKAO_1128_09

  inflating: aihub_dataset/KAKAO_1177_05.json  
  inflating: aihub_dataset/KAKAO_1177_06.json  
  inflating: aihub_dataset/KAKAO_1177_07.json  
  inflating: aihub_dataset/KAKAO_1177_08.json  
  inflating: aihub_dataset/KAKAO_1177_09.json  
  inflating: aihub_dataset/KAKAO_1177_10.json  
  inflating: aihub_dataset/KAKAO_1177_11.json  
  inflating: aihub_dataset/KAKAO_1177_12.json  
  inflating: aihub_dataset/KAKAO_1177_13.json  
  inflating: aihub_dataset/KAKAO_1177_14.json  
  inflating: aihub_dataset/KAKAO_1177_15.json  
  inflating: aihub_dataset/KAKAO_1177_16.json  
  inflating: aihub_dataset/KAKAO_1177_17.json  
  inflating: aihub_dataset/KAKAO_1177_18.json  
  inflating: aihub_dataset/KAKAO_1177_19.json  
  inflating: aihub_dataset/KAKAO_1177_20.json  
  inflating: aihub_dataset/KAKAO_1178_01.json  
  inflating: aihub_dataset/KAKAO_1178_02.json  
  inflating: aihub_dataset/KAKAO_1178_03.json  
  inflating: aihub_dataset/KAKAO_1178_04.json  
  inflating: aihub_dataset/KAKAO_1178_05

  inflating: aihub_dataset/KAKAO_1218_16.json  
  inflating: aihub_dataset/KAKAO_1218_17.json  
  inflating: aihub_dataset/KAKAO_1218_18.json  
  inflating: aihub_dataset/KAKAO_1218_19.json  
  inflating: aihub_dataset/KAKAO_1218_20.json  
  inflating: aihub_dataset/KAKAO_1219_01.json  
  inflating: aihub_dataset/KAKAO_1219_02.json  
  inflating: aihub_dataset/KAKAO_1219_03.json  
  inflating: aihub_dataset/KAKAO_1219_04.json  
  inflating: aihub_dataset/KAKAO_1219_05.json  
  inflating: aihub_dataset/KAKAO_1219_06.json  
  inflating: aihub_dataset/KAKAO_1219_07.json  
  inflating: aihub_dataset/KAKAO_1219_08.json  
  inflating: aihub_dataset/KAKAO_1219_09.json  
  inflating: aihub_dataset/KAKAO_1219_10.json  
  inflating: aihub_dataset/KAKAO_1219_11.json  
  inflating: aihub_dataset/KAKAO_1219_12.json  
  inflating: aihub_dataset/KAKAO_1219_13.json  
  inflating: aihub_dataset/KAKAO_1219_14.json  
  inflating: aihub_dataset/KAKAO_1219_15.json  
  inflating: aihub_dataset/KAKAO_1219_16

  inflating: aihub_dataset/KAKAO_1260_16.json  
  inflating: aihub_dataset/KAKAO_1260_17.json  
  inflating: aihub_dataset/KAKAO_1260_18.json  
  inflating: aihub_dataset/KAKAO_1260_19.json  
  inflating: aihub_dataset/KAKAO_1260_20.json  
  inflating: aihub_dataset/KAKAO_1261_01.json  
  inflating: aihub_dataset/KAKAO_1261_02.json  
  inflating: aihub_dataset/KAKAO_1261_03.json  
  inflating: aihub_dataset/KAKAO_1261_04.json  
  inflating: aihub_dataset/KAKAO_1261_05.json  
  inflating: aihub_dataset/KAKAO_1261_06.json  
  inflating: aihub_dataset/KAKAO_1261_07.json  
  inflating: aihub_dataset/KAKAO_1261_08.json  
  inflating: aihub_dataset/KAKAO_1261_09.json  
  inflating: aihub_dataset/KAKAO_1261_10.json  
  inflating: aihub_dataset/KAKAO_1261_11.json  
  inflating: aihub_dataset/KAKAO_1261_12.json  
  inflating: aihub_dataset/KAKAO_1261_13.json  
  inflating: aihub_dataset/KAKAO_1261_14.json  
  inflating: aihub_dataset/KAKAO_1261_15.json  
  inflating: aihub_dataset/KAKAO_1261_16

  inflating: aihub_dataset/KAKAO_1305_12.json  
  inflating: aihub_dataset/KAKAO_1305_13.json  
  inflating: aihub_dataset/KAKAO_1305_14.json  
  inflating: aihub_dataset/KAKAO_1305_15.json  
  inflating: aihub_dataset/KAKAO_1305_16.json  
  inflating: aihub_dataset/KAKAO_1305_17.json  
  inflating: aihub_dataset/KAKAO_1305_18.json  
  inflating: aihub_dataset/KAKAO_1305_19.json  
  inflating: aihub_dataset/KAKAO_1305_20.json  
  inflating: aihub_dataset/KAKAO_1306_01.json  
  inflating: aihub_dataset/KAKAO_1306_02.json  
  inflating: aihub_dataset/KAKAO_1306_03.json  
  inflating: aihub_dataset/KAKAO_1306_04.json  
  inflating: aihub_dataset/KAKAO_1306_05.json  
  inflating: aihub_dataset/KAKAO_1306_06.json  
  inflating: aihub_dataset/KAKAO_1306_07.json  
  inflating: aihub_dataset/KAKAO_1306_08.json  
  inflating: aihub_dataset/KAKAO_1306_09.json  
  inflating: aihub_dataset/KAKAO_1306_10.json  
  inflating: aihub_dataset/KAKAO_1306_11.json  
  inflating: aihub_dataset/KAKAO_1306_12

  inflating: aihub_dataset/KAKAO_1351_10.json  
  inflating: aihub_dataset/KAKAO_1351_11.json  
  inflating: aihub_dataset/KAKAO_1351_12.json  
  inflating: aihub_dataset/KAKAO_1351_13.json  
  inflating: aihub_dataset/KAKAO_1351_14.json  
  inflating: aihub_dataset/KAKAO_1351_15.json  
  inflating: aihub_dataset/KAKAO_1351_16.json  
  inflating: aihub_dataset/KAKAO_1351_17.json  
  inflating: aihub_dataset/KAKAO_1351_18.json  
  inflating: aihub_dataset/KAKAO_1351_19.json  
  inflating: aihub_dataset/KAKAO_1351_20.json  
  inflating: aihub_dataset/KAKAO_1352_01.json  
  inflating: aihub_dataset/KAKAO_1352_02.json  
  inflating: aihub_dataset/KAKAO_1352_03.json  
  inflating: aihub_dataset/KAKAO_1352_04.json  
  inflating: aihub_dataset/KAKAO_1352_05.json  
  inflating: aihub_dataset/KAKAO_1352_06.json  
  inflating: aihub_dataset/KAKAO_1352_07.json  
  inflating: aihub_dataset/KAKAO_1352_08.json  
  inflating: aihub_dataset/KAKAO_1352_09.json  
  inflating: aihub_dataset/KAKAO_1352_10

  inflating: aihub_dataset/KAKAO_1395_20.json  
  inflating: aihub_dataset/KAKAO_1396_01.json  
  inflating: aihub_dataset/KAKAO_1396_02.json  
  inflating: aihub_dataset/KAKAO_1396_03.json  
  inflating: aihub_dataset/KAKAO_1396_04.json  
  inflating: aihub_dataset/KAKAO_1396_05.json  
  inflating: aihub_dataset/KAKAO_1396_06.json  
  inflating: aihub_dataset/KAKAO_1396_07.json  
  inflating: aihub_dataset/KAKAO_1396_08.json  
  inflating: aihub_dataset/KAKAO_1396_09.json  
  inflating: aihub_dataset/KAKAO_1396_10.json  
  inflating: aihub_dataset/KAKAO_1396_11.json  
  inflating: aihub_dataset/KAKAO_1396_12.json  
  inflating: aihub_dataset/KAKAO_1396_13.json  
  inflating: aihub_dataset/KAKAO_1396_14.json  
  inflating: aihub_dataset/KAKAO_1396_15.json  
  inflating: aihub_dataset/KAKAO_1396_16.json  
  inflating: aihub_dataset/KAKAO_1396_17.json  
  inflating: aihub_dataset/KAKAO_1396_18.json  
  inflating: aihub_dataset/KAKAO_1396_19.json  
  inflating: aihub_dataset/KAKAO_1396_20

  inflating: aihub_dataset/KAKAO_1440_09.json  
  inflating: aihub_dataset/KAKAO_1440_10.json  
  inflating: aihub_dataset/KAKAO_1440_11.json  
  inflating: aihub_dataset/KAKAO_1440_12.json  
  inflating: aihub_dataset/KAKAO_1440_13.json  
  inflating: aihub_dataset/KAKAO_1440_14.json  
  inflating: aihub_dataset/KAKAO_1440_15.json  
  inflating: aihub_dataset/KAKAO_1440_16.json  
  inflating: aihub_dataset/KAKAO_1440_17.json  
  inflating: aihub_dataset/KAKAO_1440_18.json  
  inflating: aihub_dataset/KAKAO_1440_19.json  
  inflating: aihub_dataset/KAKAO_1440_20.json  
  inflating: aihub_dataset/KAKAO_1441_01.json  
  inflating: aihub_dataset/KAKAO_1441_02.json  
  inflating: aihub_dataset/KAKAO_1441_03.json  
  inflating: aihub_dataset/KAKAO_1441_04.json  
  inflating: aihub_dataset/KAKAO_1441_05.json  
  inflating: aihub_dataset/KAKAO_1441_06.json  
  inflating: aihub_dataset/KAKAO_1441_07.json  
  inflating: aihub_dataset/KAKAO_1441_08.json  
  inflating: aihub_dataset/KAKAO_1441_09

  inflating: aihub_dataset/KAKAO_1487_06.json  
  inflating: aihub_dataset/KAKAO_1487_07.json  
  inflating: aihub_dataset/KAKAO_1487_08.json  
  inflating: aihub_dataset/KAKAO_1487_09.json  
  inflating: aihub_dataset/KAKAO_1487_10.json  
  inflating: aihub_dataset/KAKAO_1487_11.json  
  inflating: aihub_dataset/KAKAO_1487_12.json  
  inflating: aihub_dataset/KAKAO_1487_13.json  
  inflating: aihub_dataset/KAKAO_1487_14.json  
  inflating: aihub_dataset/KAKAO_1487_15.json  
  inflating: aihub_dataset/KAKAO_1487_16.json  
  inflating: aihub_dataset/KAKAO_1487_17.json  
  inflating: aihub_dataset/KAKAO_1487_18.json  
  inflating: aihub_dataset/KAKAO_1487_19.json  
  inflating: aihub_dataset/KAKAO_1487_20.json  
  inflating: aihub_dataset/KAKAO_1488_01.json  
  inflating: aihub_dataset/KAKAO_1488_02.json  
  inflating: aihub_dataset/KAKAO_1488_03.json  
  inflating: aihub_dataset/KAKAO_1488_04.json  
  inflating: aihub_dataset/KAKAO_1488_05.json  
  inflating: aihub_dataset/KAKAO_1488_06

  inflating: aihub_dataset/KAKAO_1534_11.json  
  inflating: aihub_dataset/KAKAO_1534_12.json  
  inflating: aihub_dataset/KAKAO_1534_13.json  
  inflating: aihub_dataset/KAKAO_1534_14.json  
  inflating: aihub_dataset/KAKAO_1534_15.json  
  inflating: aihub_dataset/KAKAO_1534_16.json  
  inflating: aihub_dataset/KAKAO_1534_17.json  
  inflating: aihub_dataset/KAKAO_1534_18.json  
  inflating: aihub_dataset/KAKAO_1534_19.json  
  inflating: aihub_dataset/KAKAO_1534_20.json  
  inflating: aihub_dataset/KAKAO_1535_01.json  
  inflating: aihub_dataset/KAKAO_1535_02.json  
  inflating: aihub_dataset/KAKAO_1535_03.json  
  inflating: aihub_dataset/KAKAO_1535_04.json  
  inflating: aihub_dataset/KAKAO_1535_05.json  
  inflating: aihub_dataset/KAKAO_1535_06.json  
  inflating: aihub_dataset/KAKAO_1535_07.json  
  inflating: aihub_dataset/KAKAO_1535_08.json  
  inflating: aihub_dataset/KAKAO_1535_09.json  
  inflating: aihub_dataset/KAKAO_1535_10.json  
  inflating: aihub_dataset/KAKAO_1535_11

  inflating: aihub_dataset/KAKAO_1585_12.json  
  inflating: aihub_dataset/KAKAO_1585_13.json  
  inflating: aihub_dataset/KAKAO_1585_14.json  
  inflating: aihub_dataset/KAKAO_1585_15.json  
  inflating: aihub_dataset/KAKAO_1585_16.json  
  inflating: aihub_dataset/KAKAO_1585_17.json  
  inflating: aihub_dataset/KAKAO_1585_18.json  
  inflating: aihub_dataset/KAKAO_1585_19.json  
  inflating: aihub_dataset/KAKAO_1585_20.json  
  inflating: aihub_dataset/KAKAO_1586_01.json  
  inflating: aihub_dataset/KAKAO_1586_02.json  
  inflating: aihub_dataset/KAKAO_1586_03.json  
  inflating: aihub_dataset/KAKAO_1586_04.json  
  inflating: aihub_dataset/KAKAO_1586_05.json  
  inflating: aihub_dataset/KAKAO_1586_06.json  
  inflating: aihub_dataset/KAKAO_1586_07.json  
  inflating: aihub_dataset/KAKAO_1586_08.json  
  inflating: aihub_dataset/KAKAO_1586_09.json  
  inflating: aihub_dataset/KAKAO_1586_10.json  
  inflating: aihub_dataset/KAKAO_1586_11.json  
  inflating: aihub_dataset/KAKAO_1586_12

  inflating: aihub_dataset/KAKAO_1631_19.json  
  inflating: aihub_dataset/KAKAO_1631_20.json  
  inflating: aihub_dataset/KAKAO_1632_01.json  
  inflating: aihub_dataset/KAKAO_1632_02.json  
  inflating: aihub_dataset/KAKAO_1632_03.json  
  inflating: aihub_dataset/KAKAO_1632_04.json  
  inflating: aihub_dataset/KAKAO_1632_05.json  
  inflating: aihub_dataset/KAKAO_1632_06.json  
  inflating: aihub_dataset/KAKAO_1632_07.json  
  inflating: aihub_dataset/KAKAO_1632_08.json  
  inflating: aihub_dataset/KAKAO_1632_09.json  
  inflating: aihub_dataset/KAKAO_1632_10.json  
  inflating: aihub_dataset/KAKAO_1632_11.json  
  inflating: aihub_dataset/KAKAO_1632_12.json  
  inflating: aihub_dataset/KAKAO_1632_13.json  
  inflating: aihub_dataset/KAKAO_1632_14.json  
  inflating: aihub_dataset/KAKAO_1632_15.json  
  inflating: aihub_dataset/KAKAO_1632_16.json  
  inflating: aihub_dataset/KAKAO_1632_17.json  
  inflating: aihub_dataset/KAKAO_1632_18.json  
  inflating: aihub_dataset/KAKAO_1632_19

  inflating: aihub_dataset/KAKAO_1675_17.json  
  inflating: aihub_dataset/KAKAO_1675_18.json  
  inflating: aihub_dataset/KAKAO_1675_19.json  
  inflating: aihub_dataset/KAKAO_1675_20.json  
  inflating: aihub_dataset/KAKAO_1676_01.json  
  inflating: aihub_dataset/KAKAO_1676_02.json  
  inflating: aihub_dataset/KAKAO_1676_03.json  
  inflating: aihub_dataset/KAKAO_1676_04.json  
  inflating: aihub_dataset/KAKAO_1676_05.json  
  inflating: aihub_dataset/KAKAO_1676_06.json  
  inflating: aihub_dataset/KAKAO_1676_07.json  
  inflating: aihub_dataset/KAKAO_1676_08.json  
  inflating: aihub_dataset/KAKAO_1676_09.json  
  inflating: aihub_dataset/KAKAO_1676_10.json  
  inflating: aihub_dataset/KAKAO_1676_11.json  
  inflating: aihub_dataset/KAKAO_1676_12.json  
  inflating: aihub_dataset/KAKAO_1676_13.json  
  inflating: aihub_dataset/KAKAO_1676_14.json  
  inflating: aihub_dataset/KAKAO_1676_15.json  
  inflating: aihub_dataset/KAKAO_1676_16.json  
  inflating: aihub_dataset/KAKAO_1676_17

  inflating: aihub_dataset/KAKAO_1724_06.json  
  inflating: aihub_dataset/KAKAO_1724_07.json  
  inflating: aihub_dataset/KAKAO_1724_08.json  
  inflating: aihub_dataset/KAKAO_1724_09.json  
  inflating: aihub_dataset/KAKAO_1724_10.json  
  inflating: aihub_dataset/KAKAO_1724_11.json  
  inflating: aihub_dataset/KAKAO_1724_12.json  
  inflating: aihub_dataset/KAKAO_1724_13.json  
  inflating: aihub_dataset/KAKAO_1724_14.json  
  inflating: aihub_dataset/KAKAO_1724_15.json  
  inflating: aihub_dataset/KAKAO_1724_16.json  
  inflating: aihub_dataset/KAKAO_1724_17.json  
  inflating: aihub_dataset/KAKAO_1724_18.json  
  inflating: aihub_dataset/KAKAO_1724_19.json  
  inflating: aihub_dataset/KAKAO_1724_20.json  
  inflating: aihub_dataset/KAKAO_1725_01.json  
  inflating: aihub_dataset/KAKAO_1725_02.json  
  inflating: aihub_dataset/KAKAO_1725_03.json  
  inflating: aihub_dataset/KAKAO_1725_04.json  
  inflating: aihub_dataset/KAKAO_1725_05.json  
  inflating: aihub_dataset/KAKAO_1725_06

  inflating: aihub_dataset/KAKAO_1772_14.json  
  inflating: aihub_dataset/KAKAO_1772_15.json  
  inflating: aihub_dataset/KAKAO_1772_16.json  
  inflating: aihub_dataset/KAKAO_1772_17.json  
  inflating: aihub_dataset/KAKAO_1772_18.json  
  inflating: aihub_dataset/KAKAO_1772_19.json  
  inflating: aihub_dataset/KAKAO_1772_20.json  
  inflating: aihub_dataset/KAKAO_1773_01.json  
  inflating: aihub_dataset/KAKAO_1773_02.json  
  inflating: aihub_dataset/KAKAO_1773_03.json  
  inflating: aihub_dataset/KAKAO_1773_04.json  
  inflating: aihub_dataset/KAKAO_1773_05.json  
  inflating: aihub_dataset/KAKAO_1773_06.json  
  inflating: aihub_dataset/KAKAO_1773_07.json  
  inflating: aihub_dataset/KAKAO_1773_08.json  
  inflating: aihub_dataset/KAKAO_1773_09.json  
  inflating: aihub_dataset/KAKAO_1773_10.json  
  inflating: aihub_dataset/KAKAO_1773_11.json  
  inflating: aihub_dataset/KAKAO_1773_12.json  
  inflating: aihub_dataset/KAKAO_1773_13.json  
  inflating: aihub_dataset/KAKAO_1773_14

replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


Archive:  TL_01_KAKAO.zip
241 ms ± 4.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


replace aihub_dataset/KAKAO_898_15.json? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [44]:
# text 데이터만 추출해서 리스트에 저장
import json
file_list = os.listdir("aihub_dataset")
# 1000개의 text 추출
file_list = file_list[:1000]
text_list = []

for file in file_list:
    file_path = os.path.join("aihub_dataset", file)
    with open(file_path, 'r', encoding='UTF-8') as f:
        json_data = json.load(f)
        info = json_data['info']
        anno = info[0]['annotations']
        text = anno['text']
        text_list.append(text)

print(text_list)

['1 : 요즘 전기차가 대세라 하더라구요 키키\n2 : 오 그러니까 키키\n2 : 길에도 많이 돌아다니더라고\n1 : 언니는 전기차에 관심 있어요? 키키\n2 : 나 있긴 한데 키키 아직 타본 적도 없어 넌?\n1 : 내년에 차 살려고 했는데 전기차 기다려야 하나 싶기도 하고 키키\n2 : 오오 키키 요즘 아이오닉 5였나 그거 괜찮더라\n1 : 키키 저도 전기차 타 본 적은 없어요 키키\n2 : 전기차는 소모품 드는 게 없다고 하더라고\n1 : 아 그래요? 키키\n1 : 그건 얼마 한대요?\n2 : 키키 몰라 보조금 받아도 한 4~5천만 원 내야 하지 않을까\n1 : 근데 차가 엄청 가볍다 하더라구요 키키\n2 : 키키 응 안에 실내도 엄청 넓데 부품이 배터리밖에 없어서\n1 : 아 좀 비싸긴 하네요 키키\n2 : 키키 비싸지...\n2 : 겨울에 시동 안 걸릴지도 몰라 배터리 얼어서\n1 : 키키 일단 고민 좀 해봐야겠어요 전기차 키키\n2 : 키키 앞으로는 전기차가 대세가 될 것 같긴 해', '1 : 언니 옛날에 우리 놀던 무궁화꽃이피었습니다 알지? 키키\n2 : 응 알지 요즘 오징어 게임 때문에 다시 새록새록\n1 : 그거 요즘 다시 유행이래\n2 : 너도 오징어 게임 봤어?\n2 : 나는 어제 정주행 했어\n1 : 응 키키 거기 나온 게임들 진짜 오랜만이지!\n2 : 맞아 무섭긴 한데 오랜만이긴 해\n1 : 그러게 솔직히 동심 파괴 하는 줄!\n2 : 그래도 재미있어서 정주행 했지 모야 하하\n1 : 나는 어릴 때 땅따먹기 자주 했는데 키키 언니는?\n2 : 음 나는 소꿉놀이? 같은 거\n2 : 흙 만지면서 논 거 같아\n1 : 모래성 쌓기 이런 거 맞나? 키키\n2 : 맞아 풀 으깨고... 막 돌맹이로 찌고 했던 것 같아\n1 : 헐 봉숭아 꽃 따다가 그랬는데 나도 키키\n1 : 역시 어릴 때 노는 건 다 비슷해\n2 : 맞아 옛날에 그렇게 놀았던 게 추억이긴 하지\n1 : 그치 그때는 컴퓨터 게임도 없었고 그렇게 노는 게 진리였어\n2 : 요즘에는

In [49]:
train_data = pd.DataFrame()
type_list = np.full(1000, 4).tolist()
train_data['type']=type_list
train_data['conversation'] = text_list
train_data

Unnamed: 0,type,conversation
0,4,1 : 요즘 전기차가 대세라 하더라구요 키키\n2 : 오 그러니까 키키\n2 : 길...
1,4,1 : 언니 옛날에 우리 놀던 무궁화꽃이피었습니다 알지? 키키\n2 : 응 알지 요...
2,4,1 : 내 아는 동생이 뭐 사막쥐인가? 쥐 키우던데 사진을 보내줘\n2 : 쥐를? ...
3,4,1 : **가 너보다 더 심하잖아 키키\n2 : 키키 그니까 **를 누가 이겨 키키...
4,4,1 : 맞아 다음주 화요일 접종일이야\n2 : 상거래는 전자로 하는게 좋아\n3 :...
...,...,...
995,4,"1 : 지금 머리 상태는 어때?\n2 : 나 거지존 입성, 죽겠어 이리 뻗치고 저리..."
996,4,1 : 키키 산책하다 보면 도롱뇽 많이 봄...\n2 : 아직도 도롱뇽 있음? 키키...
997,4,1 : 아 참 그리고 군대 말년에 할 거 없어서 공부하는 거 국룰이잖아 키키\n2 ...
998,4,1 : 오늘 저녁도 안 먹어?\n2 : 오늘 점심 때 너무 먹을 게 당겨서 떡볶이랑...


In [50]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          1000 non-null   int64 
 1   conversation  1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


## 전처리

### 기본 전처리

#### 중복값 여부 찾아보기

In [51]:
# 중복값 찾기
duplicates = train_data[train_data.duplicated()]
duplicates

Unnamed: 0,type,conversation


중복값 없음

#### 결측치 여부 확인하기

In [52]:
# 결측치 여부 확인하기
train_data.isnull().sum()

type            0
conversation    0
dtype: int64

결측치 없음

### 텍스트 전처리

#### 한글 외 문자 삭제
- 한글, '?', '!', '.', '.', 공백 유지
- 대화 순서 나타내는 '1: ', '2: ',... 지우기

In [13]:
# 전처리 함수
def preprocess_sentence(sentence): 
    # \n을 공백으로 바꾸기
    sentence = re.sub("\n", " ", sentence)
    
    # (ㄱ-ㅎ, ㅏ-ㅣ, ".", "?", "!", ",", ' ')를 제외한 모든 문자를 없애기
    sentence = re.sub("[^ㄱ-ㅣ가-힣.?!, ]", "", sentence)
    
    # 단어와 구두점(punctuation) 사이에 공백 추가
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    
    return sentence

In [14]:
# 전처리 데이터 새로운 column에 저장
train_data['preprocessed'] = train_data['conversation'].apply(preprocess_sentence)
train_data['preprocessed']

0       학교 점심 뭐 나와 ?  주로 한식이 나와 .  학원 다녀 ?  응 ,  영어 학원...
1       어이 거기 뒤뚱거리는 놈  나 ?  그래 너 여기 뒤뚱거리는 놈이 너밖에 더 있냐 ...
2       너 그따위로 운전하면 확 갈아마셔버린다 .   뭐라구 ?   나와 이 자식아 .  ...
3       길동씨 이번에 이것좀 처리해요 이거 제가 한게 아닌데요 팀에서 내가 니가가 어딨어 ...
4       비가 많이 오네 .  우산 가져왔어 ?  날씨가 추워졌어 .  맞아 ,  이제 겨울...
                              ...                        
4945    오 깡패다 니 지금 뭐라했노 말하는것도 깡패네 닌 죽었다 시키야 어디서 건방지게 아...
4946    이거 니 주민등록증 아니야 ?  잃어버린줄 알았는데 . 고마워 !  고맙긴 뭘 근데...
4947    여행 준비 다 했어 ?  아직 ,  짐 싸는 중이야 .  여행 가방은 다 쌌어 ? ...
4948    그거 사줘 안사주면 죽어버릴거야  이러지마 돈없어 나한테 해준게 뭐있어 !  !  ...
4949    얘들아 .  이 년 몰골좀 봐 .   야 .  너 좀 씻고다녀 .  우웩 너희가 나...
Name: preprocessed, Length: 4950, dtype: object

In [15]:
# 기존 'conversation' column 전처리한 데이터로 바꾸기
train_data['conversation'] = train_data['preprocessed']
train_data.drop('preprocessed', axis=1)

Unnamed: 0,conversation,type
0,"학교 점심 뭐 나와 ? 주로 한식이 나와 . 학원 다녀 ? 응 , 영어 학원...",4
1,어이 거기 뒤뚱거리는 놈 나 ? 그래 너 여기 뒤뚱거리는 놈이 너밖에 더 있냐 ...,3
2,너 그따위로 운전하면 확 갈아마셔버린다 . 뭐라구 ? 나와 이 자식아 . ...,0
3,길동씨 이번에 이것좀 처리해요 이거 제가 한게 아닌데요 팀에서 내가 니가가 어딨어 ...,2
4,"비가 많이 오네 . 우산 가져왔어 ? 날씨가 추워졌어 . 맞아 , 이제 겨울...",4
...,...,...
4945,오 깡패다 니 지금 뭐라했노 말하는것도 깡패네 닌 죽었다 시키야 어디서 건방지게 아...,0
4946,이거 니 주민등록증 아니야 ? 잃어버린줄 알았는데 . 고마워 ! 고맙긴 뭘 근데...,1
4947,"여행 준비 다 했어 ? 아직 , 짐 싸는 중이야 . 여행 가방은 다 쌌어 ? ...",4
4948,그거 사줘 안사주면 죽어버릴거야 이러지마 돈없어 나한테 해준게 뭐있어 ! ! ...,0


#### 불용어 삭제
[불용어 리스트 출처](https://www.ranks.nl/stopwords/korean)

In [16]:
# 불용어 리스트 불러오기
stopwords_path = "/aiffel/aiffel/dlthon-minions/share/preprocess/ko_stopwords.txt"
with open(stopwords_path, 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

#### 토큰화

```!pip install konlpy```

In [17]:
# 단어사전 크기 20000으로 제한
VOCAB_SIZE=20000

Okt 토크나이저 활용

In [18]:
# Okt로 토큰화
tokenizer=Okt()

def tokenize(conversation, tokenizer):
    return [token for token in tokenizer.morphs(conversation) if token not in stopwords]

# 각 conversation을 토큰화하여 새로운 열 'tokenized'에 저장
train_data['tokenized'] = train_data['conversation'].apply(lambda x: tokenize(x, tokenizer))
tokenized_df = train_data[['type', 'tokenized']]

## Augmentation
rd, rs 방법  
[코드 참조](https://github.com/catSirup/KorEDA/blob/master/eda.py)

In [19]:
import random

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################
def random_deletion(words, p):
	if len(words) == 1:
		return words

	new_words = []
	for word in words:
		r = random.uniform(0, 1)
		if r > p:
			new_words.append(word)

	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################
def random_swap(words, n):
	new_words = words.copy()
	for _ in range(n):
		new_words = swap_word(new_words)

	return new_words

def swap_word(new_words):
	random_idx_1 = random.randint(0, len(new_words)-1)
	random_idx_2 = random_idx_1
	counter = 0

	while random_idx_2 == random_idx_1:
		random_idx_2 = random.randint(0, len(new_words)-1)
		counter += 1
		if counter > 3:
			return new_words

	new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
	return new_words

In [20]:
words = ['가', '나', '다', '라', '마', '바', '사']
a= random_deletion(words, 0.2)
b = random_swap(words, 3)
print(a)
print(b)

['가', '나', '다', '라', '바', '사']
['라', '나', '다', '가', '마', '바', '사']
['가', '나', '다', '바', '마', '라', '사']


In [21]:
def augmentation(df):
    new_rows = []
    for _, row in df.iterrows():
        type_int = row['type']
        # '일반 대화' 이외의 클래스에만 적용
        # 2배씩 늘어남
        if type_int != 4:
            words = row['tokenized']
            # 랜덤으로 augmentation 방식 적용
            use_rd = random.choice([True, False])
            # random deletion
            # p의 확률로 제거
            if use_rd:
                p = random.uniform(0, 1)
                new_words = random_deletion(words, p)
            # random swap
            # n회만큼 swap
            else:
                n = random.randint(1, len(words)//2)
                new_words = random_swap(words, n)
            new_rows.append([type_int, new_words])
    
    # 기존 데이터에 새로운 데이터 추가
    augmented_df = df.append(pd.DataFrame(new_rows, columns=df.columns), ignore_index=True)
    return augmented_df

In [22]:
augmented_data = augmentation(tokenized_df)

In [23]:
augmented_data

Unnamed: 0,type,tokenized
0,4,"[학교, 점심, 뭐, 나와, ?, 주로, 한식, 나와, ., 학원, 다녀, ?, ,..."
1,3,"[뒤뚱거리, 는, 놈, ?, 뒤뚱거리, 는, 놈, 밖에, 더, 있냐, ?, 놀리지마..."
2,0,"[그따위, 운전, 하면, 확, 갈아, 마셔, 버린다, ., 뭐라구, ?, 나와, 자..."
3,2,"[길동, 씨, 것좀, 처리, 거, 한, 게, 아닌데요, 팀, 내, 니, 가가, 어딨..."
4,4,"[비, 많이, 오네, ., 우산, 가져왔어, ?, 날씨, 추워졌어, ., 맞아, ,..."
...,...,...
8895,1,"[자네, 만나는가, ?, 봤다네, 소문, 할망구, 이라도, 하려는가, 먹고, ?]"
8896,0,"[다, 다, 눈치, 지금, 뭐라, 했노, 죽, 없는, 닌, 죽었다, 시키야, 어디서..."
8897,1,"[!, 거, 사례금, ?, 사례금, 내, 꺼, 그건, 만원, 는, 거, ., 찍어놨..."
8898,0,"[그거, 사줘, 벌어와, 죽는, 버릴거야, 이러지마, 정신차려, 없어, 한테, 해준..."


In [24]:
# 중복값 확인
# Convert the 'tokenized' column to strings for duplication checking
augmented_data['tokenized_str'] = augmented_data['tokenized'].apply(lambda x: ' '.join(x))

# Check for duplicates based on the string representation of the 'tokenized' column
duplicates = augmented_data[augmented_data.duplicated(subset=['tokenized_str'])]
print('중복된 행 개수: ', len(duplicates))

unique_augmented_data = augmented_data.drop_duplicates(subset=['tokenized_str'])
unique_augmented_data = unique_augmented_data.drop(columns=['tokenized_str'])
unique_augmented_data


중복된 행 개수:  138


Unnamed: 0,type,tokenized
0,4,"[학교, 점심, 뭐, 나와, ?, 주로, 한식, 나와, ., 학원, 다녀, ?, ,..."
1,3,"[뒤뚱거리, 는, 놈, ?, 뒤뚱거리, 는, 놈, 밖에, 더, 있냐, ?, 놀리지마..."
2,0,"[그따위, 운전, 하면, 확, 갈아, 마셔, 버린다, ., 뭐라구, ?, 나와, 자..."
3,2,"[길동, 씨, 것좀, 처리, 거, 한, 게, 아닌데요, 팀, 내, 니, 가가, 어딨..."
4,4,"[비, 많이, 오네, ., 우산, 가져왔어, ?, 날씨, 추워졌어, ., 맞아, ,..."
...,...,...
8895,1,"[자네, 만나는가, ?, 봤다네, 소문, 할망구, 이라도, 하려는가, 먹고, ?]"
8896,0,"[다, 다, 눈치, 지금, 뭐라, 했노, 죽, 없는, 닌, 죽었다, 시키야, 어디서..."
8897,1,"[!, 거, 사례금, ?, 사례금, 내, 꺼, 그건, 만원, 는, 거, ., 찍어놨..."
8898,0,"[그거, 사줘, 벌어와, 죽는, 버릴거야, 이러지마, 정신차려, 없어, 한테, 해준..."


In [25]:
# 클래스 별 샘플 개수 확인
Counter(unique_augmented_data['type'])

Counter({4: 1000, 3: 2090, 0: 1782, 2: 1945, 1: 1945})

In [26]:
tokenized_df = unique_augmented_data

#### 단어사전 생성

In [27]:
def create_word_to_index(vocab_path):
    # 인코딩에 활용할 단어사전 딕셔너리 생성
    word_to_index = {}
    with open(vocab_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            tmp = line.split(": ")
            word = tmp[0]
            idx = int(tmp[1])
            word_to_index.update({word: idx})
    return word_to_index

In [28]:
vocab_path = '/aiffel/aiffel/dlthon-minions/share/preprocess/vocab.txt'
word_to_index=create_word_to_index(vocab_path)
word_to_index

{'<pad>': 0,
 '<unk>': 1,
 '.': 2,
 '?': 3,
 ',': 4,
 '!': 5,
 '내': 6,
 '해': 7,
 '뭐': 8,
 '는': 9,
 '도': 10,
 '좋아해': 11,
 '거': 12,
 '말': 13,
 '다': 14,
 '은': 15,
 '돈': 16,
 '친구': 17,
 '잘': 18,
 '니': 19,
 '있어': 20,
 '랑': 21,
 '요': 22,
 '영화': 23,
 '만': 24,
 '진짜': 25,
 '이야': 26,
 '정말': 27,
 '죄송합니다': 28,
 '한': 29,
 '게': 30,
 '지금': 31,
 '할': 32,
 '고': 33,
 '하고': 34,
 '한테': 35,
 '오늘': 36,
 '주로': 37,
 '님': 38,
 '주말': 39,
 '그냥': 40,
 '여행': 41,
 '돼': 42,
 '집': 43,
 '많이': 44,
 '자주': 45,
 '제발': 46,
 '가족': 47,
 '알': 48,
 '생각': 49,
 '거야': 50,
 '적': 51,
 '이랑': 52,
 '하는': 53,
 '더': 54,
 '운동': 55,
 '지': 56,
 '그렇게': 57,
 '너무': 58,
 '했어': 59,
 '빨리': 60,
 '회사': 61,
 '새끼': 62,
 '씨': 63,
 '만나': 64,
 '하면': 65,
 '아니야': 66,
 '없어': 67,
 '걸': 68,
 '수': 69,
 '애': 70,
 '면': 71,
 '줄': 72,
 '그런': 73,
 '이렇게': 74,
 '그게': 75,
 '그건': 76,
 '어제': 77,
 '서': 78,
 '이제': 79,
 '넌': 80,
 '대리': 81,
 '인데': 82,
 '사진': 83,
 '나도': 84,
 '엄마': 85,
 '아침': 86,
 '다녀': 87,
 '취미': 88,
 '부모님': 89,
 '만원': 90,
 '싶어': 91,
 '본': 92,
 '뭘': 93,
 '

#### 정수인코딩

In [29]:
# 'tokenized' 열의 데이터를 정수 인코딩
def encode_tokens(tokens, word_to_index):
    unk_index = word_to_index['<unk>']
    return [word_to_index.get(token, unk_index) for token in tokens]

tokenized_df['encoded'] = tokenized_df['tokenized'].apply(lambda x: encode_tokens(x, word_to_index))

In [30]:
tokenized_df.head()

Unnamed: 0,type,tokenized,encoded
0,4,"[학교, 점심, 뭐, 나와, ?, 주로, 한식, 나와, ., 학원, 다녀, ?, ,...","[101, 171, 8, 166, 3, 37, 223, 166, 2, 155, 87..."
1,3,"[뒤뚱거리, 는, 놈, ?, 뒤뚱거리, 는, 놈, 밖에, 더, 있냐, ?, 놀리지마...","[6378, 9, 201, 3, 6378, 9, 201, 179, 54, 516, ..."
2,0,"[그따위, 운전, 하면, 확, 갈아, 마셔, 버린다, ., 뭐라구, ?, 나와, 자...","[1330, 883, 65, 584, 3547, 272, 707, 2, 5233, ..."
3,2,"[길동, 씨, 것좀, 처리, 거, 한, 게, 아닌데요, 팀, 내, 니, 가가, 어딨...","[356, 63, 1301, 338, 12, 29, 30, 1302, 203, 6,..."
4,4,"[비, 많이, 오네, ., 우산, 가져왔어, ?, 날씨, 추워졌어, ., 맞아, ,...","[162, 44, 611, 2, 612, 517, 3, 124, 631, 2, 15..."


## 전처리 후 분석

#### 패딩 적용

In [31]:
# 대화 최대 길이 150으로 설정
MAX_LENGTH = 150

In [32]:
X = pad_sequences(tokenized_df['encoded'], maxlen=MAX_LENGTH, padding='post', truncating='post')

## 데이터 분할

In [33]:
y = tokenized_df['type']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, shuffle=True, random_state=42)

print('훈련 데이터의 개수 :', len(X_train))
print('훈련 레이블의 개수 :', len(y_train))
print('검증 데이터의 개수 :', len(X_val))
print('검증 레이블의 개수 :', len(y_val))
print('테스트 데이터의 개수 :', len(X_test))
print('테스트 레이블의 개수 :', len(y_test))

훈련 데이터의 개수 : 7009
훈련 레이블의 개수 : 7009
검증 데이터의 개수 : 876
검증 레이블의 개수 : 876
테스트 데이터의 개수 : 877
테스트 레이블의 개수 : 877


In [34]:
# 테스트 데이터셋 클래스 불균형 확인
counter = Counter(y_train)
counter.most_common()

[(3, 1672), (2, 1583), (1, 1539), (0, 1430), (4, 785)]

# 모델링

In [35]:
!pip install wandb==0.16.0



In [36]:
import wandb

key='809618c39f10bc0019fd6fd710cb28c698c30197'
wandb.login(key = key)

[34m[1mwandb[0m: Currently logged in as: [33m4rldur0[0m ([33m4-rldur0[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /aiffel/.netrc


True

In [93]:
sweep_config = {
    "name": "sweep_test_nlp",
    "metric": {"name": "val_loss", "goal": "minimize"},
    "method": "random",
    "parameters": {
        "learning_rate" : {
            "values": [0.03447]
            },
        "epoch" : {
            "values": [8]
            },
        "batch_size": {
            "values": [16]
            },
        "optimizer": {
            "values": ["adam"]
            },
        "dropout_rate":{
            "values": [0.1]
            }
        }
    }

default_config = {
        "vocab" : VOCAB_SIZE,
        "embeddings" : 128,
        "units_128" : 128,
        "units_256" : 256,
        "units_512" : 512,
        "units_1024" : 1024,
        "units_2048" : 2048,
        "kernel_3" : 3,
        "kernel_5" : 5,
        "class_num" : 5,
        "loss" : "sparse_categorical_crossentropy",
        "metrics" : ["accuracy"],
    }

In [38]:
def build_model_baseline(config):
    model=keras.models.Sequential()
    model.add(keras.layers.Embedding(config.vocab, config.embeddings))
    model.add(keras.layers.GRU(units = config.units_256, return_sequences = True))
    model.add(keras.layers.GRU(units = config.units_512))
    model.add(keras.layers.Dense(config.units_1024, activation='relu'))
    model.add(keras.layers.Dense(config.class_num, activation='softmax'))  
    return model

In [39]:
def build_model_1DCNN(config):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(config.vocab, config.embeddings))
    model.add(keras.layers.Conv1D(config.embeddings, config.kernel_5, activation='relu'))
    model.add(keras.layers.MaxPooling1D(pool_size=4))
    model.add(keras.layers.GlobalMaxPooling1D())
    model.add(keras.layers.Dense(config.units_128, activation='relu'))
    model.add(keras.layers.Dense(config.class_num, activation='softmax')) 
    return model

In [103]:
def build_model_1DCNN_2(config):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(config.vocab, config.embeddings))
    model.add(keras.layers.SpatialDropout1D(config.dropout_rate))
    model.add(keras.layers.Conv1D(config.embeddings, config.kernel_5, activation='relu'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPooling1D(pool_size=4))
    model.add(keras.layers.GlobalMaxPooling1D())
    model.add(keras.layers.Dense(config.units_1024, activation='relu'))
    model.add(keras.layers.Dropout(config.dropout_rate))
    model.add(keras.layers.Dense(config.class_num, activation='softmax')) 
    return model

In [85]:
def build_model_1DCNN_GRU(config):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(config.vocab, config.embeddings))
    model.add(keras.layers.SpatialDropout1D(config.dropout_rate))
    model.add(keras.layers.Conv1D(config.embeddings, config.kernel_5, activation='relu'))
    model.add(keras.layers.MaxPooling1D(pool_size=4))
    model.add(keras.layers.GRU(config.units_128, dropout=config.dropout_rate, recurrent_dropout=config.dropout_rate))
    model.add(keras.layers.Dense(config.class_num, activation='softmax'))
    return model

In [86]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
import wandb

# heatmap으로 비교 결과를 그려주는 함수
def plot_table(cm):
    title = "Overall Prediction Result"
    # 실제 클래스명으로 변환
    classes = [
        '협박 대화',
        '갈취 대화',
        '직장 내 괴롭힘 대화',
        '기타 괴롭힘 대화',
        '일반 대화'
    ]
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes, annot_kws={'size': 30})
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    return plt

# epoch이 모두 끝나면 각 클래스별 GT와 예측값을 비교한 표 시각화
# 5x5의 표의 [i,j]의 위치는 실제값은 i번째 클래스이고, 예측값은 j번째 클래스임을 나타냄
class CompareResultsCallback(keras.callbacks.Callback):
    def __init__(self, X_test, y_test, class_num):
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.class_num = class_num
        # 전체 표 초기화
        self.table = np.zeros((self.class_num, self.class_num), dtype=np.int32)

    def on_train_end(self, epoch, logs=None):
        # 마지막 epoch에서만 계산
        pred_test = self.model.predict(self.X_test).argmax(axis=1)
        self.y_test = np.array(self.y_test)
        pred_test = np.array(pred_test)
            
        # 5개의 클래스에서 두 개씩 뽑아내어 비교
        # 실제값이 class_a일 때, 예측값을 claas_b로 예측한 횟수
        for class_a in range(self.class_num):
            for class_b in range(self.class_num):
                num = len(np.where((self.y_test == class_a) & (pred_test == class_b))[0])
                self.table[class_a, class_b] += num

        # 표 그리기
        cr_plot = plot_table(self.table)
            
        # wandb에 로그로 저장
        cr_image = wandb.Image(cr_plot)
        wandb.log({"Overall Prediction Result": cr_image})


In [104]:
# 학습 함수 정의
# CompareResultsCallback 테스트 데이터셋을 활용하므로 인자로 넣어줌
def train(default_config, X_test, y_test):

    wandb.init(config = default_config)
    config = wandb.config
    
    keras.backend.clear_session()

    # Model
    model = build_model_1DCNN_2(config)

    # Compile
    model.compile(optimizer = config.optimizer,
                  loss = config.loss,
                  metrics = config.metrics)
    
    # 비교 결과 그리는 콜백
    cr_callback = CompareResultsCallback(X_test, y_test, config.class_num)
    # earlystopping 콜백
    es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    
    # 학습
    history = model.fit(X_train, y_train,
              epochs = config.epoch,
              batch_size = config.batch_size,
              validation_data = (X_val, y_val),
              callbacks=[wandb.keras.WandbCallback(), cr_callback, es_callback])
    
    # test dataset으로 accuracy 계산    
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
    pred_test = model.predict(X_test).argmax(axis=1)
    # test dataset으로 f1 score 계산
    f1_score_res = f1_score(y_test, pred_test, average='micro')

    # wandb에 log 추가
    wandb.log({
        "Test Accuracy Rate": test_accuracy,
        "Test F1 Score": f1_score_res,
        "Test Error Rate": 1 - test_accuracy
    })
    
    return history

In [105]:
# train()에 인자가 있으므로 wrapper function 정의
def sweep_train():
    train(default_config=default_config, X_test=X_test, y_test=y_test)

# 팀프로젝트 내에서 sweep 실행
sweep_id = wandb.sweep(sweep_config,
                       entity = 'aiffel_minions',
                       project = 'DLthon_CNN2_Augmented')


wandb.agent(sweep_id,
            function=sweep_train,
            count=1)

Create sweep with ID: ggo3ndxu
Sweep URL: https://wandb.ai/aiffel_minions/DLthon_CNN2_Augmented/sweeps/ggo3ndxu


[34m[1mwandb[0m: Agent Starting Run: qptfec6f with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout_rate: 0.5
[34m[1mwandb[0m: 	epoch: 8
[34m[1mwandb[0m: 	learning_rate: 0.03447
[34m[1mwandb[0m: 	optimizer: adam


Epoch 1/8


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run qptfec6f errored: TypeError("in user code:\n\n    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:853 train_function  *\n        return step_function(self, iterator)\n    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:842 step_function  **\n        outputs = model.distribute_strategy.run(run_step, args=(data,))\n    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run\n        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica\n        return self._call_for_each_replica(fn, args, kwargs)\n    /opt/conda/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica\n        return fn(*args, **kwargs)\n    /opt/conda/lib/python3.9/site-packages/keras/engine/training.py:835 run_step  **\n        outputs = model.train_step(data)\n   