In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'konlpy'

In [2]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7a6b49b72590>)

In [3]:
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [4]:
train_data

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [5]:
# 결측값 제거 (document 열에서 NaN 값이 있는 경우 삭제)
train_data = train_data.dropna(subset=['document'])

# 텍스트가 문자열인지 확인 후 길이 계산
train_data['document'] = train_data['document'].astype(str)  # 모든 document를 문자열로 변환

# 리뷰 텍스트의 길이 계산
train_data['length'] = train_data['document'].apply(len)

# 최솟값, 최댓값, 평균 계산
min_length = train_data['length'].min()
max_length = train_data['length'].max()
avg_length = train_data['length'].mean()

print(f"최솟값: {min_length}")
print(f"최댓값: {max_length}")
print(f"평균값: {avg_length:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['document'] = train_data['document'].astype(str)  # 모든 document를 문자열로 변환


최솟값: 1
최댓값: 146
평균값: 35.20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['length'] = train_data['document'].apply(len)


In [6]:
from tokenizers import ByteLevelBPETokenizer

# Initialize the BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train the tokenizer on your dataset
tokenizer.train(files='ratings_train.txt', vocab_size=8000, min_frequency=2, special_tokens=["<pad>", "<s>", "</s>", "<unk>"])
tokenizer.save_model(".", "bpe_tokenizer")

['./bpe_tokenizer-vocab.json', './bpe_tokenizer-merges.txt']

In [7]:
tokenizer = ByteLevelBPETokenizer("bpe_tokenizer-vocab.json", "bpe_tokenizer-merges.txt")

# Ensure all entries are strings and handle missing values
train_data['document'] = train_data['document'].astype(str).fillna('')
test_data['document'] = test_data['document'].astype(str).fillna('')

# Tokenize the data with BPE
train_data['tokenized'] = train_data['document'].apply(lambda x: tokenizer.encode(x).ids)
test_data['tokenized'] = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['document'] = train_data['document'].astype(str).fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['tokenized'] = train_data['document'].apply(lambda x: tokenizer.encode(x).ids)


In [8]:
train_data

Unnamed: 0,id,document,label,length,tokenized
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0,19,"[337, 2551, 266, 621, 4522, 484, 4496]"
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1,33,"[4042, 339, 3200, 811, 3178, 360, 862, 431, 45..."
2,10265843,너무재밓었다그래서보는것을추천한다,0,17,"[745, 496, 112, 245, 692, 4473, 1146, 2484, 30..."
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0,29,"[1011, 297, 609, 1071, 1439, 5851, 1319, 1773,..."
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1,61,"[5052, 6206, 2404, 478, 309, 5784, 1084, 3495,..."
...,...,...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0,19,"[2339, 267, 2071, 287, 266, 700, 273, 1186, 21..."
149996,8549745,평점이 너무 낮아서...,1,13,"[1783, 465, 5017, 339]"
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0,31,"[1277, 802, 323, 34, 1107, 3870, 865, 334, 117..."
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1,27,"[917, 2077, 1660, 7591, 17, 1136, 1478, 450, 4..."


In [9]:
import pandas as pd
import sentencepiece as spm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

max_len = 40
X = pad_sequences(train_data['tokenized'], maxlen=max_len, padding='post')

# 7. 레이블 준비
y = to_categorical(train_data['label'])
# 8. 데이터 분할 (훈련셋과 검증셋)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000  # 어휘 사전의 크기
word_vector_dim = 512   # 임베딩 벡터의 차원

# 1. 모델 구성
model1 = tf.keras.Sequential()
model1.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))

# 첫 번째 Conv1D 레이어
model1.add(tf.keras.layers.Conv1D(256, 3, activation='relu'))  # 커널 크기 3
model1.add(tf.keras.layers.MaxPooling1D(2))  # 풀링 크기 2

# 두 번째 Conv1D 레이어
model1.add(tf.keras.layers.Conv1D(128, 3, activation='relu'))  # 커널 크기 3
model1.add(tf.keras.layers.GlobalMaxPooling1D())

# Dense 레이어
model1.add(tf.keras.layers.Dense(32, activation='relu'))
model1.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # 이진 분류를 위한 출력층

# 2. 모델 컴파일
model1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

y_train_binary = np.argmax(y_train, axis=1)
y_val_binary = np.argmax(y_val, axis=1)

# 모델 요약 정보 출력
model1.summary()

  super().__init__(**kwargs)


In [11]:
epochs= 5  # 몇 epoch를 훈련하면 좋을지 결과를 보면서 바꾸어 봅시다.

history = model1.fit(X_train,
                    y_train_binary,  # 이진 레이블을 사용
                    epochs=epochs,
                    batch_size=512,
                    validation_data=(X_val, y_val_binary),  # 이진 레이블을 사용
                    verbose=1)

Epoch 1/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 61ms/step - accuracy: 0.7428 - loss: 0.4947 - val_accuracy: 0.8507 - val_loss: 0.3380
Epoch 2/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.8795 - loss: 0.2891 - val_accuracy: 0.8517 - val_loss: 0.3373
Epoch 3/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9193 - loss: 0.2068 - val_accuracy: 0.8399 - val_loss: 0.3819
Epoch 4/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 22ms/step - accuracy: 0.9555 - loss: 0.1270 - val_accuracy: 0.8354 - val_loss: 0.4563
Epoch 5/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.9789 - loss: 0.0632 - val_accuracy: 0.8385 - val_loss: 0.5914


In [12]:
# 1. 테스트 데이터 로드
test_data = pd.read_table('ratings_test.txt')

# 2. 결측값 제거
test_data = test_data.dropna(subset=['document','label'])

# 3. 레이블 준비(감정 분석 레이블)
y_test = test_data['label'].values

# 이미 학습된 sentencepiece 모델을 로드
#import sentencepiece as spm
#sp = spm.SentencePieceProcessor()
#sp.load('naver_spm.model')
#test_data['tokenized'] = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)
# 리뷰를 토큰화하여 ID로 변환
x_test = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)

# 5. 패딩 처리 (최대 길이 80으로 설정)
max_len = 40
x_test_padded = pad_sequences(x_test, maxlen=max_len, padding='post')

# 이제 x_test_padded와 y_test를 사용하여 모델을 평가할 수 있습니다.
results = model1.evaluate(x_test_padded, y_test, verbose=2)
print(f"테스트 데이터에서의 손실: {results[0]}, 정확도: {results[1]}")

1563/1563 - 4s - 2ms/step - accuracy: 0.8364 - loss: 0.6057
테스트 데이터에서의 손실: 0.6057165265083313, 정확도: 0.8363701701164246


# 모델2 구성(Global MaxPooling only)

In [13]:
import tensorflow as tf
vocab_size =50000
word_vector_dim = 512

model2 = tf.keras.Sequential()
model2.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))
model2.add(tf.keras.layers.GlobalMaxPooling1D())
model2.add(tf.keras.layers.Dense(128, activation='relu'))
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # 최종 출력은 긍정/부정을 나타내는 1dim 입니다.

model2.summary()

  super().__init__(**kwargs)


In [14]:
model2.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

y_train_binary = np.argmax(y_train, axis=1)
y_val_binary = np.argmax(y_val, axis=1)
epoch = 5
history2 = model2.fit(X_train,
                    y_train_binary,  # 이진 레이블을 사용
                    epochs=epoch,
                    batch_size=512,
                    validation_data=(X_val, y_val_binary),  # 이진 레이블을 사용
                    verbose=1)

Epoch 1/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.7307 - loss: 0.5300 - val_accuracy: 0.8500 - val_loss: 0.3408
Epoch 2/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8820 - loss: 0.2846 - val_accuracy: 0.8507 - val_loss: 0.3426
Epoch 3/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9134 - loss: 0.2234 - val_accuracy: 0.8445 - val_loss: 0.3645
Epoch 4/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9436 - loss: 0.1625 - val_accuracy: 0.8436 - val_loss: 0.4121
Epoch 5/5
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9726 - loss: 0.0935 - val_accuracy: 0.8369 - val_loss: 0.4895


In [15]:
# 1. 테스트 데이터 로드
test_data = pd.read_table('ratings_test.txt')

# 2. 결측값 제거
test_data = test_data.dropna(subset=['document','label'])

# 3. 레이블 준비(감정 분석 레이블)
y_test = test_data['label'].values

# 이미 학습된 sentencepiece 모델을 로드


# 리뷰를 토큰화하여 ID로 변환
x_test = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)

# 5. 패딩 처리 (최대 길이 80으로 설정)
max_len = 45
x_test_padded = pad_sequences(x_test, maxlen=max_len, padding='post')

# 이제 x_test_padded와 y_test를 사용하여 모델을 평가할 수 있습니다.
results = model2.evaluate(x_test_padded, y_test, verbose=2)
print(f"테스트 데이터에서의 손실: {results[0]}, 정확도: {results[1]}")

1563/1563 - 3s - 2ms/step - accuracy: 0.8352 - loss: 0.5035
테스트 데이터에서의 손실: 0.5034829378128052, 정확도: 0.8351700901985168


# Bert 모델 적용

In [17]:
# 포지셔널 인코딩 레이어
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.position = position
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        # 각도 계산
        angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * angles

    def positional_encoding(self, position, d_model):
        # 포지셔널 인코딩 계산
        angle_rads = self.get_angles(
            np.arange(position)[:, np.newaxis],
            np.arange(d_model)[np.newaxis, :],
            d_model)

        # 배열의 짝수 인덱스에는 sin 함수 적용
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

        # 배열의 홀수 인덱스에는 cos 함수 적용
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]

        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

    def get_config(self):
        config = super(PositionalEncoding, self).get_config()
        config.update({
            'position': self.position,
            'd_model': self.d_model
        })
        return config

In [16]:
# 패딩 마스크 생성 함수
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # (batch_size, 1, 1, seq_len)
    return seq[:, tf.newaxis, tf.newaxis, :]

In [18]:
def scaled_dot_product_attention(q, k, v, mask):
    """ 스케일드 닷 프로덕트 어텐션 """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # Q와 K의 행렬 곱

    # 스케일링
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # 마스크가 주어졌을 경우 -inf 적용
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax를 통해 attention weights 계산
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    # attention weights와 V의 행렬곱
    output = tf.matmul(attention_weights, v)

    return output

In [19]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % num_heads == 0

        self.depth = d_model // num_heads

        # 가중치 행렬 정의
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """ 헤드 분할: [batch_size, seq_len, d_model] -> [batch_size, num_heads, seq_len, depth] """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]

        # 선형 변환
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # 헤드 분할
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # 스케일드 닷 프로덕트 어텐션
        scaled_attention = scaled_dot_product_attention(q, k, v, mask)

        # 헤드 병합
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        # 최종 선형 변환
        output = self.dense(concat_attention)

        return output

    def get_config(self):
        # 직렬화 지원을 위한 설정 저장
        config = super(MultiHeadAttention, self).get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.position = position
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(position, d_model)

    def get_angles(self, position, i, d_model):
        # 각도 계산
        angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * angles

    def positional_encoding(self, position, d_model):
        # 포지셔널 인코딩 계산
        angle_rads = self.get_angles(
            np.arange(position)[:, np.newaxis],
            np.arange(d_model)[np.newaxis, :],
            d_model)

        # 배열의 짝수 인덱스에는 sin 함수 적용
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

        # 배열의 홀수 인덱스에는 cos 함수 적용
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        pos_encoding = angle_rads[np.newaxis, ...]

        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

    def get_config(self):
        config = super(PositionalEncoding, self).get_config()
        config.update({
            'position': self.position,
            'd_model': self.d_model
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [20]:
# 인코더 레이어
def encoder_layer(units, d_model, num_heads, dropout_rate, name="encoder_layer"):
    inputs = tf.keras.Input(shape=(None, d_model), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # 멀티 헤드 어텐션
    attention = MultiHeadAttention(d_model, num_heads)(inputs, inputs, inputs, padding_mask)
    attention = tf.keras.layers.Dropout(dropout_rate)(attention)
    attention = tf.keras.layers.LayerNormalization(epsilon=1e-6)(inputs + attention)

    # 포지션 와이즈 피드 포워드 네트워크
    outputs = tf.keras.layers.Dense(units, activation='relu')(attention)
    outputs = tf.keras.layers.Dense(d_model)(outputs)
    outputs = tf.keras.layers.Dropout(dropout_rate)(outputs)
    outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention + outputs)

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [21]:
# 인코더 전체
def encoder(vocab_size,
            num_layers,
            units,
            d_model,
            num_heads,
            dropout_rate,
            maximum_position_encoding,
            name="encoder"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    padding_mask = tf.keras.Input(shape=(1, 1, None), name="padding_mask")

    # 임베딩 및 포지셔널 인코딩
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))
    embeddings = PositionalEncoding(maximum_position_encoding, d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(dropout_rate)(embeddings)

    # 인코더 레이어 쌓기
    for i in range(num_layers):
        outputs = encoder_layer(units=units,
                                d_model=d_model,
                                num_heads=num_heads,
                                dropout_rate=dropout_rate,
                                name=f"encoder_layer_{i}",
                                )(inputs=[outputs, padding_mask])

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)

In [22]:
def build_encoder_model():
    inputs = tf.keras.Input(shape=(40,), name="inputs")
    padding_mask = tf.keras.layers.Lambda(create_padding_mask, output_shape=(1, 1, None), name='padding_mask')(inputs)

    # Embedding Layer
    embeddings = tf.keras.layers.Embedding(vocab_size, d_model)(inputs)
    embeddings *= tf.math.sqrt(tf.cast(d_model, tf.float32))

    # Convert embeddings to dense before applying positional encoding
    embeddings = tf.keras.layers.Lambda(lambda x: tf.sparse.to_dense(x) if tf.is_tensor(x) and isinstance(x, tf.sparse.SparseTensor) else x)(embeddings)
    embeddings = PositionalEncoding(position=40, d_model=d_model)(embeddings)

    outputs = tf.keras.layers.Dropout(dropout_rate)(embeddings)

    # Stack Encoder Layers
    for i in range(num_layers):
        outputs = encoder_layer(units=units,
                                d_model=d_model,
                                num_heads=num_heads,
                                dropout_rate=dropout_rate,
                                name=f"encoder_layer_{i}",
                                )(inputs=[outputs, padding_mask])

    # Global Average Pooling
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(outputs)

    # Output Layer for Classification
    outputs = tf.keras.layers.Dense(units=num_classes, activation='softmax', name='outputs')(pooled_output)

    # Model Definition
    model = tf.keras.Model(inputs=inputs, outputs=outputs, name="transformer_classifier")

    return model


In [23]:
# 하이퍼파라미터 설정
vocab_size = 30000
num_layers = 2
d_model = 256
num_heads = 4
units = 512
dropout_rate = 0.4
num_classes = 5  # 클래스 수

In [24]:
import numpy as np

# Convert one-hot labels to integer labels
y_train = np.argmax(y_train, axis=1)
y_val = np.argmax(y_val, axis=1)

In [25]:
# 모델 생성
model = build_encoder_model()

# 모델 컴파일
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# 모델 요약
model.summary()

# 모델 학습

In [26]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
# 콜백 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

model_checkpoint = ModelCheckpoint(
    filepath='best_model.keras',  # Use .h5 if preferred
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)
# 모델 학습
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=32,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/6
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7407 - loss: 0.5218
Epoch 1: val_loss improved from inf to 0.42191, saving model to best_model.keras
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 11ms/step - accuracy: 0.7407 - loss: 0.5218 - val_accuracy: 0.7909 - val_loss: 0.4219
Epoch 2/6
[1m3747/3750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.8244 - loss: 0.3914
Epoch 2: val_loss improved from 0.42191 to 0.39123, saving model to best_model.keras
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 8ms/step - accuracy: 0.8244 - loss: 0.3914 - val_accuracy: 0.8262 - val_loss: 0.3912
Epoch 3/6
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8337 - loss: 0.3760
Epoch 3: val_loss did not improve from 0.39123
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 9ms/step - accuracy: 0.8337 - loss: 0.3760 - v

# 모델 LSTM 구성

In [28]:
import tensorflow as tf

vocab_size = 50000
word_vector_dim = 512

model3 = tf.keras.Sequential()
model3.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))

# 첫 번째 LSTM 레이어 (깊이 증가)
model3.add(tf.keras.layers.LSTM(64, return_sequences=True))  # 중간 층에서도 시퀀스 출력
model3.add(tf.keras.layers.Dropout(0.3))  # Dropout 추가

# 두 번째 LSTM 레이어 (크기 증가)
model3.add(tf.keras.layers.LSTM(64))
model3.add(tf.keras.layers.Dropout(0.3))

# Dense 레이어 (뉴런 수 증가)
model3.add(tf.keras.layers.Dense(64, activation='relu'))
model3.add(tf.keras.layers.Dropout(0.3))

# 출력 레이어
model3.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # 긍정/부정 분류

# 모델 요약
model3.summary()

In [30]:
model3.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

# Train the model
history = model3.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6708 - loss: 0.5690
Epoch 1: val_loss improved from 0.39123 to 0.34340, saving model to best_model.keras
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 25ms/step - accuracy: 0.6708 - loss: 0.5690 - val_accuracy: 0.8482 - val_loss: 0.3434
Epoch 2/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8668 - loss: 0.3101
Epoch 2: val_loss improved from 0.34340 to 0.31990, saving model to best_model.keras
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 26ms/step - accuracy: 0.8668 - loss: 0.3101 - val_accuracy: 0.8599 - val_loss: 0.3199
Epoch 3/10
[1m3749/3750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.8946 - loss: 0.2545
Epoch 3: val_loss did not improve from 0.31990
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 24ms/step - accuracy: 0.8946 - los

In [32]:
# 1. 테스트 데이터 로드
test_data = pd.read_table('ratings_test.txt')

# 2. 결측값 제거
test_data = test_data.dropna(subset=['document','label'])

# 3. 레이블 준비(감정 분석 레이블)
y_test = test_data['label'].values

# 이미 학습된 sentencepiece 모델을 로드
#import sentencepiece as spm
#sp = spm.SentencePieceProcessor()
#sp.load('naver_spm.model')
#test_data['tokenized'] = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)
# 리뷰를 토큰화하여 ID로 변환
x_test = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)

# 5. 패딩 처리 (최대 길이 80으로 설정)
max_len = 40
x_test_padded = pad_sequences(x_test, maxlen=max_len, padding='post')

# 이제 x_test_padded와 y_test를 사용하여 모델을 평가할 수 있습니다.
results = model3.evaluate(x_test_padded, y_test, verbose=2)
print(f"테스트 데이터에서의 손실: {results[0]}, 정확도: {results[1]}")

1563/1563 - 5s - 3ms/step - accuracy: 0.8539 - loss: 0.3310
테스트 데이터에서의 손실: 0.3310238718986511, 정확도: 0.8539312481880188


## LSTM 정확도는 85%이다.

# Bi-LSTM 모델 적용

In [34]:
import tensorflow as tf

vocab_size = 50000
word_vector_dim = 512

model4 = tf.keras.Sequential()
model4.add(tf.keras.layers.Embedding(vocab_size, word_vector_dim, input_shape=(None,)))

# 첫 번째 Bi-LSTM 레이어 (양방향)
model4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model4.add(tf.keras.layers.Dropout(0.3))  # Dropout 추가

# 두 번째 Bi-LSTM 레이어
model4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model4.add(tf.keras.layers.Dropout(0.3))

# Dense 레이어 (뉴런 수 증가)
model4.add(tf.keras.layers.Dense(64, activation='relu'))
model4.add(tf.keras.layers.Dropout(0.3))

# 출력 레이어
model4.add(tf.keras.layers.Dense(1, activation='sigmoid'))  # 긍정/부정 분류

# 모델 요약
model4.summary()

  super().__init__(**kwargs)


In [35]:
model4.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])

# Train the model
history = model4.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, model_checkpoint]
)

Epoch 1/10
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.7713 - loss: 0.4557
Epoch 1: val_loss did not improve from 0.31990
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 29ms/step - accuracy: 0.7713 - loss: 0.4556 - val_accuracy: 0.8545 - val_loss: 0.3310
Epoch 2/10
[1m3749/3750[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 29ms/step - accuracy: 0.8754 - loss: 0.2939
Epoch 2: val_loss did not improve from 0.31990
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 30ms/step - accuracy: 0.8754 - loss: 0.2939 - val_accuracy: 0.8595 - val_loss: 0.3263


In [36]:
# 1. 테스트 데이터 로드
test_data = pd.read_table('ratings_test.txt')

# 2. 결측값 제거
test_data = test_data.dropna(subset=['document','label'])

# 3. 레이블 준비(감정 분석 레이블)
y_test = test_data['label'].values

# 이미 학습된 sentencepiece 모델을 로드
#import sentencepiece as spm
#sp = spm.SentencePieceProcessor()
#sp.load('naver_spm.model')
#test_data['tokenized'] = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)
# 리뷰를 토큰화하여 ID로 변환
x_test = test_data['document'].apply(lambda x: tokenizer.encode(x).ids)

# 5. 패딩 처리 (최대 길이 80으로 설정)
max_len = 40
x_test_padded = pad_sequences(x_test, maxlen=max_len, padding='post')

# 이제 x_test_padded와 y_test를 사용하여 모델을 평가할 수 있습니다.
results = model4.evaluate(x_test_padded, y_test, verbose=2)
print(f"테스트 데이터에서의 손실: {results[0]}, 정확도: {results[1]}")

1563/1563 - 8s - 5ms/step - accuracy: 0.8479 - loss: 0.3405
테스트 데이터에서의 손실: 0.34047386050224304, 정확도: 0.8478908538818359


# Bi-LSTM 정확도는 84.7%이다.

# 회고
###감정분류 데이터를 가지고 여러가지 모델로 분류 task를 진행하였는데 LSTM모델이 가장 BEST(정확도:85%)인거 같다.