In [1]:
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False  # 마이너스 표시 해결
# 한글설정
matplotlib.rcParams['font.family'] = 'AppleGothic' # Mac사용자
matplotlib.rcParams['font.size'] = '10' # 글자크기

In [2]:
# imdb데이터 불러오기
# 단어사전은 500개 까지 만 가져옴
# 원핫인코딩하면 500개 컬럼이 만들어짐.
(train_input,train_target),(test_input,test_target) = keras.datasets.imdb.load_data(
    num_words = 2000
)

In [3]:
print(train_input.shape,test_input.shape)

(25000,) (25000,)


In [4]:
# 2진 분류 : 긍정 - 1, 부정 - 0
print(train_target[:20])

[1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1]


In [5]:
from sklearn.model_selection import train_test_split
train_input,val_input,train_target,val_target = train_test_split(
    train_input,train_target,test_size=0.2, random_state=42
)

In [6]:
print(train_input.shape,val_input.shape)

(20000,) (5000,)


In [7]:
lengths = np.array([len(x) for x in train_input])
lengths

array([259, 520, 290, ..., 300,  70,  77])

In [8]:
print(np.mean(lengths),np.median(lengths))

239.00925 178.0


In [9]:
np.max(lengths)

1854

In [10]:
# 시퀀스 패딩
# 글자의 수를 제한해서 없는 부분은 0으로 채워줌
# 최대 글자수 100개 제한
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_seq = pad_sequences(train_input,maxlen=300)

In [11]:
# 20000개 샘플, 크기 100
train_seq.shape

(20000, 300)

In [12]:
# 검증세트 - 시퀀스패딩
val_seq = pad_sequences(val_input,maxlen=300)

In [13]:
train_seq[0][0:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32)

단어 임베딩

In [14]:
model = keras.Sequential()
# 임베딩층 추가
model.add(keras.layers.Embedding(2000,16,input_length=300))
model.add(keras.layers.SimpleRNN(8))
model.add(keras.layers.Dense(1,activation='sigmoid'))
model.summary()



In [15]:
# 인공신경망 훈련 동일
# Flatten 필요없음.
# 원핫인코딩을 사용하지 않음. - 임베딩층 사용
# train_oh -> train_seq 데이터 사용
rmsprop = keras.optimizers.RMSprop(learning_rate=0.0001)
model.compile(optimizer=rmsprop,loss='binary_crossentropy',metrics=['accuracy'])
checkpoint_cb = keras.callbacks.ModelCheckpoint('simpleRnn_embedding_model.keras')
early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,
                                                  restore_best_weights=True )
history = model.fit(train_seq,train_target,batch_size=64,epochs=100,
                validation_data=(val_seq,val_target), 
                callbacks=[checkpoint_cb,early_stopping_cb])

Epoch 1/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.5441 - loss: 0.6862 - val_accuracy: 0.6622 - val_loss: 0.6434
Epoch 2/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.7017 - loss: 0.6214 - val_accuracy: 0.7550 - val_loss: 0.5929
Epoch 3/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.7652 - loss: 0.5752 - val_accuracy: 0.7784 - val_loss: 0.5524
Epoch 4/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.7905 - loss: 0.5355 - val_accuracy: 0.7586 - val_loss: 0.5421
Epoch 5/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.8170 - loss: 0.4998 - val_accuracy: 0.8206 - val_loss: 0.4881
Epoch 6/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.8292 - loss: 0.4708 - val_accuracy: 0.8160 - val_loss: 0.4759
Epoch 7/100
[1m

In [16]:
# stop위치 출력 - 2번 전의 모델이 가장 좋음.
early_stopping_cb.stopped_epoch

22

In [17]:
model.evaluate(val_seq,val_target)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8528 - loss: 0.3506


[0.34686243534088135, 0.8575999736785889]

In [18]:
# test_input
# 시퀀스 패딩
test_seq = pad_sequences(test_input,maxlen=100)
# 원핫인코딩
# test_oh = keras.utils.to_categorical(test_seq)
model.evaluate(test_seq,test_target)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8245 - loss: 0.4015


[0.40021172165870667, 0.8244799971580505]

In [19]:
# 딥러닝 훈련
# 데이터전처리 train_seq = pad_sequences(train_input,maxlen=100)

# 모델결정 model = keras.Sequential()
# 순환 신경망 (RNN)
# model2.add(keras.layers.Embedding(500,16,input_length=100))
# model.add(keras.layers.SimpleRNN(8))
# model.add(keras.layers.Dense(1,activation='sigmoid'))
# model.summary()


# 인공신경망 훈련
# 인공신경망 훈련 동일
# Flatten 필요없음. 원핫인코딩
# 모델설정 옵티마이저 - rmsprop
# rmsprop = keras.optimizers.RMSprop(learning_rate=0.0001)
# model.compile(optimizer=rmsprop,loss='binary_crossentropy',metrics=['accuracy'])
# 콜백모델체크포인트 checkpoint_cb = keras.callbacks.ModelCheckpoint('simpleRnn_model.keras')
# 종기종료 early_stopping_cb = keras.callbacks.EarlyStopping(patience=3,restore_best_weights=True )
# 모델훈련 history = model.fit(train_seq,train_target,batch_size=64,epochs=100,
#                 validation_data=(val_oh,val_target), 
#                 callbacks=[checkpoint_cb,early_stopping_cb])
# 모델평가 model.evaluate(val_seq,val_target)

# test세트 검증 - 3차원행렬로 변경해서 test진행해야 함.
# # 시퀀스 패딩
# test_seq = pad_sequences(test_input,maxlen=100)
# # 원핫인코딩
# #test_oh = keras.utils.to_categorical(test_seq)
# model.evaluate(test_seq,test_target)