In [12]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
# Load movie_data.csv (read as UTF-8) into a DataFrame; the next cell expects
# it to contain a 'sentiment' label column.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [13]:
# Step 1: build the dataset.
# Select the label column and the remaining feature column(s) WITHOUT mutating
# `df`. The previous `df.pop('sentiment')` removed the column in place, so
# re-running this cell raised KeyError: 'sentiment'.
target = df['sentiment']
features = df.drop(columns='sentiment')

# Wrap the (features, label) arrays in a tf.data.Dataset; each element is one
# row of `features` paired with its label.
ds_raw = tf.data.Dataset.from_tensor_slices(
    (features.values, target.values))

# Sanity check: print the first 50 bytes of the first feature column and the
# label for three examples.
# NOTE(review): assumes the first non-label column holds the review text.
for ex in ds_raw.take(3):
    tf.print(ex[0].numpy()[0][:50], ex[1])
    

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [16]:
# The full dataset has 50,000 examples. After one fixed shuffle, the first
# 25,000 are used for testing, the next 20,000 for training, and the final
# 5,000 for validation.
tf.random.set_seed(1)

# reshuffle_each_iteration=False keeps the shuffled order fixed across
# iterations, so the take/skip partitions below do not overlap between passes.
ds_raw = ds_raw.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

# Everything after the test portion is shared by training and validation.
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)

ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)

In [17]:
# Step 2: find the unique tokens (words) in the training split.
from collections import Counter  # accumulates per-token frequencies

# Splits raw text into word tokens.
tokenizer = tfds.deprecated.text.Tokenizer()

token_counts = Counter()
for review, _label in ds_raw_train:
    # review.numpy()[0] is the first feature column of this example.
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))

print('어휘 사전 크기 :', len(token_counts))

어휘 사전 크기 : 87505


In [24]:
# Step 3: encode each unique token as an integer.
# The Counter is passed as the vocabulary list.
encoder = tfds.deprecated.text.TokenTextEncoder(vocab_list=token_counts)

# Quick check: encode a short sample sentence and show the resulting ids.
example_str = 'This is an example!'
print(encoder.encode(example_str))

[176, 48, 247, 716]


In [27]:
# Step 3-A: define the Python-side transformation function.
def encode(text_tensor, label):
    """Encode one example's text into token ids, passing the label through.

    Args:
        text_tensor: eager tensor whose first element is the text to encode.
        label: label for the example; returned unchanged.

    Returns:
        A ``(token_ids, label)`` tuple, where ``token_ids`` is whatever the
        module-level ``encoder.encode`` returns for the text.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

# Step 3-B: wrap the Python function as a TensorFlow op.
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function`` so it can be used in ``Dataset.map``.

    Args:
        text: text tensor for one example.
        label: label tensor for one example.

    Returns:
        The ``tf.py_function`` outputs, declared as two ``tf.int64`` tensors
        (encoded text, label).
    """
    return tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
# Apply the integer encoding to each split (train, validation, test).
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# Inspect the encoded sequence lengths of a few training examples.
tf.random.set_seed(1)
for encoded_text, _label in ds_train.shuffle(1000).take(5):
    print('시퀀스 길이 :', encoded_text.shape)

시퀀스 길이 : (81,)
시퀀스 길이 : (117,)
시퀀스 길이 : (92,)
시퀀스 길이 : (305,)
시퀀스 길이 : (102,)


In [33]:
# padded_batch pads every element of a batch (with zeros by default) so that
# all sequences in the batch share the same length.
def _padded_batches(dataset, batch_size=32):
    """Batch `dataset`, padding the sequence component to the batch's longest length."""
    # [-1] pads the variable-length id sequence; [] leaves the scalar label as is.
    return dataset.padded_batch(batch_size, padded_shapes=([-1], []))


train_data = _padded_batches(ds_train)
valid_data = _padded_batches(ds_valid)
test_data = _padded_batches(ds_test)

In [34]:
# Embedding maps integer token ids to dense, low-dimensional vectors.
from tensorflow.keras.layers import Embedding

# Demo: 100 possible ids, 6-dimensional vectors, sequences of length 20.
model = tf.keras.Sequential([
    Embedding(input_dim=100, output_dim=6, input_length=20, name='embed-layer'),
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [35]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN

# Demo: a stacked SimpleRNN model declared as a single list of layers.
model = Sequential([
    Embedding(input_dim=1000, output_dim=32),
    # return_sequences=True emits the full sequence so the next RNN layer can consume it.
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [37]:
embedding_dim = 20
# Two more ids than the number of counted tokens.
# NOTE(review): presumably one for the 0 padding value and one for
# out-of-vocabulary tokens - confirm against the encoder's id layout.
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)

# Build the model layer by layer.
bi_lstm_model = tf.keras.Sequential()
bi_lstm_model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer',
    )
)
bi_lstm_model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name='lstm-layer'),
        name='bidir-lstm',
    )
)
bi_lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
# Sigmoid output, matching from_logits=False in the loss below.
bi_lstm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
bi_lstm_model.summary()

# Compile and train.
bi_lstm_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
)

history = bi_lstm_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=10,
)

# Evaluate on the test data; index 1 of the results is the accuracy metric.
test_results = bi_lstm_model.evaluate(test_data)
print('테스트 정확도 : {:.2f}%'.format(100 * test_results[1]))

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1750140   
_________________________________________________________________
bidir-lstm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 1,801,981
Trainable params: 1,801,981
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
테스트 정확도 : 85.08%
