# 1. 다대일 구조의 다층 RNN 구현

## 영화 리뷰 데이터 준비

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [2]:
# Create the download directory and fetch the compressed movie-review dataset.
# -p makes mkdir a no-op when the directory already exists, so the cell can be re-run without an error.
!mkdir -p ../ch08
!wget https://github.com/rickiepark/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz -O ../ch08/movie_data.csv.gz

--2023-02-02 12:02:41--  https://github.com/rickiepark/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz [following]
--2023-02-02 12:02:42--  https://raw.githubusercontent.com/rickiepark/python-machine-learning-book-3rd-edition/master/ch08/movie_data.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26521894 (25M) [application/octet-stream]
Saving to: ‘../ch08/movie_data.csv.gz’


2023-02-02 12:02:42 (157 MB/s) - ‘../ch08/movie_data.csv.gz’ saved [26521894/26521894]

In [16]:
import os
import gzip
import shutil

# Decompress the downloaded archive into movie_data.csv in the working directory.
with gzip.open('../ch08/movie_data.csv.gz', 'rb') as f_in:
  with open('movie_data.csv', 'wb') as f_out:
    # Stream the bytes across instead of loading the whole file into memory.
    shutil.copyfileobj(f_in, f_out)

In [13]:
# List the current working directory to check that movie_data.csv was written.
os.listdir()

['.config', 'movie_data.csv', 'sample_data']

In [18]:
# Load the decompressed reviews into a DataFrame.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Display the last five rows.
df.tail()
# 0: negative
# 1: positive

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


## 데이터 전처리
1. 텐서플로 데이터셋(`tf.data.Dataset`) 객체를 생성한 후 훈련, 검증, 테스트 세트로 나눔
2. 훈련 데이터셋에 있는 고유 단어(토큰) 찾기
3. 고유 단어를 고유 정수로 매핑한 후, 리뷰 텍스트를 정수 배열로 인코딩
4. 모델에 입력하기 위해 데이터셋을 미니 배치로 나눔

In [20]:
# Step 1: build a tf.data.Dataset from the DataFrame
# pop() removes the 'sentiment' column from df and returns it, leaving only the remaining column(s) in df.
target = df.pop('sentiment')
# Each dataset element is a (row of df values, label) pair.
ds_raw = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [33]:
# Preview the first three examples: the first 50 bytes of the review text and its label.
for features, label in ds_raw.take(3):
  review_bytes = features.numpy()[0]
  tf.print(review_bytes[:50], label)

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [34]:
# Fix the global seed so the shuffle order is reproducible.
tf.random.set_seed(1)
# reshuffle_each_iteration=False keeps the same order on every pass over ds_raw,
# so splits taken from it with take()/skip() do not change between iterations.
# NOTE(review): the shuffle buffer (5000) looks smaller than the dataset, which makes
# this only an approximate shuffle — consider a buffer >= the number of rows; confirm.
ds_raw = ds_raw.shuffle(5000, reshuffle_each_iteration=False)

In [37]:
ds_raw_test = ds_raw.take(25000)  # test set: the first 25,000 examples
ds_raw_train_valid = ds_raw.skip(25000)  # everything after the test split
ds_raw_train = ds_raw_train_valid.take(20000)   # training set: the next 20,000 examples
ds_raw_valid = ds_raw_train_valid.skip(20000)   # validation set: whatever remains

In [38]:
# Step 2: find the unique tokens in the training set
from collections import Counter

token_counts = Counter()
tokenizer = tfds.deprecated.text.Tokenizer()

# Accumulate token frequencies over every training review.
for features, _label in ds_raw_train:
  review_text = features.numpy()[0]
  token_counts.update(tokenizer.tokenize(review_text))

print('어휘 사전 크기:', len(token_counts))

어휘 사전 크기: 87343


In [42]:
# Step 3: map each unique token to an integer
# token_counts is passed as the vocabulary for the encoder.
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
example_str = 'This is an example!'
# Encode a sample sentence to inspect the resulting integer ids.
print(encoder.encode(example_str))

[249, 44, 172, 782]


In [43]:
# Step 3-1: define the function that performs the conversion
def encode(text_tensor, label):
  """Encode one review into integer token ids.

  Reads the first element of `text_tensor` (via `.numpy()`, so it must be
  called eagerly), encodes it with the module-level `encoder`, and returns
  the encoded ids together with the unchanged `label`.
  """
  raw_text = text_tensor.numpy()[0]
  return encoder.encode(raw_text), label

In [44]:
# Step 3-2: wrap the function as a TF operation
def encode_map_fn(text, label):
  """Run `encode` through tf.py_function, returning (int64 ids, int64 label)."""
  output_types = (tf.int64, tf.int64)
  return tf.py_function(encode, inp=[text, label], Tout=output_types)

In [45]:
# Apply the integer encoding to each of the three splits.
ds_train = ds_raw_train.map(encode_map_fn)
ds_valid = ds_raw_valid.map(encode_map_fn)
ds_test = ds_raw_test.map(encode_map_fn)

In [47]:
# Print the shape of five encoded training reviews to show that sequence lengths vary.
tf.random.set_seed(1)
for encoded_text, _label in ds_train.shuffle(1000).take(5):
  print('시퀀스 길이:', encoded_text.shape)

시퀀스 길이: (159,)
시퀀스 길이: (167,)
시퀀스 길이: (496,)
시퀀스 길이: (118,)
시퀀스 길이: (606,)


In [50]:
# Step 4: split into mini-batches
# Every sequence in a batch is zero-padded so that all sequences in that batch have the same length.
# padded_shapes=([-1], []): pad the token-id vector along its only axis; the label is a scalar.
train_data = ds_train.padded_batch(32, padded_shapes=([-1], []))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1], []))
test_data = ds_test.padded_batch(32, padded_shapes=([-1], []))

## 문장 인코딩을 위한 임베딩 층
임베딩은 정수 인덱스를 실수 벡터(입력 특성)로 변환하는 방법으로,   
훈련 과정에서 중요한 특성을 자동으로 학습한다.

In [51]:
from tensorflow.keras.layers import Embedding
# Small standalone model containing only an Embedding layer, to inspect its output shape and parameter count.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=100, # number of tokens + 2 (padding index, plus one index for words not in the token set)
                    output_dim=6, # embedding size (number of features: the vector size of one word)
                    input_length=20, 
                    name='embed-layer'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, 20, 6)             600       
                                                                 
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


## RNN 모델
* SimpleRNN : 완전 연결 순환 층인 기본 RNN
* LSTM : 긴 의존성을 감지할 수 있는 RNN
* GRU : LSTM 대안인 GRU 유닛을 사용한 순환층 

In [53]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dense

In [54]:
# Start from an empty Sequential model; layers are appended with model.add().
model = Sequential()

In [56]:
# Embedding over 1000 indices, producing 32-dimensional vectors.
model.add(Embedding(input_dim=1000, output_dim=32))
# return_sequences=True: output every time step so the next recurrent layer receives a sequence.
model.add(SimpleRNN(32, return_sequences=True))
# The second recurrent layer returns only its final output (many-to-one).
model.add(SimpleRNN(32))
# Single output unit with no activation specified.
model.add(Dense(1))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          32000     
                                                                 
 simple_rnn (SimpleRNN)      (None, None, 32)          2080      
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


## 감성 분석을 위한 RNN 모델

In [60]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [57]:
# Size of the embedding vector learned for each token.
embedding_dim = 20
# NOTE(review): the +2 presumably reserves the padding index and an out-of-vocabulary index — confirm against the encoder.
vocab_size = len(token_counts) + 2
# Fixed seed for reproducibility.
tf.random.set_seed(1)

In [61]:
# Sentiment classifier: embedding -> bidirectional LSTM -> dense hidden layer -> sigmoid output.
bi_lstm_model = Sequential([Embedding(input_dim=vocab_size, 
                                      output_dim=embedding_dim, 
                                      name='embed-layer'), 
                            
                            Bidirectional(LSTM(64, name='lstm-layer'),   # processes the input sequence in both directions: start-to-end and end-to-start
                                          name='bidir-lstm'),
                            
                            Dense(64, activation='relu'), 

                            Dense(1, activation='sigmoid')])

bi_lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed-layer (Embedding)     (None, None, 20)          1746900   
                                                                 
 bidir-lstm (Bidirectional)  (None, 128)               43520     
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,798,741
Trainable params: 1,798,741
Non-trainable params: 0
_________________________________________________________________


In [64]:
# Adam optimizer with learning rate 1e-3.
# from_logits=False: the model's final Dense layer already applies a sigmoid, so the loss receives probabilities.
bi_lstm_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                      loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                      metrics=['accuracy'])

In [None]:
# Train for 10 epochs on the padded training batches, using valid_data as the validation set.
history = bi_lstm_model.fit(train_data, validation_data=valid_data, epochs=10)

In [67]:
# Evaluate on the test batches; the result is indexed below, with element 1 used as the accuracy.
test_results = bi_lstm_model.evaluate(test_data)



In [68]:
# Report the accuracy entry of the evaluation results as a percentage.
test_accuracy_pct = test_results[1] * 100
print('테스트 정확도 {:.2f}%'.format(test_accuracy_pct))

테스트 정확도 83.90%
