In [85]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [86]:
# Turn off tensorflow_datasets' progress bars so later cell outputs stay compact.
tfds.disable_progress_bar()

# 1. 입력 데이터 파이프라인 구축
## 1) tf.data.Dataset 데이터
### Motivation
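기존 방식은 데이터 전체를 `host memory`에 올려 두고 사용하지만, `tf.data.Dataset`은 데이터를 필요한 만큼씩 읽어 오는 입력 파이프라인임. 파이프라인 자체는 기본적으로 host(CPU)에서 실행되며, `prefetch` 등을 이용해 전처리와 GPU 학습을 겹쳐서 수행할 수 있음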
### 데이터 로드
`tensorflow_datasets`을 이용해 데이터 로드

- 사실 디스크에 있는 텍스트 데이터를 Dataset으로 로드하기 위해선 `tf.keras.utils.text_dataset_from_directory`가 필요하지만, 저는 `tensorflow_datasets`을 통해 Dataset 객체를 바로 생성했습니다.

In [87]:
# Load the IMDB reviews dataset together with its metadata object.
# as_supervised=True makes every element a (text, label) pair.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

### Dataset 구조
- `Dataset.element_spec`을 통해 확인가능
- Dataset은 `Dataset.element_spec`이 기술하는 구조를 가진 요소들의 시퀀스로 구성됨
> 예를 들어 `TensorSpec(shape=(10,), dtype=tf.float32, name=None)` 라면 이 데이터는 10개의 차원을 가진 벡터들의 이터러블한 조합이라고 볼 수 있다.

In [88]:
# Pull the two labelled splits out of the dict returned by tfds.load.
train_dataset = dataset['train']
test_dataset = dataset['test']

# Show the structure (shape / dtype) of one element of the training split.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [89]:
# List the split names available in the loaded dataset dict.
dataset.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [19]:
# Peek at one element of the training split: the raw review bytes and its label.
for example, label in train_dataset.take(1):
    print('text: ', example.numpy())
    print('label: ', label.numpy())

<class 'tensorflow.python.framework.ops.EagerTensor'>
text:  tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
label:  0


2022-08-10 15:03:55.532798: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## 2) 데이터 셋 요소 배치 처리
- 입력 데이터를(여러 문장을) 배치 단위로 나누어 학습하기 위해
- `BUFFER_SIZE` 안에서 랜덤으로 `BATCH_SIZE` 만큼을 선택
- `.reshape()`과 비슷하다.
- 예를 들어(`BATCH_SIZE` = 2), 1 * 10 -> 5 * 2

In [92]:
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer
BATCH_SIZE = 64      # reviews per batch

# Both pipelines are derived from the raw splits in `dataset`, not from the
# current values of `train_dataset` / `test_dataset`. This keeps the cell
# idempotent: re-running it rebuilds the same pipelines instead of batching
# datasets that are already batched.
#
# Why shuffle only the training split? Shuffling changes the order in which
# examples are presented during training; evaluation results do not depend on
# example order, so the test split is only batched.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

- `train_dataset`: [([text1, text2, ...], [label1, label2, ...]), ...]

In [93]:
# Inspect one batch: the first two review texts and the first three labels.
# The loop variables `example` and `label` stay in the notebook namespace after
# the loop; a later cell feeds `example` to the encoder.
for example, label in train_dataset.take(1):
    print('texts: ', example.numpy()[:2])
    print()
    print('labels: ', label.numpy()[:3])

texts:  [b'How come I\'ve never seen or even heard about this junk-movie before? It\'s right up my alley with bloody teenkill, laughable plotting and an irresistible 80\'s cheese-atmosphere hanging around it. For some reason nobody is really interested in, the staff and students of an elite Catholic university are butchered by an unknown psychopath. Freshly recruited teacher Julie Parker becomes involved when all the people she has contact with either turn up dead or behave strangely. This movie is hilariously bad! There\'s absolutely no logic or coherence and every character is equally meaningless to the others. For example, there\'s a girl killed and her body dumped in a container. Then, and for no reason, the story suddenly moves forward three weeks yet the murdered girl is never mentioned or even missed. Not even by her boyfriend! The acting is pitiful and there isn\'t even a bit of nudity to enjoy. The revelation of the killer is quite funny because the makers really seemed convin

2022-08-10 21:40:06.591767: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## 3) TextVectorization 레이어 
- 입력 데이터 (한 문장) -> `one-hot encoding` 벡터의 집합 -> 토큰 인덱스의 시퀀스 (단어 집합 이용)


In [94]:
# Upper bound on the vocabulary size kept by the text vectorizer.
VOCAB_SIZE = 1000

# Layer that turns raw strings into sequences of integer token indices.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

In [95]:
# Drop the labels so the vectorizer builds its vocabulary from review text only.
text_only_dataset = train_dataset.map(lambda text, _label: text)
encoder.adapt(text_only_dataset)

2022-08-10 21:40:10.658789: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### 단어 집합 구하기 (모든 입력 데이터 -> corpus 빈도 수 -> 단어 집합)

In [98]:
# Materialize the adapted vocabulary as a NumPy array and display its size.
vocab_tokens = encoder.get_vocabulary()
vocab = np.array(vocab_tokens)
vocab.shape

(1000,)

`64`개의 문장은 `988` 길이의 `Int[]` 로 표현됨

In [97]:
# Vectorize the batch of raw review strings held in `example`, the loop
# variable left over from the batch-preview cell, and convert it to NumPy.
encoded_example = encoder(example).numpy()
# Displayed as (reviews in the batch, token-sequence length).
encoded_example.shape

(64, 988)

## 4) Embedding
- 단어 인덱스 시퀀스를 벡터 시퀀스로 변환
- 입력 차원(`input_dim`, 단어 집합의 크기 = 원-핫 벡터의 길이)과 출력 차원(`output_dim`, 임베딩 벡터의 길이)을 지정해야 함

In [None]:
# Standalone embedding layer: maps each token index to a 64-dimensional vector.
# There must be no comma after the closing parenthesis, otherwise the name is
# bound to a 1-element tuple instead of the layer itself.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),  # one row per vocabulary entry
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True
)

# 2. 모델 구성
## 1) sequential
하나의 입력(한 문장)에 대해 하나의 출력(긍정 or 부정)

In [112]:
# Start of the model: raw strings -> token indices -> embedding vectors.
# Further layers are appended in later cells with model.add(...).
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),  # one row per vocabulary entry
        output_dim=64,
        # Reviews in a batch are zero-padded to a common length. Masking index 0
        # lets the recurrent layer skip the padding instead of treating it as
        # real input.
        mask_zero=True,
    ),
])

In [113]:
# Print the layers added so far with their output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, None, 64)          64000     
                                                                 
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


## 2) 모델 생성 
### RNN
- 입력 단어 벡터: `xt`
- hidden state vector: `ht`
- $h_t = \tanh(h_{t-1} W_h + x_t W_x + b)$
- 다대일 구조 (`return_sequences=False`)

In [118]:
hidden_units = 64  # size of the hidden state vector

# Only `units` is specified; every other SimpleRNN argument keeps its default.
# Defaults that matter here: activation='tanh', use_bias=True,
# return_sequences=False (many-to-one: only the last hidden state is returned)
# and return_state=False.
RNN_layer = tf.keras.layers.SimpleRNN(units=hidden_units)

In [119]:
# Append the recurrent layer to the Sequential model, then show the updated
# architecture. This mutates `model`, so run it once per freshly built model.
model.add(RNN_layer)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, None, 64)          64000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 64)                8256      
                                                                 
Total params: 72,256
Trainable params: 72,256
Non-trainable params: 0
_________________________________________________________________


### binary classification
- sigmoid 이용

In [124]:
# Binary-classification head: a single unit with a sigmoid activation.
# This mutates `model`, so run it once per freshly built model.
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')
model.add(output_layer)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_3 (Embedding)     (None, None, 64)          64000     
                                                                 
 simple_rnn_2 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 72,321
Trainable params: 72,321
Non-trainable params: 0
_________________________________________________________________


### 학습


In [125]:
# The model's last layer is Dense(1, activation='sigmoid'), so its outputs are
# already probabilities rather than raw logits. The loss must therefore be built
# with from_logits=False; with from_logits=True the loss would apply a second
# sigmoid on top of the model's own.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),  # learning rate 1e-4
    metrics=['accuracy'],
)

In [126]:
# Train for 10 epochs. After each epoch, validation runs on 30 batches drawn
# from the test pipeline (validation_steps=30), not on the full test split.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)
2022-08-11 08:52:52.431815: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
