In [2]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [3]:
# Turn off tensorflow_datasets progress bars so cell output stays compact.
tfds.disable_progress_bar()

# 1. 입력 데이터 파이프라인 구축
## 1) tf.data.Dataset 데이터
### Motivation
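기존 방식은 전체 데이터를 `host memory`에 한꺼번에 올려 두지만, `tf.data.Dataset`은 데이터를 필요한 만큼 스트리밍으로 읽어 오는 입력 파이프라인이다. 파이프라인 자체는 CPU(호스트)에서 실행되며, `prefetch` 등을 이용해 데이터 준비와 GPU 학습을 겹쳐서 수행할 수 있다.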
### 데이터 로드
`tensorflow_datasets`을 이용해 데이터 로드

- 사실 디스크에 있는 텍스트 데이터를 Dataset으로 로드하기 위해선 `tf.keras.utils.text_dataset_from_directory`가 필요하지만, 저는 `tensorflow_datasets`을 통해 Dataset 객체를 바로 생성했습니다.

In [4]:
# Load the IMDB reviews dataset together with its metadata.
# as_supervised=True yields (text, label) tuples instead of feature dicts.
dataset, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-08-11 09:02:48.165340: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-11 09:02:48.165675: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Dataset 구조
- `Dataset.element_spec`을 통해 확인가능
- Dataset은 `Dataset.element_spec`이 나타내는 구조를 가진 요소들의 시퀀스로 구성됨
> 예를 들어 `TensorSpec(shape=(10,), dtype=tf.float32, name=None)` 라면 이 데이터는 10개의 차원을 가진 벡터들의 이터러블한 조합이라고 볼 수 있다.

In [5]:
# Pick the labelled splits out of the dict returned by tfds.load.
train_dataset = dataset['train']
test_dataset = dataset['test']

# Structure of a single element: (text, label).
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [6]:
# Show which splits the loaded dataset dict contains.
dataset.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [7]:
# Peek at one (text, label) pair from the un-batched training split.
for sample in train_dataset.take(1):
  example, label = sample
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


2022-08-11 09:02:48.418952: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-11 09:02:48.437313: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## 2) 데이터 셋 요소 배치 처리
- 입력 데이터를(여러 문장을) 배치 단위로 나누어 학습하기 위해
- `BUFFER_SIZE` 안에서 랜덤으로 `BATCH_SIZE` 만큼을 선택
- `.reshape()`과 비슷하다.
- 예를 들어(`BATCH_SIZE` = 2), 1 * 10 -> 5 * 2

In [8]:
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer
BATCH_SIZE = 64      # number of reviews per batch

# Build both pipelines from the raw splits rather than from the previously
# bound `train_dataset` / `test_dataset` names. That keeps this cell
# idempotent: re-running it no longer batches an already-batched dataset.
#
# Only the training split is shuffled: the order of examples influences
# gradient-descent updates, whereas evaluation metrics do not depend on order.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

- `train_dataset`: [([text1, text2, ...], [label1, label2, ...]), ...]

In [9]:
# Peek at one batch. `example` and `label` stay bound after the loop, and
# `example` is reused later when the encoder is demonstrated.
for batch in train_dataset.take(1):
  example, label = batch
  first_texts = example.numpy()[:2]
  first_labels = label.numpy()[:3]
  print('texts: ', first_texts)
  print()
  print('labels: ', first_labels)

texts:  [b'My older sister was born in March of 1985 and has cerebral palsy. in her 22 years of life, she has seen nothing but the walls of our house and her school which is also occupied with other disabled kids. i have been the butt of everyone\'s jokes because my sister is disabled, and i still think to this day that nobody is, or ever will give a damn about her and her condition. Then i saw this film.<br /><br />I knew what Christy\'s family was going through. but they were lucky. Christy could talk, he could communicate, and he had artistic skills. my sister can walk, but she can\'t utter a word, and she can\'t use her hands to do anything but grab onto things. but this film made me realize there were other people in the world like my sister, and the ending (to tell the truth) made me cry. AND I\'VE SEEN SHAWSHANK!!! This film is seriously underrated, and it shouldn\'t. This movie tells people something. that people should be proud of their own lives. thinking you can\'t write wel

2022-08-11 09:02:48.749531: W tensorflow/core/kernels/data/cache_dataset_ops.cc:856] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


## 3) TextVectorization 레이어 
- 입력 데이터 (한 문장) -> `one-hot encoding` 벡터의 집합 -> 토큰 인덱스의 시퀀스 (단어 집합 이용)


In [10]:
# Upper bound on the vocabulary size kept by the text vectorizer.
VOCAB_SIZE = 1000

# Layer that turns raw strings into sequences of integer token indices.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

In [11]:
# adapt() only needs the raw text, so drop the labels before fitting the vocabulary.
text_only_dataset = train_dataset.map(lambda text, label: text)
encoder.adapt(text_only_dataset)

2022-08-11 09:02:48.875576: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### 단어 집합 구하기 (모든 입력 데이터 -> corpus 빈도 수 -> 단어 집합)

In [12]:
# Vocabulary learned by adapt(), as a NumPy array of tokens.
vocabulary_tokens = encoder.get_vocabulary()
vocab = np.array(vocabulary_tokens)
vocab.shape

(1000,)

`64`개의 문장은 각각 배치 안에서 가장 긴 문장의 길이(아래 실행에서는 `1002`)에 맞춰 0으로 패딩된 `Int[]` 로 표현됨. 데이터를 셔플하기 때문에 이 길이는 실행할 때마다 달라질 수 있음

In [13]:
# `example` is the batch of raw review strings left bound by the batch preview loop.
encoded_batch = encoder(example)
encoded_example = encoded_batch.numpy()
encoded_example.shape

(64, 1002)

## 4) Embedding
- 단어 인덱스 시퀀스를 벡터 시퀀스로 변환
- 인풋(원 핫 벡터)의 길이(`input_dim`, 단어 집합의 크기)와 아웃풋(임베딩 벡터)의 길이(`output_dim`)가 필요

In [14]:
# Embedding lookup: maps each token index to a dense 64-dimensional vector.
# The original statement ended with a trailing comma after the closing
# parenthesis, which bound `embedding_layer` to a 1-tuple instead of the layer.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True,
)

# 2. 모델 구성
## 1) sequential
하나의 입력(한 문장)에 대해 하나의 출력(긍정 or 부정)

In [15]:
# Raw text -> token indices (encoder) -> embedding vectors.
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths: the encoder
        # pads shorter reviews with index 0, and without the mask the
        # recurrent layer would keep updating its state on padding steps.
        mask_zero=True),
])

In [16]:
# Print the layer-by-layer summary of the model built so far.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


## 2) 모델 생성 
### RNN
- 입력 단어 벡터: $x_t$
- hidden state vector: $h_t$
- $h_t = \tanh(h_{t-1} W_h + x_t W_x + b)$
- 다대일 구조 (`return_sequences=False`)

In [17]:
# Size of the hidden state vector.
hidden_units = 64

# Many-to-one recurrent layer. All other constructor arguments keep their
# defaults, notably activation='tanh', use_bias=True,
# kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
# bias_initializer='zeros', dropout=0.0, recurrent_dropout=0.0,
# return_sequences=False, return_state=False, go_backwards=False,
# stateful=False and unroll=False.
RNN_layer = tf.keras.layers.SimpleRNN(units=hidden_units)

In [18]:
# Append the recurrent layer to the Sequential model and print the updated summary.
# NOTE: model.add mutates the model in place, so re-running this cell appends the layer again.
model.add(RNN_layer)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
Total params: 72,256
Trainable params: 72,256
Non-trainable params: 0
_________________________________________________________________


### binary classification
- sigmoid 이용

In [19]:
# Binary classification head: a single unit with a sigmoid activation.
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')
model.add(output_layer)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          64000     
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 72,321
Trainable params: 72,321
Non-trainable params: 0
_________________________________________________________________


### 학습


In [20]:
# The final Dense layer already applies a sigmoid, so the model outputs
# probabilities rather than logits. from_logits must therefore be False;
# with True the loss would treat the probabilities as logits.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy'],
)

In [21]:
# Train for 10 epochs; each validation pass is limited to 30 batches of the test split.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=10,
)

Epoch 1/10


  return dispatch_target(*args, **kwargs)
2022-08-11 09:02:51.196675: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
