<a href="https://colab.research.google.com/github/leegw1211/begin-ml/blob/main/tf_tutorial3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [125]:
# https://www.tensorflow.org/tutorials/keras/text_classification?hl=ko
# 리뷰 텍스트를 긍정적 또는 부정적으로 분류하는 모델, 이진 분류

In [126]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

In [127]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", url, untar=True, cache_dir='.', cache_subdir='')

In [128]:
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(dataset_dir, 'train')
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [129]:
# 머신러닝을 할 때 보통 데이터세트를 train, validation, test의 세 부분으로 나눈다.

# text_dataset_from_directory는 keras에서 제공하는, 디스크에 있는 텍스트 파일을 dataset으로 바꿔주는 메소드
# tf.data.dataset객체로 바꿔준다. 이는 모델에 집어넣기 가장 최적화된 형태이다.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=32,
    validation_split = 0.2,
    subset='training',
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [130]:
for batch in raw_train_ds.take(1):
  print(batch)

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)',
       b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose persp

In [131]:
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=32,
    validation_split=0.2,
    subset='validation',
    seed=42
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [132]:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=32
)

Found 25000 files belonging to 2 classes.


In [133]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [134]:
# tf.keras.layers.TextVectorization 레이어를 사용하여 데이터를 표준화, 토큰화 및 벡터화합니다.
# 표준화는 일반적으로 구두점이나 HTML 요소를 제거하여 데이터세트를 단순화하기 위해 텍스트를 전처리하는 것을 말합니다.
# 토큰화는 문자열을 여러 토큰으로 분할하는 것을 말합니다(예: 화이트스페이스에서 분할하여 문장을 개별 단어로 분할).
# 벡터화는 토큰을 숫자로 변환하여 신경망에 공급될 수 있도록 하는 것을 말합니다. 이러한 모든 작업을 이 레이어에서 수행할 수 있습니다.
max_features = 10000
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=250
)

In [135]:
# TextVectorization레이어의 adapt메소드는 주어진 데이터를 토큰화하고 정수 인코딩하는 전처리 단계를 수행한다.
train_text = raw_train_ds.map(lambda x,y: x)
vectorize_layer.adapt(train_text)

In [136]:
# 각 토큰이 정수로 대체된 것을 확인할 수 있다.
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("vectorized review", vectorize_layer(tf.expand_dims(first_review, -1))) # 모델에 넣을땐 데이터가 1개여도 배열로 만들어서 넣어야 한다

Review tf.Tensor(b'Great movie - especially the music - Etta James - "At Last". This speaks volumes when you have finally found that special someone.', shape=(), dtype=string)
vectorized review tf.Tensor(
[[  86   17  260    2  222    1  571   31  229   11 2418    1   51   22
    25  404  251   12  306  282    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
   

In [137]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [138]:
# 각 데이터셋은 input과 label로 이루어져 있음
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [139]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [140]:
# 신경망 만들기
embedding_dim = 16
model = tf.keras.Sequential()

In [141]:
# 임베딩 layer가 있는데 일단은 그냥 넘어감. 나중에 공부해보기
# dropout layer는 일부 뉴런을 강제로 제거하여 모델의 과적합을 방지
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))

In [142]:
# globalaveragePooling1D layer는 길이가 다른 입력들에 대해 평균을 계산하여 고정된 길이의 출력 벡터 반환.
# 길이가 다른 입력을 다루는 가장 간단한 방법이라고 함
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1))

In [144]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 16)          160016    
                                                                 
 dropout_13 (Dropout)        (None, None, 16)          0         
                                                                 
 global_average_pooling1d_1  (None, 16)                0         
 0 (GlobalAveragePooling1D)                                      
                                                                 
 dropout_14 (Dropout)        (None, 16)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160033 (625.13 KB)
Trainable params: 160033 (625.13 KB)
Non-trainable params: 0 (0.00 Byte)
_____________

In [145]:
# 텐서플로 공식 문서의 binarycrossentropy 설명 (https://www.tensorflow.org/api_docs/python/tf/keras/losses/BinaryCrossentropy)
# Use this cross-entropy loss for binary (0 or 1) classification applications. The loss function requires the following inputs:
# y_true (true label): This is either 0 or 1.
# y_pred (predicted value): This is the model's prediction, i.e, a single floating-point value which either represents a logit, (i.e, value in [-inf, inf] when from_logits=True) or a probability (i.e, value in [0., 1.] when from_logits=False).
# Recommended Usage: (set from_logits=True)

model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [146]:
model.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d56faf29cf0>

In [147]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(raw_train_ds)
print(accuracy)

0.9237499833106995


In [150]:
examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]
export_model.predict(examples)



array([[0.610402  ],
       [0.43225014],
       [0.35039252]], dtype=float32)