# Classificação de Texto com BERT

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization

## Dataset

[Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) - Reviews do IMDb.

In [None]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# Fetch the archive into /tmp/aclImdb and unpack it (untar=True).
dataset = tf.keras.utils.get_file(
        'aclImdb_v1.tar.gz', url,
        untar=True, cache_dir='/tmp/aclImdb',
        cache_subdir='')

# The extracted data lives in an 'aclImdb' folder next to the returned path.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_dir = os.path.join(dataset_dir, 'train')

# Remove the unused 'unsup' folder to make it easier to load the data.
# The existence check keeps this cell re-runnable: on a second run the folder
# is already gone and an unconditional rmtree would raise FileNotFoundError.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [None]:
# Input-pipeline settings shared by the dataset cells.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 12
seed = 42

# Load the 'training' subset of the train folder, holding out 20% of it for
# validation. The seed fixes how the files are split between the two subsets.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset='training',
    seed=seed,
    batch_size=batch_size)

# Keep the class names from the raw dataset object, then cache and prefetch.
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Load the held-out 20% of the train folder ('validation' subset) with the
# same seed as the training subset, then cache and prefetch it.
val_ds = (
    tf.keras.utils.text_dataset_from_directory(
        train_dir,
        validation_split=0.2,
        subset='validation',
        seed=seed,
        batch_size=batch_size)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
# Load the separate 'test' folder in batches, then cache and prefetch it.
test_ds = (
    tf.keras.utils.text_dataset_from_directory(
        os.path.join(dataset_dir, 'test'),
        batch_size=batch_size)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
# Peek at the first training batch and print up to 3 reviews from it.
for text_batch, label_batch in train_ds.take(1):
    # Convert each tensor to NumPy once instead of once per printed review.
    reviews = text_batch.numpy()
    labels = label_batch.numpy()
    # Slicing (rather than indexing 0..2) tolerates a batch with < 3 items.
    for review, label in zip(reviews[:3], labels[:3]):
        print(f'Review: {review}')
        print(f'Label : {label} ({class_names[label]})')
        print()

## Carregando o modelo

O modelo BERT utilizado será baixado do [TensorFlow Hub](https://tfhub.dev/google/collections/bert/1), juntamente com o pipeline de pré-processamento correspondente.

In [None]:
# Candidate encoder configurations as (layers, hidden size, attention heads).
# The index after the list selects the configuration used by the notebook.
model_size = [
    (2, 128, 2),
    (6, 256, 4),
    (10, 256, 4),
    (2, 768, 12),
    (12, 768, 12),
][3]

# L: number of layers (i.e., residual blocks)
# H: size of hidden layers
# A: number of attention heads
L, H, A = model_size

# TF Hub handles: the encoder URL is built from the chosen configuration.
tfhub_handle_encoder = (
    "https://tfhub.dev/tensorflow/small_bert/"
    f"bert_en_uncased_L-{L}_H-{H}_A-{A}/2"
)
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

### Pipeline de pré-processamento

Para garantir que nosso modelo funcione corretamente, devemos utilizar o mesmo pré-processamento de texto empregado no modelo pré-treinado. Para facilitar as coisas, o pipeline de pré-processamento é fornecido em conjunto com o modelo que o utilizou.

In [None]:
# Wrap the TF Hub preprocessing model (handle chosen above) as a Keras layer.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Run one sample sentence through the preprocessing layer and inspect it.
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

word_ids = text_preprocessed["input_word_ids"]
input_mask = text_preprocessed["input_mask"]
type_ids = text_preprocessed["input_type_ids"]

# Only the first 12 positions of sample 0 are printed for each tensor.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {word_ids.shape}')
print(f'Word Ids   : {word_ids[0, :12]}')
print(f'Input Mask : {input_mask[0, :12]}')
print(f'Type Ids   : {type_ids[0, :12]}')

### Modelo BERT

Agora, carregamos o modelo propriamente dito.

In [None]:
# Wrap the TF Hub encoder (handle built from the chosen size) as a Keras layer.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Feed the preprocessed sample through the encoder and inspect both outputs.
bert_results = bert_model(text_preprocessed)

pooled_output = bert_results["pooled_output"]
sequence_output = bert_results["sequence_output"]

# Values are truncated to the first 12 entries of sample 0.
print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{pooled_output.shape}')
print(f'Pooled Outputs Values:{pooled_output[0, :12]}')
print(f'Sequence Outputs Shape:{sequence_output.shape}')
print(f'Sequence Outputs Values:{sequence_output[0, :12]}')

## Modelo de Classificação

Utilizando os modelos carregados, construímos um modelo de classificação, adicionando uma **camada densa** ao final.

In [None]:
def build_classifier_model():
    """Build the end-to-end text classifier on top of the BERT encoder.

    The model takes raw strings, applies the TF Hub preprocessing layer,
    runs the trainable encoder, and maps the pooled output through dropout
    and a single-unit dense layer.

    Returns:
        A ``tf.keras.Model`` mapping a batch of strings to one logit per
        example. No activation is applied to the output; apply a sigmoid
        to obtain a probability.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')

    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True so the encoder weights are fine-tuned, not frozen.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    outputs = encoder(encoder_inputs)

    # Classification head: pooled output -> dropout -> single logit.
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)

    return tf.keras.Model(text_input, net)

In [None]:
# Instantiate the classifier and run it once on a sample sentence.
classifier_model = build_classifier_model()

text_test = ['this is such an amazing movie!']
sample_batch = tf.constant(text_test)
bert_raw_result = classifier_model(sample_batch)

# Pass the raw model output through a sigmoid before printing it.
sample_score = tf.sigmoid(bert_raw_result)
print(sample_score)

In [None]:
# Render a diagram of the classifier model.
tf.keras.utils.plot_model(classifier_model)

### Treinamento

Primeiramente, devemos definir nossa função de custo: a entropia cruzada binária.

In [None]:
# Binary cross-entropy computed on logits (from_logits=True), not probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Metric reported alongside the loss.
metrics = tf.metrics.BinaryAccuracy()

Em seguida, definimos nosso _optimizer_ (um método que irá alterar os pesos do modelo durante o treinamento).

In [None]:
epochs = 15

# Total optimizer steps = batches per epoch * epochs; the first 10% of those
# steps are passed to the optimizer as warmup steps.
steps_per_epoch = train_ds.cardinality().numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw')

In [None]:
# Attach the optimizer, loss and metric defined above to the classifier.
classifier_model.compile(
        optimizer=optimizer, loss=loss,
        metrics=metrics)

# Train on the training set and evaluate on the validation set each epoch.
# fit() returns the training history, so no placeholder is needed beforehand.
history = classifier_model.fit(
        x=train_ds, validation_data=val_ds,
        epochs=epochs)

### Avaliação do Modelo

In [None]:
# Evaluate on the test set. The results use their own names so the `loss`
# object passed to compile() is not overwritten by a float, which would break
# re-running the compile cell afterwards.
test_loss, test_accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {test_loss}')
print(f'Accuracy: {test_accuracy}')

[Colab com o Código](https://colab.research.google.com/drive/1jZQmKk61-LprE5I42TRsbcQebVVbdOpm?usp=sharing)