## Carrengado os dados

In [1]:
url ='https://github.com/allanspadini/curso-tensorflow-proxima-palavra/raw/main/dados/train.zip'

In [2]:
import pandas as pd

df = pd.read_csv(url, header=None, names=["ClassIndex", "Título", "Descrição"])
df.head()

Unnamed: 0,ClassIndex,Título,Descrição
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## Pré-processamento

In [3]:
df["Texto"] = df["Título"] + " " + df["Descrição"]
df.head()

Unnamed: 0,ClassIndex,Título,Descrição,Texto
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


O campo "ClassIndex" possui valores de 1 até 4. É importante que os dados de índice comecem em zero para que o TensorFlow possa trabalhar com eles de forma eficiente. O TensorFlow espera que os dados de entrada sejam representados como números inteiros, e esses números devem começar do zero.

In [4]:
df["ClassIndex"].unique()

array([3, 4, 2, 1])

In [5]:
df["ClassIndex"] = df["ClassIndex"] - 1
df["ClassIndex"].unique()

array([2, 3, 1, 0])

## Separando os dados em treino e teste

In [6]:
from sklearn.model_selection import train_test_split

X = df["Texto"].values
y = df["ClassIndex"].values

X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.2, random_state=4256)

## Realizando a tokenização

In [7]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow as tf
import keras

tf.config.set_visible_devices([], 'GPU')

VOCAB_SIZE = 1000

encoder = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(X_treino)

## Criando o modelo

In [8]:
modelo = keras.Sequential([
    encoder,
    keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=16,
        mask_zero=True
    ),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(4, activation="softmax")
])

modelo.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

## Treinando o modelo

### Modelo 1

In [9]:
epocas = 10

history = modelo.fit(
    X_treino,
    y_treino,
    epochs=epocas,
    validation_data=(X_teste, y_teste)
)

Epoch 1/10


2024-08-13 09:18:52.506422: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 18ms/step - accuracy: 0.5370 - loss: 1.3189 - val_accuracy: 0.7308 - val_loss: 0.9059
Epoch 2/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.7639 - loss: 0.8010 - val_accuracy: 0.8095 - val_loss: 0.5995
Epoch 3/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.8151 - loss: 0.5714 - val_accuracy: 0.8248 - val_loss: 0.5099
Epoch 4/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 17ms/step - accuracy: 0.8322 - loss: 0.4994 - val_accuracy: 0.8365 - val_loss: 0.4703
Epoch 5/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 16ms/step - accuracy: 0.8415 - loss: 0.4614 - val_accuracy: 0.8439 - val_loss: 0.4477
Epoch 6/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 17ms/step - accuracy: 0.8471 - loss: 0.4413 - val_accuracy: 0.8492 - val_loss: 0.4330
Epoch 7/10
[1m

In [10]:
import pandas as pd

from plotly.subplots import make_subplots
import plotly.graph_objects as go


def plot_results(history):
    df_history = pd.DataFrame(history.history)

    fig = make_subplots(rows=1, cols=2)

    fig.add_trace(
        go.Scatter(
            name="Acurácia", 
            x=df_history.index, 
            y=df_history["accuracy"]
        ),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            name="Acurácia de Validação",
            x=df_history.index,
            y=df_history["val_accuracy"],
        ),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            name="Loss", 
            x=df_history.index, 
            y=df_history["loss"]
        ), 
        row=1, 
        col=2
    )
    fig.add_trace(
        go.Scatter(
            name="Loss de Validação", 
            x=df_history.index, 
            y=df_history["val_loss"]
        ),
        row=1,
        col=2,
    )

    fig.update_layout(title_text="Desempenho do modelo por épocas")
    fig.update_yaxes(title_text="Valor", row=1, col=1)

    fig.show()

In [11]:
plot_results(history)

### Modelo 2

In [12]:
modelo = keras.Sequential([
    encoder,
    keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=16,
        mask_zero=False
    ),
    keras.layers.Conv1D(64, kernel_size=3, activation="relu"),
    keras.layers.MaxPooling1D(),
    keras.layers.Conv1D(128, kernel_size=4, activation="relu"),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(4, activation="softmax")
])

modelo.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [13]:
epocas = 10

history = modelo.fit(
    X_treino,
    y_treino,
    epochs=epocas,
    validation_data=(X_teste, y_teste)
)

Epoch 1/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 34ms/step - accuracy: 0.4496 - loss: 1.1673 - val_accuracy: 0.8470 - val_loss: 0.4689
Epoch 2/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 31ms/step - accuracy: 0.8366 - loss: 0.4895 - val_accuracy: 0.8566 - val_loss: 0.4267
Epoch 3/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m593s[0m 198ms/step - accuracy: 0.8542 - loss: 0.4413 - val_accuracy: 0.8602 - val_loss: 0.4157
Epoch 4/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 33ms/step - accuracy: 0.8546 - loss: 0.4287 - val_accuracy: 0.8612 - val_loss: 0.4082
Epoch 5/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 31ms/step - accuracy: 0.8575 - loss: 0.4172 - val_accuracy: 0.8631 - val_loss: 0.4026
Epoch 6/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 31ms/step - accuracy: 0.8596 - loss: 0.4124 - val_accuracy: 0.8633 - val_loss: 0.3971
E

In [14]:
plot_results(history)

### Modelo com camadas LSTM

In [15]:
modelo = keras.Sequential([
    encoder,
    keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True
    ),
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(4, activation="softmax")
])

modelo.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

In [17]:
import tensorflow as tf

with tf.device('CPU: 0'):
    epocas = 20

    history = modelo.fit(
        X_treino,
        y_treino,
        epochs=epocas,
        validation_data=(X_teste, y_teste)
    )

Epoch 1/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 45ms/step - accuracy: 0.8386 - loss: 0.4700 - val_accuracy: 0.8375 - val_loss: 0.4503
Epoch 2/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 38ms/step - accuracy: 0.8439 - loss: 0.4581 - val_accuracy: 0.8454 - val_loss: 0.4299
Epoch 3/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 39ms/step - accuracy: 0.8505 - loss: 0.4396 - val_accuracy: 0.8517 - val_loss: 0.4165
Epoch 4/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 37ms/step - accuracy: 0.8577 - loss: 0.4185 - val_accuracy: 0.8613 - val_loss: 0.4007
Epoch 5/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 37ms/step - accuracy: 0.8599 - loss: 0.4192 - val_accuracy: 0.8573 - val_loss: 0.4030
Epoch 6/20
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 38ms/step - accuracy: 0.8626 - loss: 0.4065 - val_accuracy: 0.8640 - val_loss: 0.388

In [18]:
plot_results(history)

## Otimização por hiperparâmetros

In [8]:
import keras_tuner as kt

def build_model(hp):
    model = keras.Sequential([
        encoder,
        keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=hp.Int("embedding_dim", min_value=32, max_value=128, step=32),
            mask_zero=True
        ),
        keras.layers.Bidirectional(keras.layers.LSTM(units=hp.Int("lstm_units", min_value=32, max_value=128, step=32), return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(units=hp.Int("lstm_units", min_value=16, max_value=64, step=16))),
        keras.layers.Dense(units=hp.Int("dense_units", min_value=32, max_value=128, step=32), activation="relu"),
        keras.layers.Dropout(rate=hp.Float("dropout", min_value=0.2, max_value=0.5, step=0.1)),
        keras.layers.Dense(4, activation="softmax")
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(1e-4),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

tuner = kt.Hyperband(
    build_model,
    objective="val_accuracy",
    max_epochs=10,
    factor=3,
    directory="hypermodels",
    project_name="classification_optimization"
)

Reloading Tuner from hypermodels/classification_optimization/tuner0.json


In [9]:
from sklearn.model_selection import KFold

# Função para fazer a validação cruzada e otimização de hiperparâmetros
def run_tuner(X, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        tuner.search(
            X_train_fold, 
            y_train_fold, 
            epochs=10, 
            validation_data=(X_val_fold, y_val_fold)
        )

    # Resumo dos melhores hiperparâmetros encontrados
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print(f"""
        A pesquisa de hiperparâmetros foi concluída. O número ideal de dimensões de incorporação é {best_hps.get('embedding_dim')},
        o número ideal de unidades LSTM é {best_hps.get('lstm_units')}, e
        o número ideal de unidades densas é {best_hps.get('dense_units')},
        e a taxa de abandono ideal é {best_hps.get('dropout')}.
    """)

    return best_hps

In [10]:

best_hps = run_tuner(X_treino, y_treino)

Trial 17 Complete [00h 36m 42s]
val_accuracy: 0.8677083253860474

Best val_accuracy So Far: 0.8677083253860474
Total elapsed time: 03h 28m 19s

Search: Running Trial #18

Value             |Best Value So Far |Hyperparameter
128               |96                |embedding_dim
96                |96                |lstm_units
64                |128               |dense_units
0.4               |0.2               |dropout
10                |10                |tuner/epochs
4                 |4                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
2                 |2                 |tuner/round
0013              |0012              |tuner/trial_id

Epoch 5/10
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 144ms/step - accuracy: 0.8587 - loss: 0.4164 - val_accuracy: 0.8614 - val_loss: 0.3917
Epoch 6/10
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m355s[0m 148ms/step - accuracy: 0.8664 - loss: 0.3830 - val_accuracy: 0.86

KeyboardInterrupt: 