## Carregando os dados

In [1]:
# Location of the zipped training data (train.zip) in the course's GitHub repository.
url = (
    "https://github.com/allanspadini/curso-tensorflow-proxima-palavra"
    "/raw/main/dados/train.zip"
)

In [2]:
import pandas as pd

# header=None: the first row of the file is read as data, so the column
# labels are supplied explicitly through `names`. No compression argument is
# passed; read_csv is left to work it out from the source.
df = pd.read_csv(
    url,
    names=["ClassIndex", "Título", "Descrição"],
    header=None,
)
df.head()

Unnamed: 0,ClassIndex,Título,Descrição
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## Pré-processamento

In [3]:
# Build a single text field per row: title, one space, then description.
df["Texto"] = df["Título"].add(" ").add(df["Descrição"])
df.head()

Unnamed: 0,ClassIndex,Título,Descrição,Texto
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


O campo "ClassIndex" possui valores de 1 até 4. Porém, a camada de saída do modelo terá 4 unidades e a função de perda `sparse_categorical_crossentropy` espera que cada rótulo seja um número inteiro que indique a posição da unidade de saída correspondente, ou seja, um valor de 0 até 3. Por isso, precisamos subtrair 1 de cada rótulo para que as classes comecem em zero.

In [4]:
# Inspect which distinct label values are present in the ClassIndex column.
pd.unique(df["ClassIndex"])

array([3, 4, 2, 1])

In [5]:
# Shift the labels from 1..4 down to 0..3. The guard makes this cell safe to
# re-run: once the smallest label is already 0 the subtraction is skipped, so
# executing the cell twice no longer produces labels -1..2.
if df["ClassIndex"].min() == 1:
    df["ClassIndex"] = df["ClassIndex"] - 1

# Fail loudly if the labels are not valid indices for the 4 output classes.
assert df["ClassIndex"].between(0, 3).all(), "ClassIndex fora do intervalo 0-3"
df["ClassIndex"].unique()

array([2, 3, 1, 0])

## Separando os dados em treino e teste

In [6]:
from sklearn.model_selection import train_test_split

# Inputs are the combined text column; targets are the class labels.
X, y = df["Texto"].values, df["ClassIndex"].values

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4256,
)

## Realizando a tokenização

In [7]:
import os
# Select the TensorFlow backend for Keras.
# NOTE(review): Keras is documented to read KERAS_BACKEND when it is first
# imported, so this assignment is kept above `import keras` — do not reorder.
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras

# Passed as `max_tokens` to the TextVectorization layer below.
VOCAB_SIZE = 1000

# Text-to-token layer; reused as the first layer of the model.
encoder = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
# adapt() is given the training texts only (X_teste is not used here).
encoder.adapt(X_treino)

2024-08-13 08:15:16.199644: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-08-13 08:15:16.199664: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-08-13 08:15:16.199668: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-08-13 08:15:16.199681: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-08-13 08:15:16.199692: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Criando o modelo

In [8]:
# Seed the random number generators before the layers are created so that
# weight initialisation is repeatable across "Restart & Run All". The
# train/test split was already seeded with the same value; the model was not.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
keras.utils.set_random_seed(4256)

modelo = keras.Sequential([
    # Turns each raw string into a sequence of integer token ids.
    encoder,
    # Learns a 16-dimensional vector per vocabulary entry; mask_zero=True
    # flags token id 0 as padding for the layers that follow.
    keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=16,
        mask_zero=True
    ),
    # Collapses the sequence axis into one averaged vector per text.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    # One output unit per class (labels 0..3).
    keras.layers.Dense(4, activation="softmax")
])

# The sparse loss takes integer class labels directly (no one-hot encoding).
modelo.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

## Treinando o modelo

In [9]:
# Number of epochs passed to fit().
epocas = 10

# The held-out split (X_teste, y_teste) is passed as validation data, so the
# val_* metrics reported per epoch are computed on it.
history = modelo.fit(
    x=X_treino,
    y=y_treino,
    validation_data=(X_teste, y_teste),
    epochs=epocas,
)

Epoch 1/10


2024-08-13 09:18:52.506422: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 18ms/step - accuracy: 0.5370 - loss: 1.3189 - val_accuracy: 0.7308 - val_loss: 0.9059
Epoch 2/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.7639 - loss: 0.8010 - val_accuracy: 0.8095 - val_loss: 0.5995
Epoch 3/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 17ms/step - accuracy: 0.8151 - loss: 0.5714 - val_accuracy: 0.8248 - val_loss: 0.5099
Epoch 4/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 17ms/step - accuracy: 0.8322 - loss: 0.4994 - val_accuracy: 0.8365 - val_loss: 0.4703
Epoch 5/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 16ms/step - accuracy: 0.8415 - loss: 0.4614 - val_accuracy: 0.8439 - val_loss: 0.4477
Epoch 6/10
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 17ms/step - accuracy: 0.8471 - loss: 0.4413 - val_accuracy: 0.8492 - val_loss: 0.4330
Epoch 7/10
[1m

In [10]:
import pandas as pd

from plotly.subplots import make_subplots
import plotly.graph_objects as go


def plot_results(history):
    """Plot training and validation curves from a Keras-style history object.

    Draws a 1x2 Plotly figure: accuracy curves on the left subplot and loss
    curves on the right one. The x axis is the index of the DataFrame built
    from ``history.history`` (one row per epoch). The figure is displayed with
    ``fig.show()``; nothing is returned.

    Metrics that are absent from ``history.history`` (for example the
    ``val_*`` series when training ran without validation data) are skipped
    instead of raising ``KeyError``.

    Args:
        history: Object whose ``history`` attribute maps each metric name to
            its list of per-epoch values, such as the value returned by
            ``modelo.fit``.
    """
    df_history = pd.DataFrame(history.history)

    fig = make_subplots(rows=1, cols=2)

    # (legend name, metric column, subplot column), in drawing order.
    curvas = [
        ("Acurácia", "accuracy", 1),
        ("Acurácia de Validação", "val_accuracy", 1),
        ("Loss", "loss", 2),
        ("Loss de Validação", "val_loss", 2),
    ]
    for nome, coluna, col in curvas:
        # Skip metrics that were not recorded rather than failing.
        if coluna not in df_history.columns:
            continue
        fig.add_trace(
            go.Scatter(name=nome, x=df_history.index, y=df_history[coluna]),
            row=1,
            col=col,
        )

    fig.update_layout(title_text="Desempenho do modelo por épocas")
    # Label every axis so each subplot can be read on its own.
    for col in (1, 2):
        fig.update_xaxes(title_text="Época", row=1, col=col)
        fig.update_yaxes(title_text="Valor", row=1, col=col)

    fig.show()

In [11]:
# Draw the accuracy and loss curves for the run recorded in `history`.
plot_results(history=history)