# Introdução
Este notebook contém o treinamento de uma rede neural para a classificação de textos utilizando as bibliotecas Keras e TensorFlow. Ele é baseado no tutorial oficial de classificação de texto do TensorFlow, que aborda os conceitos e as técnicas necessárias para realizar tarefas de processamento de linguagem natural (PLN) e pode ser acessado [nesse link](https://www.tensorflow.org/tutorials/keras/text_classification?hl=pt-br).



In [None]:
# Importações:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pickle

## Parte 1: Criação do modelo

In [None]:
# Fetch the IMDB reviews dataset bundled with Keras (vocabulary capped via num_words)
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000
)

In [None]:
# Build the lookup tables used to turn integer ids back into words
word_index = imdb.get_word_index()
# Shift every id by 3 so that ids 0-3 are free for the special tokens below
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

# Inverse mapping: integer id -> word
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
    """Turn a sequence of integer ids into a space-separated string of words.

    Ids that are missing from ``reverse_word_index`` are rendered as ``'?'``.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

In [None]:
# Print the first training review decoded back into words
first_review = train_data[0]
print(decode_review(first_review))

In [None]:
# Pad both splits with identical settings: post-padding with the <PAD> id, maxlen=256
pad_options = {
    "value": word_index["<PAD>"],
    "padding": 'post',
    "maxlen": 256,
}

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)

test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

In [None]:
# Define the network: embedding -> global average pooling -> dense(relu) -> dense(sigmoid)
vocab_size = 10000
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Hold out the first 10,000 training examples as a validation set
val_size = 10000
x_val, partial_x_train = train_data[:val_size], train_data[val_size:]
y_val, partial_y_train = train_labels[:val_size], train_labels[val_size:]

# Fit on the remaining examples, passing the held-out set as validation data
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=40,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=1,
)


In [None]:
# Evaluate on the test split and print whatever model.evaluate returns
results = model.evaluate(x=test_data, y=test_labels, verbose=2)
print(results)

In [None]:
# Persist the trained model to disk with pickle
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras/TensorFlow version; the native `model.save(...)` format may be a safer
# choice — confirm before relying on this file.
with open('text_classification_model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [None]:
# Score a new, already-encoded review with the saved model
def classify_new_review(new_review, model_file='text_classification_model.pkl'):
    """Return the model's prediction for a single encoded review.

    Args:
        new_review: Sequence of integer word ids for one review.
        model_file: Path of the pickled model to load.

    Returns:
        The first element of the first row of ``model.predict``'s output.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(model_file, 'rb') as handle:
        loaded_model = pickle.load(handle)

    # Wrap the review in a list so it is padded as a batch of one,
    # using the same padding settings as the training data.
    batch = keras.preprocessing.sequence.pad_sequences(
        [new_review],
        value=word_index["<PAD>"],
        padding='post',
        maxlen=256,
    )

    scores = loaded_model.predict(batch)
    return scores[0][0]

In [None]:
# Example usage of classify_new_review: the review is given as a list of
# integer word ids, which is what the function pads and feeds to the model.
new_review = [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]

print(classify_new_review(new_review))


## Parte 2: Classificação binária de "recomendação para um amigo" do dataset da B2W

In [None]:
import pandas as pd
# Load the B2W reviews CSV into a DataFrame
data = pd.read_csv('B2W-Reviews01.csv')

# Preview the first rows
data.head()

In [None]:
# Keep only the columns of interest and drop rows with missing values.
# .copy() makes the result an independent frame, which guards against pandas'
# SettingWithCopyWarning on the column assignment below.
data = data[['review_text', 'recommend_to_a_friend']].dropna().copy()

# Encode 'recommend_to_a_friend' as binary: 1 when the value is "yes"
# (case-insensitive), 0 otherwise. Vectorised string operations replace the
# per-row Python lambda.
data['recommend_to_a_friend'] = (
    data['recommend_to_a_friend'].str.lower().eq('yes').astype('int64')
)

# Split into features (X) and labels (y)
X = data['review_text']
y = data['recommend_to_a_friend']

# The first 80% of the rows form the training set, the remainder the test set.
# The boundary is computed up front so the tokenizer can be fit on training
# rows only.
train_size = int(len(X) * 0.8)

# Fit the tokenizer on the training texts only, so that word statistics from
# the test rows do not leak into preprocessing.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X.iloc[:train_size])

# Encode every review with the fitted tokenizer and pad to maxlen=256
X = tokenizer.texts_to_sequences(X)
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=256)

# Positional split into training and test sets (.iloc makes the positional
# intent explicit for the pandas labels)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Define the network: embedding -> global average pooling -> dense(relu) -> dense(sigmoid)
vocab_size = 10000
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split, passing validation_split=0.2 to Keras
history = model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=512,
    validation_split=0.2,
    verbose=1,
)

# Evaluate on the test split and print whatever model.evaluate returns
results = model.evaluate(x=X_test, y=y_test, verbose=2)
print(results)

# Persist the model first, then the tokenizer, each to its own pickle file
artifacts = (
    ('b2w_text_classification_model.pkl', model),
    ('b2w_tokenizer.pkl', tokenizer),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

# Classify a raw review text with the saved B2W model and tokenizer
def classify_new_review(new_review, model_file='b2w_text_classification_model.pkl', tokenizer_file='b2w_tokenizer.pkl'):
    """Predict whether a review text corresponds to a recommendation.

    Args:
        new_review: Review text to classify.
        model_file: Path of the pickled model to load.
        tokenizer_file: Path of the pickled tokenizer to load.

    Returns:
        ``'Yes'`` when the model's score is above 0.5, otherwise ``'No'``.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(model_file, 'rb') as model_handle:
        loaded_model = pickle.load(model_handle)

    with open(tokenizer_file, 'rb') as tokenizer_handle:
        loaded_tokenizer = pickle.load(tokenizer_handle)

    # Encode the text as a batch of one and pad it to maxlen=256
    sequences = loaded_tokenizer.texts_to_sequences([new_review])
    padded = keras.preprocessing.sequence.pad_sequences(sequences, maxlen=256)

    score = loaded_model.predict(padded)[0][0]
    if score > 0.5:
        return 'Yes'
    return 'No'

In [None]:
# Example: classify a raw review text and print the returned 'Yes'/'No' label
new_review = "Este produto é excelente e eu recomendaria para qualquer pessoa!"
print(classify_new_review(new_review))

In [None]:
# Example: classify a second raw review text and print the returned label
# NOTE(review): "assistit" looks like a typo for "assistir" — confirm whether
# the misspelling is intentional before changing the input text.
new_review = "Eu odiei muito, nunca mais vou assistit!"
print(classify_new_review(new_review))