In [None]:
from recurrent_neural_net import RecurrentNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, RNNLayer
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np
import random

# Lift pandas' truncation limits so printed frames show every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Relative path of the dataset handed to read_csv in the training cell.
csv = "../../../datasets/final_dataset.csv"

In [None]:
# Single seed reused by every stochastic component of the notebook.
GLOBAL_SEED = 42

# Seed both global generators (NumPy's legacy global RNG and the stdlib RNG).
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

In [None]:
# Hyperparameters for the model and the training run.
VOCAB_SIZE = 10_000      # vocab_size given to the EmbeddingLayer
EMBEDDING_DIM = 100      # embedding_dim given to the EmbeddingLayer
LEARNING_RATE = 1e-4     # learning_rate given to RecurrentNeuralNetwork
EPOCHS = 1               # epochs given to RecurrentNeuralNetwork
BATCH_SIZE = 32          # batch_size given to RecurrentNeuralNetwork


# Build the tokenizer and load the train/validation/test splits.
# The tokenizer's word cap is tied to VOCAB_SIZE (instead of a second literal
# 10000) so it cannot silently drift from the EmbeddingLayer's vocab_size.
print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED)
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
# Report completion only after read_csv has returned: constructing the
# tokenizer alone never sees the CSV, so printing earlier was misleading.
print("CSV tokenized!")
# Create the RNN model container with the training hyperparameters defined
# above; momentum, verbosity and seed are fixed here.
rnn = RecurrentNeuralNetwork(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    momentum=0.9,
    verbose=True,
    seed=GLOBAL_SEED
)

print('Created model architecture')
# Second dimension of the training inputs; used as the embedding layer's
# input shape (presumably the padded sequence length -- confirm in read_csv).
n_features = train_data.X.shape[1]
# Build the RNN architecture, in order: embedding -> recurrent layer (128) ->
# dropout (rate 0.5) -> dense layer (1) -> sigmoid activation.
# NOTE(review): layers are constructed and added one at a time; if construction
# or add() draws from the seeded RNG, this order affects the initial weights.
rnn.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, input_shape=(n_features,)))
rnn.add(RNNLayer(128, return_sequences=False, bptt_trunc=None)) 
rnn.add(DropoutLayer(dropout_rate=0.5))
rnn.add(DenseLayer(1))
rnn.add(SigmoidActivation())
print('Added layers to model')

# Train the model on the training split, passing the validation split and a
# patience of 5.
# NOTE(review): EPOCHS is 1, so a patience of 5 presumably can never trigger
# early stopping -- confirm how fit() uses `patience`.
rnn.fit(train_data, validation_data=validation_data, patience=5)
print('Model trained')

# Predict on the held-out test split and score those predictions with the
# model's own score() method.
test_predictions = rnn.predict(test_data)
test_score = rnn.score(test_data, test_predictions)
# The message is Portuguese for "Accuracy on the test dataset".
print(f"Accuracy no dataset de teste: {test_score:.4f}")

In [None]:
# Load the external validation set (semicolon-separated CSV).
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map the textual labels to binary targets: "Human" and "student" -> 0, "AI" -> 1.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file = pd.DataFrame({
    "Text": file["Text"],
    "Label": file["Label"].map(category_mapping),
})

# Prepare the frame for the network, using the tokenizer built for training.
new_data = read_csv_once(new_file, tokenizer, seed=GLOBAL_SEED)

# Run the trained network on the external validation samples.
predictions = rnn.predict(new_data)
# Drop singleton axes but keep at least one dimension: a bare np.squeeze turns
# a single-sample result into a 0-d array, which cannot be iterated or used as
# a DataFrame column.
predictions = np.atleast_1d(np.squeeze(predictions))

# Convert scores to labels: strictly above 0.5 is "AI", otherwise "Human".
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

# One row per sample: the predicted label and the raw score behind it.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})


# Ground-truth labels exactly as they appear in the file (kept for display).
ground_truth = file["Label"]

# Predicted labels are only ever "AI" or "Human", while the file may also use
# "student" (mapped to class 0 by category_mapping). Compare against the
# normalized class names so those rows are not always counted as errors.
expected_labels = new_file["Label"].map({0: "Human", 1: "AI"})
# NOTE(review): this assumes read_csv_once keeps the rows in file order and
# drops none of them -- confirm, since it receives a seed.
is_correct = output_df["Label"] == expected_labels

# Accuracy = fraction of rows whose predicted label matches the expected one.
accuracys = is_correct.mean()

# Print the accuracy
print(f"Accuracy: {accuracys:.4f}")

# Show the raw ground-truth labels next to the predictions
output_df["Label_actual"] = ground_truth

print("\nPredictions:")
print(output_df)

## Show misclassified samples
#misclassified = output_df[~is_correct]
#print("\nMisclassified Samples:")
#print(misclassified)