In [38]:
from recurrent_neural_net import RecurrentNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, RNNLayer
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np

# Lift pandas' display truncation so printed frames show every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [39]:
# Location of the dataset CSV, given relative to the notebook's working directory.
csv = "../../../datasets/final_dataset.csv"

In [40]:

# Train a small stacked-RNN binary classifier on the dataset at `csv` and report
# its accuracy on the held-out test split.

# Single source of truth for the vocabulary size. The same value is passed to the
# tokenizer (num_words) and to the embedding layer (vocab_size); presumably the
# token ids index into the embedding table, so the two must not drift apart.
VOCAB_SIZE = 10000

# Fixed seed so that repeated "Restart & Run All" executions are comparable.
# NOTE(review): this assumes the project layers draw from NumPy's global RNG
# (weight init, dropout, shuffling) — confirm against the layer implementations.
SEED = 42
np.random.seed(SEED)

print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE)
train_data, validation_data, test_data = read_csv(csv, tokenizer)
# Report completion only after read_csv has received the tokenizer and loaded the
# data; constructing the tokenizer alone does not touch the CSV.
print("CSV tokenized!")

# Create the RNN model.
# NOTE(review): with epochs=1 the patience=5 passed to fit() below presumably can
# never trigger — raise epochs if early stopping is actually wanted.
rnn = RecurrentNeuralNetwork(
    epochs=1,
    batch_size=32,
    learning_rate=0.01,
    momentum=0.9,
    verbose=True
)

print('Created model architecture')
# Second dimension of the training matrix is used as the embedding input shape.
n_features = train_data.X.shape[1]

# Build the RNN architecture: embedding -> two recurrent layers -> normalised,
# regularised dense head ending in a sigmoid output.
rnn.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=8, input_shape=(n_features,)))
rnn.add(RNNLayer(32, return_sequences=True, bptt_trunc=None))
rnn.add(RNNLayer(16, return_sequences=False, bptt_trunc=None))
rnn.add(BatchNormalizationLayer())
rnn.add(ReLUActivation())
rnn.add(DropoutLayer(dropout_rate=0.3))
rnn.add(DenseLayer(8))
rnn.add(ReLUActivation())
rnn.add(DenseLayer(1))
rnn.add(SigmoidActivation())
print('Added layers to model')

# Train the model
rnn.fit(train_data, validation_data=validation_data, patience=5)
print('Model trained')

# Evaluate on the held-out test split.
test_predictions = rnn.predict(test_data)
test_score = rnn.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Created model architecture
Added layers to model

Epoch 1/1
Epoch 1/1 - loss: 0.7143 - accuracy: 0.5066 - val_loss: 0.7060 - val_accuracy: 0.4811
Model trained
Accuracy no dataset de teste: 0.4828


In [41]:
# Score the trained model on the separate validation CSV and print a per-row
# comparison of predicted vs. actual labels.

# Load the validation file (semicolon-separated).
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map the textual labels to binary targets; "student" is treated as human-written.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file = pd.DataFrame({
    "Text": file["Text"],
    "Label": file["Label"].map(category_mapping),
})

# Prepare the data for the network.
new_data = read_csv_once(new_file, tokenizer)

# Run the model. Flatten to 1-D explicitly: np.squeeze would collapse a
# single-row result to a 0-d array, which cannot be iterated below.
predictions = np.asarray(rnn.predict(new_data)).reshape(-1)

# Convert the predicted scores into label names using a 0.5 threshold.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Collect the predictions in a DataFrame.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Normalise the true labels to the same two names the model can emit. Comparing
# against the raw column would count every "student" row as an error even when
# the model correctly predicts "Human". Labels missing from category_mapping
# become NaN and are therefore counted as mismatches.
canonical_names = {
    raw_label: ("AI" if target == 1 else "Human")
    for raw_label, target in category_mapping.items()
}
ground_truth = file["Label"].map(canonical_names).reset_index(drop=True)

# Compute the accuracy as the fraction of rows where prediction and truth agree.
accuracys = (output_df["Label"] == ground_truth).mean()

# Print the accuracy.
print(f"Accuracy: {accuracys:.4f}")

# Put the true labels next to the predictions for inspection.
output_df["Label_actual"] = ground_truth

print(output_df)

## Show misclassified samples
#misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
#print("\nMisclassified Samples:")
#print(misclassified)

Accuracy: 0.4875
   Label  Prediction Label_actual
0     AI    0.557933        Human
1     AI    0.557937        Human
2     AI    0.557555           AI
3     AI    0.557938        Human
4     AI    0.557930        Human
5     AI    0.557685           AI
6     AI    0.557729           AI
7     AI    0.557919        Human
8     AI    0.557936        Human
9     AI    0.557932           AI
10    AI    0.557907           AI
11    AI    0.557728           AI
12    AI    0.557937           AI
13    AI    0.557933           AI
14    AI    0.557937        Human
15    AI    0.557847        Human
16    AI    0.557824           AI
17    AI    0.557938        Human
18    AI    0.557653           AI
19    AI    0.557527        Human
20    AI    0.557117        Human
21    AI    0.557868        Human
22    AI    0.557936           AI
23    AI    0.557895        Human
24    AI    0.557849           AI
25    AI    0.557821        Human
26    AI    0.557622        Human
27    AI    0.557554        Hum