In [30]:
from recurrent_neural_net import RecurrentNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, RNNLayer
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np
import random

# Lift pandas' display truncation so printed DataFrames show every column
# and every row (None disables the respective limit).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [31]:
# Relative path of the dataset CSV that is later handed to read_csv.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that
# module were ever imported in this notebook.
csv = "../../../datasets/final_dataset.csv"

In [32]:
# Single seed shared by every stochastic component of the notebook.
GLOBAL_SEED = 42

# Seed NumPy's legacy global generator and Python's built-in generator so
# that anything drawing from either of them is repeatable across runs.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

In [33]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
# VOCAB_SIZE is shared by the tokenizer (num_words) and the embedding layer
# (vocab_size); keeping a single constant prevents the two from drifting apart.
VOCAB_SIZE = 10000
EMBEDDING_DIM = 100
LEARNING_RATE = 0.0001
EPOCHS = 5
BATCH_SIZE = 32

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# The tokenizer is only constructed here; it is handed to read_csv together
# with the CSV path, so the "tokenized" message is printed once read_csv has
# actually returned the three splits.
print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED)
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
rnn = RecurrentNeuralNetwork(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    momentum=0.9,
    verbose=True,
    seed=GLOBAL_SEED
)
print('Created model architecture')

# Second dimension of the training inputs; used as the embedding layer's
# input shape.
n_features = train_data.X.shape[1]

# Architecture: embedding -> recurrent layer (128) -> dropout -> single-unit
# dense layer -> sigmoid.
rnn.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, input_shape=(n_features,)))
rnn.add(RNNLayer(128, return_sequences=False, bptt_trunc=None))
rnn.add(DropoutLayer(dropout_rate=0.5))
rnn.add(DenseLayer(1))
rnn.add(SigmoidActivation())
print('Added layers to model')

# ---------------------------------------------------------------------------
# Training and evaluation
# ---------------------------------------------------------------------------
# NOTE(review): patience equals EPOCHS here; if patience counts epochs without
# improvement, early stopping can never trigger with these settings - confirm.
rnn.fit(train_data, validation_data=validation_data, patience=5)
print('Model trained')

# Score the held-out test split.
test_predictions = rnn.predict(test_data)
test_score = rnn.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Created model architecture
Added layers to model

Epoch 1/5
Epoch 1/5 - loss: 0.7033 - accuracy: 0.5200 - val_loss: 0.6921 - val_accuracy: 0.5206

Epoch 2/5
Epoch 2/5 - loss: 0.7056 - accuracy: 0.5020 - val_loss: 0.6927 - val_accuracy: 0.5058

Epoch 3/5
Epoch 3/5 - loss: 0.6922 - accuracy: 0.5278 - val_loss: 0.6901 - val_accuracy: 0.5437

Epoch 4/5
Epoch 4/5 - loss: 0.6871 - accuracy: 0.5474 - val_loss: 0.6848 - val_accuracy: 0.5931

Epoch 5/5
Epoch 5/5 - loss: 0.6775 - accuracy: 0.5791 - val_loss: 0.6705 - val_accuracy: 0.6705
Model trained
Accuracy no dataset de teste: 0.6831


In [34]:
# Evaluate the trained model on a separate, ';'-separated validation CSV and
# print the accuracy together with the per-sample predictions.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map the raw string labels to binary targets ("student" counts as class 0,
# the same class as "Human").
category_mapping = {"Human": 0, "AI": 1, "student": 0}
# Display name used for each binary class when reporting predictions.
class_names = {0: "Human", 1: "AI"}
# Outputs strictly above this value are reported as class 1 ("AI").
decision_threshold = 0.5

new_file = pd.DataFrame()
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Prepare the data for the network, reusing the tokenizer from training.
new_data = read_csv_once(new_file, tokenizer, seed=GLOBAL_SEED)

# Predict. reshape(-1) always yields a 1-D array, whereas np.squeeze would
# collapse a single-sample prediction to a 0-d array that cannot be iterated.
predictions = rnn.predict(new_data)
predictions = np.asarray(predictions).reshape(-1)

# Convert the raw outputs into class names.
labels = [
    class_names[1] if pred > decision_threshold else class_names[0]
    for pred in predictions
]

# Collect predicted label and raw output side by side.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Raw ground-truth labels exactly as stored in the file (strings).
ground_truth = file["Label"]

# Normalise the ground truth through the same class mapping used for the
# targets, so aliases such as "student" are compared as "Human" instead of
# being counted as misclassified no matter what the model predicted.
# Labels missing from category_mapping become NaN and never match.
canonical_label = {raw: class_names[code] for raw, code in category_mapping.items()}
ground_truth_canonical = ground_truth.map(canonical_label)

# Accuracy: fraction of samples whose predicted class matches the true class.
accuracys = (output_df["Label"] == ground_truth_canonical).mean()

print(f"Accuracy: {accuracys:.4f}")

# Show the original (un-normalised) labels next to the predictions.
output_df["Label_actual"] = ground_truth

print("\nPredictions:")
print(output_df)

## Show misclassified samples
#misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
#print("\nMisclassified Samples:")
#print(misclassified)

Accuracy: 0.5750

Predictions:
    Label  Prediction Label_actual
0   Human    0.456037        Human
1   Human    0.479780        Human
2   Human    0.496886           AI
3   Human    0.462592        Human
4   Human    0.462459        Human
5   Human    0.468886           AI
6   Human    0.463228           AI
7   Human    0.496758        Human
8   Human    0.482349        Human
9   Human    0.457116           AI
10  Human    0.449208           AI
11  Human    0.452166           AI
12     AI    0.542790           AI
13  Human    0.447342           AI
14  Human    0.486260        Human
15  Human    0.447898        Human
16     AI    0.524998           AI
17  Human    0.463383        Human
18  Human    0.448392           AI
19  Human    0.446157        Human
20  Human    0.471652        Human
21  Human    0.451515        Human
22  Human    0.478057           AI
23  Human    0.466906        Human
24  Human    0.463049           AI
25  Human    0.462574        Human
26  Human    0.461016   