In [6]:
from recurrent_neural_net import RecurrentNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, RNNLayer, LSTMLayer
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np
import random

# Lift pandas' truncation limits so printed DataFrames show every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [7]:
# Relative path of the dataset CSV that the training cell hands to read_csv.
# NOTE(review): the name `csv` would shadow the stdlib `csv` module if that were
# ever imported in this notebook.
csv = '../../../datasets/final_dataset.csv'

In [8]:
# Single seed shared by every stochastic component in this notebook.
GLOBAL_SEED = 42

# Seed both global generators (NumPy's legacy global RNG and the stdlib RNG)
# so that a fresh-kernel run is repeatable.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

In [9]:
# --- Hyperparameters -------------------------------------------------------
VOCAB_SIZE = 10000     # vocabulary size shared by the tokenizer and the embedding layer
EMBEDDING_DIM = 100    # embedding_dim passed to EmbeddingLayer
LEARNING_RATE = 0.01
EPOCHS = 10
BATCH_SIZE = 64

# --- Data ------------------------------------------------------------------
# The tokenizer is sized from VOCAB_SIZE (rather than a repeated literal) so it
# cannot drift away from the embedding layer when the constant is tuned.
print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED)
# read_csv receives the tokenizer and returns the train/validation/test splits.
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
# Report completion only after the CSV has actually been processed.
print("CSV tokenized!")

# --- Model -----------------------------------------------------------------
rnn = RecurrentNeuralNetwork(
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    momentum=0.9,
    verbose=True,
    seed=GLOBAL_SEED
)
print('Created model architecture')

# Second dimension of the training matrix; used as the embedding input shape.
n_features = train_data.X.shape[1]

# Embedding -> LSTM (last state only) -> batch norm -> two dense layers.
# NOTE(review): no activation layer is added between or after the dense layers
# even though SigmoidActivation/ReLUActivation are imported — confirm that the
# network applies an output activation internally.
rnn.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, input_shape=(n_features,)))
rnn.add(LSTMLayer(128, return_sequences=False, bptt_trunc=None))
rnn.add(BatchNormalizationLayer())
rnn.add(DenseLayer(16))
rnn.add(DenseLayer(1))
print('Added layers to model')

# --- Training --------------------------------------------------------------
# NOTE(review): patience equals EPOCHS; if patience is an early-stopping window
# it can never trigger within this run — confirm the intended value.
rnn.fit(train_data, validation_data=validation_data, patience=10)
print('Model trained')

# --- Held-out evaluation -----------------------------------------------------
test_predictions = rnn.predict(test_data)
test_score = rnn.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Created model architecture
Added layers to model

Epoch 1/10
Epoch 1/10 - loss: 3.2933 - accuracy: 0.4710 - val_loss: 0.6899 - val_accuracy: 0.5189

Epoch 2/10
Epoch 2/10 - loss: 0.6938 - accuracy: 0.5613 - val_loss: 0.6727 - val_accuracy: 0.5980

Epoch 3/10
Epoch 3/10 - loss: 0.6891 - accuracy: 0.5748 - val_loss: 0.6703 - val_accuracy: 0.6145

Epoch 4/10
Epoch 4/10 - loss: 0.6834 - accuracy: 0.5870 - val_loss: 0.6386 - val_accuracy: 0.6409

Epoch 5/10
Epoch 5/10 - loss: 0.6297 - accuracy: 0.6189 - val_loss: 0.6728 - val_accuracy: 0.6540

Epoch 6/10
Epoch 6/10 - loss: 0.5996 - accuracy: 0.6419 - val_loss: 0.7039 - val_accuracy: 0.6689

Epoch 7/10
Epoch 7/10 - loss: 0.5636 - accuracy: 0.6686 - val_loss: 0.6875 - val_accuracy: 0.6722

Epoch 8/10
Epoch 8/10 - loss: 0.5381 - accuracy: 0.6895 - val_loss: 0.6733 - val_accuracy: 0.6738

Epoch 9/10
Epoch 9/10 - loss: 0.5169 - accuracy: 0.7037 - val_loss: 0.7163 - val_accuracy: 0.6755

Epoch 10/10
Epoch 10/10 - 

In [10]:
# Load the external validation set (semicolon-separated).
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")
new_file = pd.DataFrame()

# Map the textual labels to class ids; "student" is treated as the human class.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Prepare the frame for the network using the tokenizer from the training cell.
new_data = read_csv_once(new_file, tokenizer, seed=GLOBAL_SEED)

# Run the model. atleast_1d keeps a single-row result iterable: squeeze alone
# would collapse it to a 0-d array and break the steps below.
predictions = rnn.predict(new_data)
predictions = np.atleast_1d(np.squeeze(predictions))

# Turn scores into textual labels using a 0.5 decision threshold.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Collect the predictions in a DataFrame.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})


# Raw ground-truth labels exactly as they appear in the file (kept for display).
ground_truth = file["Label"]

# Compute accuracy on class ids rather than raw strings, so that every label
# covered by category_mapping (including "student") is scored against the class
# it belongs to. Labels missing from the mapping become NaN and count as wrong.
# NOTE(review): this assumes rnn.predict returns rows in the same order as
# `file`; confirm read_csv_once does not shuffle or drop rows.
ground_truth_class = ground_truth.map(category_mapping)
predicted_class = (output_df["Prediction"] > 0.5).astype(int)
accuracys = (predicted_class == ground_truth_class).mean()

# Print the accuracy.
print(f"Accuracy: {accuracys:.4f}")

# Put the real labels next to the predictions for comparison.
output_df["Label_actual"] = ground_truth

print("\nPredictions:")
print(output_df)

# Show misclassified samples (currently disabled)
#misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
#print("\nMisclassified Samples:")
#print(misclassified)

Accuracy: 0.5375

Predictions:
    Label  Prediction Label_actual
0   Human    0.457174        Human
1   Human    0.444379        Human
2      AI    0.516675           AI
3   Human    0.442420        Human
4   Human    0.451128        Human
5   Human    0.447276           AI
6   Human    0.464635           AI
7   Human    0.467537        Human
8   Human    0.449512        Human
9   Human    0.448090           AI
10  Human    0.475332           AI
11  Human    0.496871           AI
12  Human    0.452765           AI
13  Human    0.464529           AI
14  Human    0.444053        Human
15     AI    0.516990        Human
16  Human    0.494854           AI
17  Human    0.461721        Human
18     AI    0.513972           AI
19     AI    0.534323        Human
20  Human    0.478530        Human
21  Human    0.443476        Human
22  Human    0.446877           AI
23  Human    0.472626        Human
24  Human    0.475261           AI
25     AI    0.508756        Human
26     AI    0.508470   