In [1]:
from recurrent_neural_net import RecurrentNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, RNNLayer
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np
import random

# Lift pandas' display truncation so printed frames show every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Location of the labelled dataset, relative to this notebook's directory.
csv = "../../../datasets/final_dataset.csv"

In [3]:
# One seed drives every source of randomness used by this notebook.
GLOBAL_SEED = 42

# Seed NumPy's legacy global generator and the stdlib generator; the two are
# independent of each other, so the order of these calls does not matter.
np.random.seed(seed=GLOBAL_SEED)
random.seed(a=GLOBAL_SEED)

In [4]:

# --- Data preparation --------------------------------------------------------
# Vocabulary size shared by the tokenizer and the embedding table. Keeping it
# in one place prevents the two from drifting apart when it is tuned.
VOCAB_SIZE = 10000

print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED)
# read_csv is handed the tokenizer, so the "done" message is only printed once
# it has returned (previously it was printed before the CSV was even read).
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")

# --- Model definition --------------------------------------------------------
# NOTE(review): patience (passed to fit below) equals the number of epochs, so
# early stopping presumably can never trigger in this run -- confirm against
# RecurrentNeuralNetwork.fit before relying on it.
rnn = RecurrentNeuralNetwork(
    epochs=5,
    batch_size=32,
    learning_rate=0.01,
    momentum=0.9,
    verbose=True,
    seed=GLOBAL_SEED
)
print('Created model architecture')

# Second dimension of the training matrix; presumably the (padded) sequence
# length produced by the tokenizer -- TODO confirm.
n_features = train_data.X.shape[1]

# Embedding -> two stacked recurrent layers -> normalised/regularised dense
# head ending in a single sigmoid unit for binary classification.
rnn.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=8, input_shape=(n_features,)))
rnn.add(RNNLayer(32, return_sequences=True, bptt_trunc=None))
rnn.add(RNNLayer(16, return_sequences=False, bptt_trunc=None))
rnn.add(BatchNormalizationLayer())
rnn.add(ReLUActivation())
rnn.add(DropoutLayer(dropout_rate=0.3))
rnn.add(DenseLayer(8))
rnn.add(ReLUActivation())
rnn.add(DenseLayer(1))
rnn.add(SigmoidActivation())
print('Added layers to model')

# --- Training ----------------------------------------------------------------
rnn.fit(train_data, validation_data=validation_data, patience=5)
print('Model trained')

# --- Held-out evaluation -----------------------------------------------------
test_predictions = rnn.predict(test_data)
test_score = rnn.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Created model architecture
Added layers to model

Epoch 1/5
Epoch 1/5 - loss: 0.8054 - accuracy: 0.4730 - val_loss: 0.6413 - val_accuracy: 0.7628

Epoch 2/5
Epoch 2/5 - loss: 0.6934 - accuracy: 0.5512 - val_loss: 0.7100 - val_accuracy: 0.4811

Epoch 3/5
Epoch 3/5 - loss: 0.6904 - accuracy: 0.5435 - val_loss: 0.6997 - val_accuracy: 0.4761

Epoch 4/5
Epoch 4/5 - loss: 0.6889 - accuracy: 0.5480 - val_loss: 0.6976 - val_accuracy: 0.4382

Epoch 5/5
Epoch 5/5 - loss: 0.6871 - accuracy: 0.5508 - val_loss: 0.6927 - val_accuracy: 0.5222
Model trained
Accuracy no dataset de teste: 0.5172


In [5]:
# Evaluate the trained model on a separate, semicolon-delimited validation file.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map the textual labels onto the binary targets; "student" is treated as
# human-written text (class 0).
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file = pd.DataFrame({
    "Text": file["Text"],
    "Label": file["Label"].map(category_mapping),
})

# Convert the frame into the dataset object expected by the network.
# NOTE(review): this assumes read_csv_once keeps the rows in file order (it
# receives a seed); if it shuffles, the comparison below is misaligned --
# confirm.
new_data = read_csv_once(new_file, tokenizer, seed=GLOBAL_SEED)

# Predict and flatten to 1-D. reshape(-1) is used instead of squeeze so that a
# single-row input still yields an iterable array rather than a 0-d scalar.
predictions = rnn.predict(new_data)
predictions = np.reshape(predictions, -1)

# Threshold the predicted values into class names.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Collect the predicted class and raw score side by side.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Ground truth expressed in the same two class names the model emits. It is
# derived from the mapped targets so that aliases such as "student" compare
# equal to "Human" instead of always being counted as errors.
class_names = {0: "Human", 1: "AI"}
ground_truth = new_file["Label"].map(class_names)

# Fraction of rows whose predicted class matches the ground truth.
accuracys = (output_df["Label"] == ground_truth).mean()

print(f"Accuracy: {accuracys:.4f}")

# Keep the true class next to the prediction for inspection.
output_df["Label_actual"] = ground_truth

# Show the samples the model got wrong.
misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

Accuracy: 0.5125

Misclassified Samples:
    Label  Prediction Label_actual
2   Human    0.483094           AI
5   Human    0.485387           AI
6   Human    0.486265           AI
9   Human    0.486478           AI
10  Human    0.486469           AI
11  Human    0.486256           AI
12  Human    0.486482           AI
13  Human    0.486482           AI
16  Human    0.486175           AI
18  Human    0.486179           AI
22  Human    0.486481           AI
24  Human    0.486473           AI
28  Human    0.486480           AI
33  Human    0.486509           AI
34  Human    0.486483           AI
35  Human    0.486482           AI
37  Human    0.486480           AI
38  Human    0.486465           AI
39  Human    0.486482           AI
40  Human    0.486468           AI
42  Human    0.486513           AI
43  Human    0.486089           AI
44  Human    0.481505           AI
45  Human    0.486451           AI
51  Human    0.486483           AI
53  Human    0.486466           AI
55  Human    0