In [1]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePooling1D
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np
import random

In [2]:
# Relative path to the dataset CSV that the training cell hands to read_csv.
csv = "../../../datasets/final_dataset.csv"

In [3]:
# Single seed reused by every stochastic component in this notebook.
GLOBAL_SEED = 42

# Seed Python's stdlib RNG first, then NumPy's legacy global RNG.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(GLOBAL_SEED)

In [4]:
# Build, train and evaluate the embedding-based binary text classifier.

# Vocabulary size shared by the tokenizer and the embedding table; defining it
# once keeps the two from drifting apart.
VOCAB_SIZE = 7500
EMBEDDING_DIM = 100
L2_PENALTY = 0.001

print("Tokenizing csv...")
tokenizer = AdvancedTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED)
# The tokenizer is only handed the CSV inside read_csv, so the completion
# message is printed after that call returns rather than before it.
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")

net = DeepNeuralNetwork(epochs=10, batch_size=16, learning_rate=0.001, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy, seed=GLOBAL_SEED)

# Second axis of the training matrix is used as the embedding input length.
n_features = train_data.X.shape[1]

net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM,
                       input_shape=(n_features,)))
net.add(GlobalAveragePooling1D())

# Hidden stack, one tuple per block:
# (dense units, dropout rate, append batch normalization?).
# Layers are created in the same order as before so that seeded initialisation
# is unchanged.
# NOTE(review): dropout feeds directly into batch normalization here; in common
# frameworks that ordering can destabilise the BN statistics - consider
# swapping the two once the project layers have been checked.
HIDDEN_BLOCKS = [(64, 0.5, True), (32, 0.5, True), (16, 0.3, False)]
for units, dropout_rate, use_batch_norm in HIDDEN_BLOCKS:
    net.add(DenseLayer(units, l2=L2_PENALTY))
    net.add(ReLUActivation())
    net.add(DropoutLayer(dropout_rate))
    if use_batch_norm:
        net.add(BatchNormalizationLayer())

# Single sigmoid unit for the binary output.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# NOTE(review): patience (20) is larger than epochs (10); if patience counts
# epochs without improvement, early stopping can never trigger - confirm
# against DeepNeuralNetwork.fit and lower it if early stopping is wanted.
net.fit(train_data, validation_data=validation_data, patience=20)

test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")




Tokenizing csv...
CSV tokenized!
Epoch 1/10 - loss: 0.8770 - accuracy: 0.5248 - val_loss: 0.7790 - val_accuracy: 0.5189
Epoch 2/10 - loss: 0.8557 - accuracy: 0.5230 - val_loss: 0.6946 - val_accuracy: 0.4514
Epoch 3/10 - loss: 0.8805 - accuracy: 0.5121 - val_loss: 0.8067 - val_accuracy: 0.5189
Epoch 4/10 - loss: 0.8680 - accuracy: 0.5192 - val_loss: 0.7357 - val_accuracy: 0.5189
Epoch 5/10 - loss: 0.8549 - accuracy: 0.5073 - val_loss: 0.7348 - val_accuracy: 0.5189
Epoch 6/10 - loss: 0.8428 - accuracy: 0.5199 - val_loss: 0.7340 - val_accuracy: 0.5189
Epoch 7/10 - loss: 0.8491 - accuracy: 0.5232 - val_loss: 0.7497 - val_accuracy: 0.5189
Epoch 8/10 - loss: 0.7495 - accuracy: 0.5073 - val_loss: 0.6942 - val_accuracy: 0.4811
Epoch 9/10 - loss: 0.7046 - accuracy: 0.5115 - val_loss: 0.6931 - val_accuracy: 0.5206
Epoch 10/10 - loss: 0.7097 - accuracy: 0.5025 - val_loss: 0.6938 - val_accuracy: 0.4811
Accuracy no dataset de teste: 0.4844


In [5]:
# Score the trained network on the separate validation CSV and list its errors.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map raw label spellings to class ids ("student" is treated as class 0).
category_mapping = {"Human": 0, "AI": 1, "student": 0}
# Display name per class id. Predictions and ground truth are both rendered
# through these names so that the two columns are directly comparable.
class_names = {0: "Human", 1: "AI"}

new_file = pd.DataFrame()
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Series.map turns unknown spellings into NaN; fail loudly instead of feeding
# NaN targets to the network and silently counting them as errors.
unknown_labels = file.loc[new_file["Label"].isna(), "Label"].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unmapped labels in validation data: {unknown_labels.tolist()}"
    )

# Prepare the data for the network.
# NOTE(review): read_csv_once receives a seed; if it shuffles rows, predictions
# will no longer line up with the file order used for the ground truth below -
# confirm against its implementation.
new_data = read_csv_once(new_file, tokenizer, seed=GLOBAL_SEED)

# Run inference. ravel (not squeeze) keeps a 1-D array even for a one-row file,
# where squeeze would collapse to a 0-d scalar that cannot be iterated.
predictions = net.predict(new_data)
predictions = np.ravel(predictions)

# Threshold the predictions into label names.
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Ground truth expressed with the same names as the predictions. Comparing
# against the raw column would mark every "student" row as wrong, because the
# model can only ever answer "Human" or "AI".
ground_truth = new_file["Label"].map(class_names)

# Attach positionally so a non-default index on the CSV cannot misalign rows.
output_df["Label_actual"] = ground_truth.to_numpy()

# Accuracy = fraction of rows whose predicted label equals the true label.
accuracys = (output_df["Label"] == output_df["Label_actual"]).mean()
print(f"Accuracy: {accuracys:.4f}")

# Show the misclassified samples.
misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

Accuracy: 0.4875

Misclassified Samples:
   Label  Prediction Label_actual
0     AI    0.507357        Human
1     AI    0.507357        Human
3     AI    0.507357        Human
4     AI    0.507357        Human
7     AI    0.507357        Human
8     AI    0.507357        Human
14    AI    0.507357        Human
15    AI    0.507357        Human
17    AI    0.507357        Human
19    AI    0.507357        Human
20    AI    0.507357        Human
21    AI    0.507357        Human
23    AI    0.507357        Human
25    AI    0.507357        Human
26    AI    0.507357        Human
27    AI    0.507357        Human
29    AI    0.507357        Human
30    AI    0.507357        Human
31    AI    0.507357        Human
32    AI    0.507357        Human
36    AI    0.507357        Human
41    AI    0.507357        Human
46    AI    0.507357        Human
47    AI    0.507357        Human
48    AI    0.507357        Human
49    AI    0.507357        Human
50    AI    0.507357        Human
52    A