In [None]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePooling1D
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np

In [2]:
# Path of the dataset CSV that is handed to read_csv together with the tokenizer;
# it is relative to the directory the notebook is run from.
csv = "../../../datasets/final_dataset.csv"

In [3]:
# Train a small embedding + dense binary classifier on the dataset CSV and
# report its accuracy on the held-out test split.

# Vocabulary size shared by the tokenizer and the embedding table. Keeping it in
# one place prevents the two values from drifting apart.
VOCAB_SIZE = 7500

# Seed NumPy's global RNG so that repeated runs are comparable.
# NOTE(review): this only has an effect if the project modules draw from
# np.random's global state -- confirm against the layer/data implementations.
np.random.seed(42)

print("Tokenizing csv...")
tokenizer = AdvancedTokenizer(num_words=VOCAB_SIZE)

# read_csv receives the tokenizer, so the completion message is printed only
# after it has returned (previously it was printed before the CSV was read).
train_data, validation_data, test_data = read_csv(csv, tokenizer)
print("CSV tokenized!")

net = DeepNeuralNetwork(epochs=50, batch_size=16, learning_rate=0.001, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy)

# Second axis of the training matrix, passed to the embedding layer as its input shape.
n_features = train_data.X.shape[1]

# Token ids -> 100-dimensional embeddings -> global average pooling.
net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=100, input_shape=(n_features,)))
net.add(GlobalAveragePooling1D())

# Hidden block 1: 64 units, L2-regularised, ReLU, dropout 0.5, batch normalisation.
net.add(DenseLayer(64, l2=0.001))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))
net.add(BatchNormalizationLayer())

# Hidden block 2: 32 units, same structure as block 1.
net.add(DenseLayer(32, l2=0.001))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))
net.add(BatchNormalizationLayer())

# Hidden block 3: 16 units, lighter dropout and no batch normalisation.
net.add(DenseLayer(16, l2=0.001))
net.add(ReLUActivation())
net.add(DropoutLayer(0.3))

# Single sigmoid output unit, paired with the binary cross-entropy loss.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# Fit with the validation split and a patience of 20.
# NOTE(review): patience presumably drives early stopping on the validation
# metric -- confirm in DeepNeuralNetwork.fit.
net.fit(train_data, validation_data=validation_data, patience=20)

# Score the trained network on the test split.
test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")




Tokenizing csv...
CSV tokenized!
Epoch 1/50 - loss: 0.8608 - accuracy: 0.4995 - val_loss: 0.7046 - val_accuracy: 0.5025
Epoch 2/50 - loss: 0.8259 - accuracy: 0.5104 - val_loss: 0.7111 - val_accuracy: 0.7183
Epoch 3/50 - loss: 0.8031 - accuracy: 0.5287 - val_loss: 0.7023 - val_accuracy: 0.6672
Epoch 4/50 - loss: 0.7997 - accuracy: 0.5386 - val_loss: 0.7138 - val_accuracy: 0.5848
Epoch 5/50 - loss: 0.8130 - accuracy: 0.5129 - val_loss: 0.7149 - val_accuracy: 0.5124
Epoch 6/50 - loss: 0.7890 - accuracy: 0.5198 - val_loss: 0.7203 - val_accuracy: 0.7249
Epoch 7/50 - loss: 0.7875 - accuracy: 0.5198 - val_loss: 0.7333 - val_accuracy: 0.6903
Epoch 8/50 - loss: 0.7888 - accuracy: 0.5355 - val_loss: 0.7353 - val_accuracy: 0.7512
Epoch 9/50 - loss: 0.7716 - accuracy: 0.5381 - val_loss: 0.7206 - val_accuracy: 0.6672
Epoch 10/50 - loss: 0.7517 - accuracy: 0.5603 - val_loss: 0.7084 - val_accuracy: 0.7479
Epoch 11/50 - loss: 0.7895 - accuracy: 0.5202 - val_loss: 0.7183 - val_accuracy: 0.6936
Epoch 12

In [4]:
# Evaluate the trained network on a separate, semicolon-separated validation CSV
# and list the samples it gets wrong.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")
new_file = pd.DataFrame()

# Map the textual labels to binary targets; "student" is treated as human-written.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
# Inverse view used to bring the ground truth back to the two predicted class names.
class_names = {0: "Human", 1: "AI"}
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Prepare the data for the network using the tokenizer from the training cell.
new_data = read_csv_once(new_file, tokenizer)

# Run the predictions. np.ravel (instead of np.squeeze) keeps a 1-D array even
# when there is a single sample, so the iteration below never sees a 0-d array.
predictions = net.predict(new_data)
predictions = np.ravel(predictions)

# Turn the scores into class names using a 0.5 decision threshold.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Collect predicted label and raw score side by side.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})


# Raw ground-truth labels exactly as stored in the file.
ground_truth = file["Label"]

# Ground truth collapsed to the same two classes the model predicts. Without
# this, a "student" row could never equal the predicted "Human" and would always
# be counted as an error even though category_mapping treats it as human.
ground_truth_binary = ground_truth.map(category_mapping).map(class_names)

# Compute the accuracy against the normalised ground truth.
accuracys = (output_df["Label"] == ground_truth_binary).mean()

# Print the accuracy.
print(f"Accuracy: {accuracys:.4f}")

# Keep the raw label next to the prediction for inspection.
output_df["Label_actual"] = ground_truth

# Show the misclassified samples (judged on the normalised ground truth).
misclassified = output_df[output_df["Label"] != ground_truth_binary]
print("\nMisclassified Samples:")
print(misclassified)

Accuracy: 0.5250

Misclassified Samples:
    Label  Prediction Label_actual
0      AI    0.501468        Human
1      AI    0.519318        Human
4      AI    0.505369        Human
7      AI    0.521645        Human
8      AI    0.588319        Human
14     AI    0.511539        Human
15     AI    0.506762        Human
17     AI    0.512766        Human
20     AI    0.518158        Human
21     AI    0.518169        Human
23     AI    0.513017        Human
25     AI    0.503831        Human
26     AI    0.501189        Human
29     AI    0.507897        Human
30     AI    0.509726        Human
32     AI    0.521390        Human
35  Human    0.493783           AI
36     AI    0.518719        Human
38  Human    0.495883           AI
41     AI    0.513945        Human
46     AI    0.508618        Human
49     AI    0.505494        Human
50     AI    0.509480        Human
53  Human    0.491547           AI
54     AI    0.507158        Human
56     AI    0.521856        Human
58     AI    0