In [1]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import SimpleTokenizer
from data import read_csv, read_csv_once
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, FlattenLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePoolingLayer, GlobalAveragePooling1D
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np

In [2]:
# Relative path to the labelled dataset; the training cell passes this value
# to read_csv() to build the train/validation/test splits.
file = "../../../datasets/final_dataset.csv"

In [3]:
# Train a binary text classifier (embedding -> average pooling -> dense
# stack -> sigmoid), score it on the test split and optionally persist it.

# Vocabulary size shared by the tokenizer and the embedding table. Keeping a
# single constant prevents the two values from drifting apart.
VOCAB_SIZE = 10000

tokenizer = SimpleTokenizer(num_words=VOCAB_SIZE)

# The progress messages now bracket the call that actually receives the CSV.
# Previously both were printed before any data had been read.
# NOTE(review): presumably read_csv() fits/applies the tokenizer - confirm in data.py.
print("Tokenizing csv...")
train_data, validation_data, test_data = read_csv(file, tokenizer)
print("CSV tokenized!")

net = DeepNeuralNetwork(epochs=5, batch_size=32, learning_rate=0.0005, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy)

# Second axis of the training matrix is used as the network's input width.
n_features = train_data.X.shape[1]

net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=128, input_shape=(n_features,)))
net.add(GlobalAveragePooling1D())

# Hidden block 1: dense + ReLU, then dropout and batch normalisation.
net.add(DenseLayer(64, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.4))
net.add(BatchNormalizationLayer())

# Hidden block 2: same layout, half the width.
net.add(DenseLayer(32, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.4))
net.add(BatchNormalizationLayer())

# Hidden blocks 3 and 4: narrower, heavier dropout, no batch normalisation.
net.add(DenseLayer(16, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))

net.add(DenseLayer(8, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))

# Output head: a single unit followed by a sigmoid.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# NOTE(review): patience (20) exceeds epochs (5); if patience is an
# early-stopping window it can never trigger with these settings - confirm.
net.fit(train_data, validation_data=validation_data, patience=20)

test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

# Ask whether to persist the trained model. The answer is normalised so that
# "Y", " y " and similar are accepted instead of re-prompting.
# NOTE(review): this prompt blocks an unattended "Restart & Run All".
while True:
    opt = input("Queres guardar? [y/n]").strip().lower()
    if opt == "y":
        net.save("../../../models/modelo_dnn.pkl")
        break
    elif opt == "n":
        break

Tokenizing csv...
CSV tokenized!
Epoch 1/5 - loss: 0.9085 - accuracy: 0.4959 - val_loss: 0.6913 - val_accuracy: 0.5404
Epoch 2/5 - loss: 0.9312 - accuracy: 0.4927 - val_loss: 0.6929 - val_accuracy: 0.6293
Epoch 3/5 - loss: 0.9028 - accuracy: 0.4948 - val_loss: 0.6963 - val_accuracy: 0.5914
Epoch 4/5 - loss: 0.8869 - accuracy: 0.5092 - val_loss: 0.6920 - val_accuracy: 0.5338
Epoch 5/5 - loss: 0.9097 - accuracy: 0.5030 - val_loss: 0.6949 - val_accuracy: 0.4646
Accuracy no dataset de teste: 0.4499


In [15]:
# Evaluate the trained network on the separate validation CSV and list the
# samples it gets wrong.

# Use a dedicated name for the frame: rebinding `file` would clobber the
# dataset path defined earlier and break a re-run of the training cell.
validation_df = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")

# Map raw labels to the binary targets; "student" is treated as Human.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
# Inverse mapping used to express both predictions and ground truth as names.
label_names = {0: "Human", 1: "AI"}

new_file = pd.DataFrame({
    "Text": validation_df["Text"],
    "Label": validation_df["Label"].map(category_mapping),
})

# Convert the frame into the dataset object the network consumes.
new_data = read_csv_once(new_file, tokenizer)

# Predict and flatten to 1-D. ravel (unlike squeeze) still yields an iterable
# 1-D array when there is only a single sample.
predictions = np.ravel(net.predict(new_data))

# Threshold the sigmoid outputs at 0.5 to obtain class names.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Normalise the ground truth through the same mapping as the targets, so a
# "student" row counts as Human instead of never matching any prediction.
ground_truth = new_file["Label"].map(label_names)

# Stored under a distinct name: assigning to `accuracy` would shadow the
# metric function imported from `metrics` that the training cell passes to
# DeepNeuralNetwork.
validation_accuracy = (output_df["Label"] == ground_truth).mean()

print(f"Accuracy: {validation_accuracy:.4f}")

# Keep the true label next to each prediction for inspection.
output_df["Label_actual"] = ground_truth

# Show the misclassified samples.
misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

Accuracy: 0.4750

Misclassified Samples:
    Label  Prediction Label_actual
0      AI    0.510056        Human
1      AI    0.513711        Human
2   Human    0.476828           AI
3      AI    0.518418        Human
4      AI    0.527135        Human
7      AI    0.531038        Human
8      AI    0.524576        Human
14     AI    0.527318        Human
15     AI    0.543836        Human
17     AI    0.524992        Human
18  Human    0.494940           AI
19     AI    0.514036        Human
20     AI    0.535467        Human
21     AI    0.534872        Human
23     AI    0.505499        Human
25     AI    0.518747        Human
26     AI    0.543864        Human
27     AI    0.579276        Human
29     AI    0.516758        Human
30     AI    0.521080        Human
31     AI    0.547332        Human
32     AI    0.526132        Human
36     AI    0.523975        Human
41     AI    0.521060        Human
44  Human    0.498297           AI
47     AI    0.511771        Human
48     AI    0