In [4]:
import sys
sys.path.append("../../src/CustomModels/model_DNN")
from deep_neural_net import DeepNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_new_data
from activation import SigmoidActivation, ReLUActivation, LeakyReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePooling1D, GRULayer, FlattenLayer
from losses import BinaryCrossEntropy
from metrics import accuracy
import pandas as pd
import numpy as np
import random

In [5]:
# Relative path of the labelled dataset that the training cell passes to read_csv.
csv = "../../datasets/final_dataset.csv"

In [6]:
# Single seed reused by every stochastic component in this notebook
# (tokenizer, data split and network all receive it explicitly).
GLOBAL_SEED = 42

# Seed both global generators: Python's `random` module and NumPy's legacy RNG.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(GLOBAL_SEED)

In [8]:
# Build, train and evaluate the GRU-based classifier.
#
# NOTE(review): the output saved below this cell reports "Epoch x/5", which does
# not match epochs=26 here -- the stored output appears to come from an earlier
# configuration. Re-run the notebook from a fresh kernel to refresh it.

# Vocabulary size shared by the tokenizer and the embedding layer. Keeping it in
# one constant prevents the two values from drifting apart.
VOCAB_SIZE = 20000

# The tokenizer is only configured here; it receives no data until read_csv is
# called, so the progress messages bracket read_csv rather than the constructor.
tokenizer = AdvancedTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED, min_freq=100, remove_stopwords=False)

print("Tokenizing csv...")
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")

net = DeepNeuralNetwork(epochs=26, batch_size=32, learning_rate=0.0001, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy, seed=GLOBAL_SEED)

# Width of the input fed to the embedding layer, taken from the second axis of
# the training matrix.
n_features = train_data.X.shape[1]

# Sequence encoder: embedding -> GRU (full sequence) -> dropout -> average pooling.
net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=100, input_shape=(n_features,)))
net.add(GRULayer(n_units=8, return_sequences=True))
net.add(DropoutLayer(0.4))
net.add(GlobalAveragePooling1D())

# Small L2-regularised hidden block.
net.add(DenseLayer(4, l2=0.001))
net.add(LeakyReLUActivation())
net.add(BatchNormalizationLayer())
net.add(DropoutLayer(0.6))

# Single sigmoid unit for the binary output.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

net.fit(train_data, validation_data=validation_data, patience=20)

# Score the held-out test split with the metric configured on the network.
test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Epoch 1/5 - loss: 0.8965 - accuracy: 0.4142 - val_loss: 0.7079 - val_accuracy: 0.3556
Epoch 2/5 - loss: 0.8817 - accuracy: 0.4280 - val_loss: 0.7252 - val_accuracy: 0.3679
Epoch 3/5 - loss: 0.8685 - accuracy: 0.4289 - val_loss: 0.7225 - val_accuracy: 0.3778
Epoch 4/5 - loss: 0.8550 - accuracy: 0.4335 - val_loss: 0.7200 - val_accuracy: 0.3827
Epoch 5/5 - loss: 0.8424 - accuracy: 0.4363 - val_loss: 0.7178 - val_accuracy: 0.3901
Accuracy no dataset de teste: 0.3719


In [14]:
# Evaluate the trained network on the external validation dataset and list the
# samples it gets wrong.
file = pd.read_csv("../../datasets/validation_dataset.csv", sep=";")

# Map raw labels to the binary target; "student" is treated as human-written.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
class_names = {0: "Human", 1: "AI"}

new_file = pd.DataFrame()
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Turn the frame into the dataset object the network expects.
# NOTE(review): read_new_data receives a seed -- confirm it does not shuffle rows,
# otherwise the row-wise comparison with `file` below is misaligned.
new_data = read_new_data(new_file, tokenizer, seed=GLOBAL_SEED)

# Flatten to 1-D. Unlike np.squeeze, this keeps a single-row result iterable
# instead of collapsing it to a 0-d array.
predictions = np.asarray(net.predict(new_data)).reshape(-1)
if len(predictions) != len(file):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(file)} rows; cannot compare row by row"
    )

# Threshold the scores at 0.5 to obtain class names.
labels = np.where(predictions > 0.5, class_names[1], class_names[0]).tolist()

output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Normalise the true labels through the same mapping used for the network
# targets, so aliases such as "student" are compared as "Human" instead of
# always being counted as errors. Labels absent from the mapping stay unchanged.
canonical_names = {raw: class_names[code] for raw, code in category_mapping.items()}
ground_truth = file["Label"].replace(canonical_names)

# Fraction of rows whose predicted class matches the true class.
accuracys = (output_df["Label"] == ground_truth).mean()
print(f"Accuracy: {accuracys:.4f}")

# Keep the true label next to the prediction for inspection.
output_df["Label_actual"] = ground_truth

misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)


Accuracy: 0.5750

Misclassified Samples:
    Label  Prediction Label_actual
1      AI    0.581674        Human
2   Human    0.437257           AI
3      AI    0.559224        Human
6   Human    0.359242           AI
8      AI    0.533037        Human
10  Human    0.413121           AI
13  Human    0.472326           AI
14     AI    0.595648        Human
15     AI    0.557086        Human
16  Human    0.461640           AI
21     AI    0.683000        Human
22  Human    0.316102           AI
29     AI    0.642195        Human
31     AI    0.556055        Human
33  Human    0.457123           AI
36     AI    0.584414        Human
37  Human    0.441942           AI
41     AI    0.528720        Human
51  Human    0.401298           AI
53  Human    0.229541           AI
55  Human    0.382901           AI
57  Human    0.486281           AI
58     AI    0.645560        Human
60     AI    0.602299        Human
62     AI    0.635695        Human
67  Human    0.436916           AI
68     AI    0