In [1]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_new_data
from activation import SigmoidActivation, ReLUActivation, LeakyReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePooling1D, GRULayer, lstm_layer, FlattenLayer
from losses import BinaryCrossEntropy
from metrics import accuracy
import pandas as pd
import numpy as np
import random

In [2]:
# Location of the training dataset, relative to this notebook's directory.
# NOTE(review): the name `csv` matches the stdlib module of the same name; it is
# kept because the training cell reads it.
csv = "../../../datasets/final_dataset.csv"

In [3]:
# One seed for the whole notebook: it is applied to both global RNGs here and
# passed explicitly to the tokenizer, data readers and network in later cells.
GLOBAL_SEED = 42

np.random.seed(GLOBAL_SEED)  # NumPy's legacy global generator
random.seed(GLOBAL_SEED)     # Python's built-in `random` module

In [None]:
# Train the GRU classifier on the CSV dataset and report accuracy on the test split.

# Single source of truth for the vocabulary size, used by both the tokenizer and
# the embedding layer so the two values cannot drift apart.
# NOTE(review): presumably token ids produced by the tokenizer are bounded by
# `num_words` — confirm in AdvancedTokenizer.
VOCAB_SIZE = 20000

print("Tokenizing csv...")
tokenizer = AdvancedTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED, min_freq=100, remove_stopwords=False)

# read_csv is the first call that receives the CSV path, so the completion
# message is printed only after it returns.
train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")

net = DeepNeuralNetwork(epochs=26, batch_size=32, learning_rate=0.0001, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy, seed=GLOBAL_SEED)

# Width of the feature matrix; passed to the embedding layer as its input shape.
n_features = train_data.X.shape[1]

# Sequence encoder: embedding -> GRU (full sequence returned) -> dropout -> pooling.
net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=100, input_shape=(n_features,)))
net.add(GRULayer(n_units=8, return_sequences=True))
net.add(DropoutLayer(0.4))
net.add(GlobalAveragePooling1D())

# Small dense head with L2 penalty, batch normalization and dropout.
net.add(DenseLayer(4, l2=0.001))
net.add(LeakyReLUActivation())
net.add(BatchNormalizationLayer())
net.add(DropoutLayer(0.6))

# Single sigmoid unit for the binary output.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# NOTE(review): patience=20 with epochs=26 — presumably early stopping can only
# trigger in the last few epochs; confirm this is intended.
net.fit(train_data, validation_data=validation_data, patience=20)

# Held-out evaluation on the test split.
test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Epoch 1/26 - loss: 0.8965 - accuracy: 0.4142 - val_loss: 0.7079 - val_accuracy: 0.3556
Epoch 2/26 - loss: 0.8817 - accuracy: 0.4280 - val_loss: 0.7252 - val_accuracy: 0.3679
Epoch 3/26 - loss: 0.8685 - accuracy: 0.4289 - val_loss: 0.7225 - val_accuracy: 0.3778
Epoch 4/26 - loss: 0.8550 - accuracy: 0.4335 - val_loss: 0.7200 - val_accuracy: 0.3827
Epoch 5/26 - loss: 0.8424 - accuracy: 0.4363 - val_loss: 0.7178 - val_accuracy: 0.3901
Epoch 6/26 - loss: 0.8309 - accuracy: 0.4372 - val_loss: 0.7167 - val_accuracy: 0.4000
Epoch 7/26 - loss: 0.8192 - accuracy: 0.4430 - val_loss: 0.7141 - val_accuracy: 0.4148
Epoch 8/26 - loss: 0.8084 - accuracy: 0.4470 - val_loss: 0.7125 - val_accuracy: 0.4222
Epoch 9/26 - loss: 0.7983 - accuracy: 0.4507 - val_loss: 0.7112 - val_accuracy: 0.4346
Epoch 10/26 - loss: 0.7887 - accuracy: 0.4531 - val_loss: 0.7098 - val_accuracy: 0.4667
Epoch 11/26 - loss: 0.7794 - accuracy: 0.4559 - val_loss: 0.7089 - val_accuracy: 0.4716
Epoch 12

In [6]:
# Evaluate the trained network on the separate validation CSV and list the
# samples it gets wrong.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")
new_file = pd.DataFrame()

# Map the textual labels to the binary targets; "student" is treated as class 0,
# the same as "Human".
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Prepare the data for the network.
# NOTE(review): read_new_data receives a seed; if it shuffles rows, the
# predictions below will not line up with file["Label"] — confirm it keeps order.
new_data = read_new_data(new_file, tokenizer, seed=GLOBAL_SEED)

# Predict. atleast_1d keeps the result iterable and column-shaped even when the
# dataset has a single row (a bare squeeze would yield a 0-d array).
predictions = net.predict(new_data)
predictions = np.atleast_1d(np.squeeze(predictions))

# Convert scores to label names: anything above 0.5 is "AI".
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

# Collect predictions in a DataFrame.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Raw ground-truth labels, kept as-is for display.
ground_truth = file["Label"]

# Ground truth normalized through the same mapping used for the targets, so a
# "student" row counts as "Human" instead of never matching a prediction.
class_names = {0: "Human", 1: "AI"}
ground_truth_normalized = file["Label"].map(category_mapping).map(class_names)

# Compute accuracy.
accuracys = (output_df["Label"] == ground_truth_normalized).mean()

# Print accuracy.
print(f"Accuracy: {accuracys:.4f}")

# Keep the original labels next to the predictions for inspection.
output_df["Label_actual"] = ground_truth

# Rows whose predicted class differs from the normalized ground truth.
misclassified = output_df[output_df["Label"] != ground_truth_normalized]
print("\nMisclassified Samples:")
print(misclassified)


Accuracy: 0.4750

Misclassified Samples:
    Label  Prediction Label_actual
0      AI    0.506122        Human
2   Human    0.260921           AI
5   Human    0.332236           AI
6   Human    0.435405           AI
9   Human    0.474481           AI
10  Human    0.339109           AI
11  Human    0.407024           AI
12  Human    0.241769           AI
16  Human    0.386204           AI
18  Human    0.302188           AI
22  Human    0.474170           AI
24  Human    0.357727           AI
28  Human    0.486840           AI
29     AI    0.527071        Human
30     AI    0.607551        Human
33  Human    0.340189           AI
34  Human    0.346047           AI
37  Human    0.305807           AI
39  Human    0.323415           AI
40  Human    0.471619           AI
42  Human    0.348788           AI
43  Human    0.411375           AI
44  Human    0.444063           AI
45  Human    0.312116           AI
51  Human    0.390923           AI
53  Human    0.451253           AI
55  Human    0