In [1]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import  AdvancedTokenizer, RobustTokenizer, SimpleTokenizer
from data import read_csv, read_new_data
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePooling1D, GRULayer, lstm_layer, FlattenLayer
from losses import BinaryCrossEntropy
from metrics import accuracy
import pandas as pd
import numpy as np
import random

In [2]:
# Location of the dataset CSV, relative to the notebook's working directory.
# It is handed to `read_csv` together with the tokenizer in a later cell.
csv = "../../../datasets/final_dataset.csv"

In [3]:
# Single seed reused by every seeded component in this notebook
# (tokenizer, data loading and the network all receive it explicitly).
GLOBAL_SEED = 42

# Seed both global generators: NumPy's legacy global RNG and the
# standard-library `random` module. The two calls are independent.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

In [4]:
# Vocabulary size shared by the tokenizer and the embedding layer. Both must
# agree, so the value is defined once instead of being repeated as a literal.
VOCAB_SIZE = 20000

# The tokenizer constructor receives no text, so nothing from the CSV can be
# processed at this point; the file is only handed over in `read_csv`.
# The completion message is therefore printed after `read_csv` returns
# (presumably that is where the tokenization actually happens -- confirm).
print("Tokenizing csv...")
tokenizer = AdvancedTokenizer(num_words=VOCAB_SIZE, seed=GLOBAL_SEED, min_freq=100)

train_data, validation_data, test_data = read_csv(csv, tokenizer, seed=GLOBAL_SEED)
print("CSV tokenized!")


net = DeepNeuralNetwork(epochs=10, batch_size=64, learning_rate=0.0005, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy, seed=GLOBAL_SEED)

# Second dimension of the training matrix; used as the embedding input shape.
n_features = train_data.X.shape[1]

# --- Recurrent encoder: embedding -> two stacked GRUs -> average pooling ---
# NOTE: layers are constructed and added one at a time, in this exact order,
# so that any random initialisation draws happen in the same sequence.
net.add(EmbeddingLayer(vocab_size=VOCAB_SIZE, embedding_dim=100, input_shape=(n_features,)))
net.add(GRULayer(n_units=64, return_sequences=True))
net.add(GRULayer(n_units=32, return_sequences=True))
net.add(GlobalAveragePooling1D())

# --- Classifier head: three shrinking dense stages with growing L2 ---
# NOTE(review): if the DropoutLayer argument is the *drop* probability,
# 0.6 / 0.7 / 0.8 on layers of 16 / 8 / 4 units is very aggressive -- confirm
# the semantics before tuning.
net.add(DenseLayer(16, l2=0.001))
net.add(BatchNormalizationLayer())
net.add(ReLUActivation())
net.add(DropoutLayer(0.6))

net.add(DenseLayer(8, l2=0.005))
net.add(BatchNormalizationLayer())
net.add(ReLUActivation())
net.add(DropoutLayer(0.7))

net.add(DenseLayer(4, l2=0.01))
net.add(ReLUActivation())
net.add(DropoutLayer(0.8))

# Single sigmoid output unit, paired with the binary cross-entropy loss.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# NOTE(review): patience (15) is larger than epochs (10). If `patience` counts
# epochs without validation improvement, early stopping can never trigger
# with these settings -- confirm against DeepNeuralNetwork.fit.
net.fit(train_data, validation_data=validation_data, patience=15)

# Evaluate on the held-out test split.
test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")

Tokenizing csv...
CSV tokenized!
Epoch 1/10 - loss: 1.1234 - accuracy: 0.5050 - val_loss: 0.6930 - val_accuracy: 0.5189
Epoch 2/10 - loss: 1.0763 - accuracy: 0.5099 - val_loss: 0.6942 - val_accuracy: 0.5222
Epoch 3/10 - loss: 1.0335 - accuracy: 0.5137 - val_loss: 0.7055 - val_accuracy: 0.4349
Epoch 4/10 - loss: 0.9923 - accuracy: 0.5175 - val_loss: 0.7035 - val_accuracy: 0.4646
Epoch 5/10 - loss: 0.9556 - accuracy: 0.5179 - val_loss: 0.7033 - val_accuracy: 0.4860
Epoch 6/10 - loss: 0.9224 - accuracy: 0.5210 - val_loss: 0.7021 - val_accuracy: 0.5255
Epoch 7/10 - loss: 0.8939 - accuracy: 0.5287 - val_loss: 0.7014 - val_accuracy: 0.5519
Epoch 8/10 - loss: 0.8715 - accuracy: 0.5335 - val_loss: 0.7023 - val_accuracy: 0.5815
Epoch 9/10 - loss: 0.8512 - accuracy: 0.5349 - val_loss: 0.7010 - val_accuracy: 0.5980
Epoch 10/10 - loss: 0.8308 - accuracy: 0.5384 - val_loss: 0.7009 - val_accuracy: 0.6293
Accuracy no dataset de teste: 0.6240


In [10]:
# Evaluate the trained network on a separate, semicolon-delimited CSV.
# Only the "Text" and "Label" columns of that file are used.
file = pd.read_csv("../../../datasets/validation_dataset.csv", sep=";")
new_file = pd.DataFrame()

# Map the textual labels onto the binary targets used by the network.
# "student" is folded into the human class (0). Labels not present in this
# mapping become NaN.
category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_file["Text"] = file["Text"]
new_file["Label"] = file["Label"].map(category_mapping)

# Convert the frame into the dataset object the network consumes, reusing the
# tokenizer built for training.
# NOTE(review): a seed is passed here; if `read_new_data` shuffles rows, the
# positional comparison with the ground truth below would be misaligned --
# confirm that row order is preserved.
new_data = read_new_data(new_file, tokenizer, seed=GLOBAL_SEED)

print(new_data.X)

# Predict and flatten to a 1-D vector. `reshape(-1)` is used instead of
# `np.squeeze` because squeeze turns a single-row result into a 0-d array,
# which cannot be iterated or placed in a DataFrame column.
predictions = net.predict(new_data)
predictions = np.asarray(predictions).reshape(-1)

# Threshold the sigmoid outputs at 0.5 to obtain class names.
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

# Collect predicted label and raw score per row.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Build the ground truth from the *mapped* labels, so every source label that
# `category_mapping` assigns to class 0 (including "student") is compared as
# "Human". Comparing against the raw strings would count all such rows as
# misclassified. The index is reset so the comparison is purely positional.
label_names = {0: "Human", 1: "AI"}
ground_truth = new_file["Label"].map(label_names).reset_index(drop=True)

if len(ground_truth) != len(output_df):
    raise ValueError(
        f"Prediction count ({len(output_df)}) does not match "
        f"ground-truth count ({len(ground_truth)})"
    )

# Fraction of rows whose predicted label equals the true label.
accuracys = (output_df["Label"] == ground_truth).mean()

print(f"Accuracy: {accuracys:.4f}")

# Attach the true label next to each prediction for inspection.
output_df["Label_actual"] = ground_truth

# Show the misclassified rows.
misclassified = output_df[output_df["Label"] != output_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

[[ 47   9  27 ...   0   0   0]
 [ 32   0  17 ...   0   0   0]
 [  0   0   9 ...   0   0   0]
 ...
 [135   0 721 ...   0   0   0]
 [  1   0 202 ...   0   0   0]
 [  1 982   2 ...   0   0   0]]
Accuracy: 0.4875

Misclassified Samples:
    Label  Prediction Label_actual
4      AI    0.518538        Human
5   Human    0.483092           AI
6   Human    0.479262           AI
9   Human    0.484483           AI
10  Human    0.479262           AI
11  Human    0.479262           AI
12  Human    0.483443           AI
13  Human    0.479262           AI
16  Human    0.488984           AI
18  Human    0.498644           AI
22  Human    0.479262           AI
23     AI    0.588392        Human
24  Human    0.479262           AI
26     AI    0.567399        Human
28  Human    0.479262           AI
33  Human    0.479262           AI
34  Human    0.479262           AI
35  Human    0.490476           AI
37  Human    0.475124           AI
38  Human    0.497035           AI
39  Human    0.479262           