In [5]:
from deep_neural_net import DeepNeuralNetwork
from tokenizer import SimpleTokenizer
from data import read_csv
from activation import SigmoidActivation, ReLUActivation
from layers import DenseLayer, EmbeddingLayer, FlattenLayer, DropoutLayer, BatchNormalizationLayer, GlobalAveragePoolingLayer, GlobalAveragePooling1D
from losses import BinaryCrossEntropy   
from metrics import accuracy
import pandas as pd
import numpy as np

In [2]:
file = '../../../datasets/final_dataset.csv'

In [3]:
print("Tokenizing csv...")
tokenizer = SimpleTokenizer(num_words=10000)
print("CSV tokenized!")

train_data, validation_data, test_data = read_csv(file, tokenizer)

net = DeepNeuralNetwork(epochs=25, batch_size=32, learning_rate=0.0005, verbose=True,
                        loss=BinaryCrossEntropy, metric=accuracy)

n_features = train_data.X.shape[1]

net.add(EmbeddingLayer(vocab_size=10000, embedding_dim=128, input_shape=(n_features,)))
net.add(GlobalAveragePooling1D())


net.add(DenseLayer(64, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.4))
net.add(BatchNormalizationLayer())

net.add(DenseLayer(32, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.4))
net.add(BatchNormalizationLayer())

net.add(DenseLayer(16, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))

net.add(DenseLayer(8, l2=0.003))
net.add(ReLUActivation())
net.add(DropoutLayer(0.5))

net.add(DenseLayer(1))  
net.add(SigmoidActivation())

net.fit(train_data,validation_data=validation_data, patience=20)

test_predictions = net.predict(test_data)
test_score = net.score(test_data, test_predictions)
print(f"Accuracy no dataset de teste: {test_score:.4f}")


while True:
    opt = input("Queres guardar? [y/n]")
    if opt == "y":
        net.save("../../../models/modelo_dnn.pkl")
        break
    elif opt == "n":
        break

Tokenizing csv...
CSV tokenized!
Epoch 1/25 - loss: 0.9840 - accuracy: 0.5071 - val_loss: 0.7117 - val_accuracy: 0.5288
Epoch 2/25 - loss: 0.9914 - accuracy: 0.4968 - val_loss: 0.7028 - val_accuracy: 0.5585
Epoch 3/25 - loss: 0.9804 - accuracy: 0.5059 - val_loss: 0.6989 - val_accuracy: 0.5914
Epoch 4/25 - loss: 0.9536 - accuracy: 0.5105 - val_loss: 0.7002 - val_accuracy: 0.5371
Epoch 5/25 - loss: 1.0449 - accuracy: 0.4971 - val_loss: 0.6939 - val_accuracy: 0.5997
Epoch 6/25 - loss: 0.9285 - accuracy: 0.5182 - val_loss: 0.6998 - val_accuracy: 0.6112
Epoch 7/25 - loss: 0.9879 - accuracy: 0.5024 - val_loss: 0.6988 - val_accuracy: 0.5750
Epoch 8/25 - loss: 0.9878 - accuracy: 0.5057 - val_loss: 0.7041 - val_accuracy: 0.5881
Epoch 9/25 - loss: 0.9537 - accuracy: 0.5097 - val_loss: 0.7022 - val_accuracy: 0.5997
Epoch 10/25 - loss: 0.9423 - accuracy: 0.5099 - val_loss: 0.6971 - val_accuracy: 0.6079
Epoch 11/25 - loss: 0.9085 - accuracy: 0.5105 - val_loss: 0.7046 - val_accuracy: 0.5733
Epoch 12

In [12]:
# Load new data
new_data = pd.read_csv("/Users/jvale03/Documents/Universidade/4ºano/2semestre/AP/DeepLearning/datasets/validation_dataset.csv", delimiter=";")

if new_data.shape[1] != 2:
    raise ValueError("O dataset deve ter exatamente duas colunas: uma independente e uma dependente.")

texts = new_data.iloc[:, 0].astype(str).tolist()

category_mapping = {"Human": 0, "AI": 1, "student": 0}
new_data["Label"] = new_data["Label"].map(category_mapping)
new_data = new_data[["Text", "Label"]]

labels = new_data.iloc[:, 1].astype(np.float32).to_numpy()

tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # Ensure maxlen is consistent with your training data

# Make predictions
predictions = net.predict(sequences)

# Convert predictions to labels based on threshold
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Create output DataFrame with predictions
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Load the ground truth labels (from the same dataset)
# Since the labels are in the 'Label' column, we'll compare them with predictions.
ground_truth = new_data["Label"]

# Calculate accuracy
accuracy = (output_df["Label"] == ground_truth).mean()

# Print the accuracy
print(f"Accuracy: {accuracy:.4f}")

# Merge predictions with ground truth for comparison
comparison_df = output_df.copy()
comparison_df["Label_actual"] = ground_truth

# Show misclassified samples
misclassified = comparison_df[comparison_df["Label"] != comparison_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

AttributeError: 'list' object has no attribute 'X'