Imports

In [1]:
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

Read CSV and Split Data

In [2]:
# Seed the Python, NumPy and TensorFlow random generators with the same value
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Read the full labelled corpus ("Text" and "Label" columns are used below)
dataset = pd.read_csv("../../datasets/final_dataset.csv")

# Both splits share the same proportion and seed
split_kwargs = dict(test_size=0.2, random_state=42)

# Stage 1: hold out 20% of all rows as the test split, stratified on the label
trainval_texts, test_texts, trainval_labels, test_labels = train_test_split(
    dataset["Text"],
    dataset["Label"],
    stratify=dataset["Label"],
    **split_kwargs,
)

# Stage 2: take 20% of the remaining rows as validation (roughly 64/16/20 overall)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    trainval_texts,
    trainval_labels,
    stratify=trainval_labels,
    **split_kwargs,
)

Process Data

In [3]:
# Build the vocabulary from the training texts only, capped at 10000 words
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

# Map every split to integer sequences with the tokenizer fitted above
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Pad/truncate every sequence to a fixed length of 120 tokens
X_train, X_val, X_test = (
    pad_sequences(split_seq, maxlen=120)
    for split_seq in (X_train_seq, X_val_seq, X_test_seq)
)

# Labels as NumPy arrays, in the same order as the feature matrices
y_train, y_val, y_test = (
    np.array(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)


Neural Network

In [4]:
# Simple RNN binary classifier: Embedding -> SimpleRNN -> Dropout -> sigmoid output.
# All names used here (Sequential, Embedding, SimpleRNN, Dense, Dropout, Adam)
# come from the imports cell at the top of the notebook.

# Model hyperparameters. Every value below is actually consumed by the code
# that follows, so changing a constant changes the experiment.
VOCAB_SIZE = 10000  # Vocabulary size; matches Tokenizer(num_words=10000)
SEQUENCE_LENGTH = 120  # Sequence length; matches pad_sequences(maxlen=120)
EMBEDDING_DIM = 100  # Size of each embedding vector
RNN_UNITS = 128  # Number of units in the recurrent layer
DROPOUT_RATE = 0.5  # Dropout rate applied to the RNN output
# The optimizer was previously hard-coded to 0.001 while this constant said 0.01
# and was never read. The constant now holds the value that was really trained with.
LEARNING_RATE = 0.001  # Learning rate passed to Adam
EPOCHS = 20  # Number of passes over the training set
BATCH_SIZE = 32  # Samples per gradient update

# Build the simple RNN model
# NOTE(review): newer Keras releases deprecate `input_length` on Embedding;
# confirm the installed version before removing the argument.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=SEQUENCE_LENGTH),
    SimpleRNN(RNN_UNITS, activation="tanh"),
    Dropout(DROPOUT_RATE),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(learning_rate=LEARNING_RATE),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Train on the training split and report metrics on the validation split each epoch
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE)

Epoch 1/20




[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.5243 - loss: 0.7161 - val_accuracy: 0.6903 - val_loss: 0.6210
Epoch 2/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.7069 - loss: 0.5576 - val_accuracy: 0.8120 - val_loss: 0.4537
Epoch 3/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9634 - loss: 0.1735 - val_accuracy: 0.9445 - val_loss: 0.1862
Epoch 4/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9773 - loss: 0.0715 - val_accuracy: 0.9769 - val_loss: 0.0791
Epoch 5/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.9952 - loss: 0.0271 - val_accuracy: 0.9861 - val_loss: 0.0550
Epoch 6/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.9982 - loss: 0.0090 - val_accuracy: 0.9846 - val_loss: 0.0508
Epoch 7/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━

Results

In [5]:
# Loss and accuracy on the held-out test split
test_loss, test_acc = model.evaluate(x=X_test, y=y_test)
print(f"Test accuracy: {test_acc:.4f}\n")

# Same metrics on the validation split, for comparison with the test figures
val_loss, val_acc = model.evaluate(x=X_val, y=y_val)
print(f"Validation accuracy: {val_acc:.4f}")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9669 - loss: 0.1462
Test accuracy: 0.9630

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9742 - loss: 0.0962
Validation accuracy: 0.9753


Benchmarking

In [6]:
# Benchmark the trained model on a separate, semicolon-delimited CSV
new_data = pd.read_csv("../../datasets/validation_dataset.csv", delimiter=";")

# Encode the texts with the tokenizer fitted on the training split and pad
# them to the same length (120) used for the training data
X_new_seq = tokenizer.texts_to_sequences(new_data["Text"])
X_new = pad_sequences(X_new_seq, maxlen=120)

# One sigmoid score per row, flattened to a 1-D array
predictions = model.predict(X_new).flatten()

# Scores strictly above 0.5 are labelled "AI", everything else "Human"
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

# Predicted label next to the raw score
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# The reference labels live in the "Label" column of the same file
ground_truth = new_data["Label"]

# Fraction of rows where the predicted label equals the reference label
is_correct = output_df["Label"] == ground_truth
accuracy = is_correct.mean()

print(f"Accuracy: {accuracy:.4f}")

# Predictions and reference labels side by side
comparison_df = output_df.assign(Label_actual=ground_truth)

# Keep only the rows where prediction and reference disagree
is_wrong = comparison_df["Label"] != comparison_df["Label_actual"]
misclassified = comparison_df[is_wrong]
print("\nMisclassified Samples:")
print(misclassified)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Accuracy: 0.6250

Misclassified Samples:
    Label  Prediction Label_actual
5   Human    0.000422           AI
6   Human    0.001916           AI
9   Human    0.049526           AI
10  Human    0.013541           AI
11  Human    0.084151           AI
12  Human    0.001797           AI
13  Human    0.001265           AI
18  Human    0.004650           AI
22  Human    0.001986           AI
23     AI    0.549941        Human
24  Human    0.001352           AI
28  Human    0.003649           AI
33  Human    0.015642           AI
34  Human    0.002210           AI
35  Human    0.436676           AI
38  Human    0.003462           AI
39  Human    0.007967           AI
42  Human    0.003840           AI
43  Human    0.414019           AI
48     AI    0.994812        Human
51  Human    0.002854           AI
53  Human    0.080440           AI
57  Human    0.009751           AI
59  Human    0.001142           AI
61  Human   

Test Sentences Manually

In [7]:
# Manual spot check: score a single hand-written passage
hardcoded_sentence = """
The human immune system is the body’s defense mechanism against harmful invaders like bacteria, viruses, fungi, and parasites. It consists of two main parts: the innate immune system and the adaptive immune system. The innate immune system provides a rapid, non-specific response to infections, using barriers like the skin and white blood cells. The adaptive immune system, on the other hand, targets specific pathogens and creates lasting immunity. Key components of the adaptive system include T cells and B cells, which remember previous infections and help the body fight them more effectively in the future. A healthy immune system is essential for preventing diseases, and its dysfunction can lead to conditions like autoimmune disorders or immunodeficiency.
"""

# Encode the passage as a one-element batch with the same tokenizer and padding length
X_hardcoded_seq = tokenizer.texts_to_sequences([hardcoded_sentence])
X_hardcoded = pad_sequences(X_hardcoded_seq, maxlen=120)

# The batch holds one passage, so the first flattened value is its score
prediction = model.predict(X_hardcoded).ravel()[0]

# Scores strictly above 0.5 are reported as "AI", otherwise "Human"
if prediction > 0.5:
    label = "AI"
else:
    label = "Human"

# Echo the input together with the decision and its score
print(f"Sentence: {hardcoded_sentence}")
print(f"Predicted Label: {label}")
print(f"Prediction Score: {prediction:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Sentence: 
The human immune system is the body’s defense mechanism against harmful invaders like bacteria, viruses, fungi, and parasites. It consists of two main parts: the innate immune system and the adaptive immune system. The innate immune system provides a rapid, non-specific response to infections, using barriers like the skin and white blood cells. The adaptive immune system, on the other hand, targets specific pathogens and creates lasting immunity. Key components of the adaptive system include T cells and B cells, which remember previous infections and help the body fight them more effectively in the future. A healthy immune system is essential for preventing diseases, and its dysfunction can lead to conditions like autoimmune disorders or immunodeficiency.

Predicted Label: AI
Prediction Score: 0.9891
