Imports

In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

Read CSV

In [32]:
# Seed every random number generator in play (NumPy, Python's stdlib and
# TensorFlow) with one shared value so the notebook is repeatable.
SEED = 144
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

# Read the labelled corpus; only the "Text" and "Label" columns are used below.
dataset = pd.read_csv("../../datasets/final_dataset.csv")

# Both splits hold out 20% of their input and reuse the same random state.
split_options = {"test_size": 0.2, "random_state": SEED}

# Split 1: hold out the test set from the full dataset, stratified on the label.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["Text"],
    dataset["Label"],
    stratify=dataset["Label"],
    **split_options,
)

# Split 2: carve the validation set out of the remaining training rows,
# again stratified on the label.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    stratify=train_labels,
    **split_options,
)

Process Data

In [33]:
# Fit the word index on the training texts only, so nothing from the
# validation or test splits influences the vocabulary.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

# Turn each split into lists of integer word ids using that fitted tokenizer.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Bring every sequence to a fixed length of 100 so the splits can be batched.
X_train, X_val, X_test = (
    pad_sequences(split_seq, maxlen=100)
    for split_seq in (X_train_seq, X_val_seq, X_test_seq)
)

# Labels as plain NumPy arrays, one per split.
y_train, y_val, y_test = (
    np.array(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)



Neural Network

In [34]:
# Define the RNN model: embedding -> single SimpleRNN layer -> sigmoid output
# producing one score per input sequence (binary classification).
model = Sequential([
    # input_dim matches the tokenizer's num_words cap (10000). The sequence
    # length is inferred from the input, so the deprecated `input_length`
    # argument is not passed.
    Embedding(input_dim=10000, output_dim=50),
    SimpleRNN(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Compile the model.
# Pass the configured Adam instance itself; previously the string "adam" was
# passed and this object was silently ignored, so changing learning_rate here
# would have had no effect.
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Train the model, reporting loss/accuracy on the validation split after
# every epoch. `history` keeps the per-epoch metrics for later inspection.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)

Epoch 1/10




[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.5697 - loss: 0.7204 - val_accuracy: 0.7381 - val_loss: 0.6405
Epoch 2/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7632 - loss: 0.5712 - val_accuracy: 0.9091 - val_loss: 0.3309
Epoch 3/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9462 - loss: 0.1945 - val_accuracy: 0.9461 - val_loss: 0.1856
Epoch 4/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9488 - loss: 0.3272 - val_accuracy: 0.8552 - val_loss: 0.3150
Epoch 5/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9139 - loss: 0.2146 - val_accuracy: 0.9168 - val_loss: 0.2074
Epoch 6/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9841 - loss: 0.1216 - val_accuracy: 0.9337 - val_loss: 0.1711
Epoch 7/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━

Results

In [35]:
# Score the trained model on the held-out test split. evaluate() returns the
# loss followed by the accuracy metric the model was compiled with.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f}\n")

# Same measurement on the validation split, for side-by-side comparison.
val_metrics = model.evaluate(X_val, y_val)
val_loss, val_acc = val_metrics
print(f"Validation accuracy: {val_acc:.4f}")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8593 - loss: 0.3401
Test accuracy: 0.8705

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8876 - loss: 0.2670
Validation accuracy: 0.8860


Benchmarking

In [36]:
# Read the external benchmark set; this file is semicolon-separated.
new_data = pd.read_csv("../../datasets/validation_dataset.csv", delimiter=";")

# Encode the benchmark texts with the tokenizer fitted on the training data and
# pad to the same length (100) the model was trained with.
X_new_seq = tokenizer.texts_to_sequences(new_data["Text"])
X_new = pad_sequences(X_new_seq, maxlen=100)

# One sigmoid score per row, flattened to a 1-D array.
predictions = model.predict(X_new).flatten()

# Scores strictly above 0.5 are labelled "AI"; everything else is "Human".
labels = np.where(predictions > 0.5, "AI", "Human").tolist()

# Predicted label next to the raw score it was derived from.
output_df = pd.DataFrame({"Label": labels, "Prediction": predictions})

# Ground truth comes from the benchmark file's own "Label" column.
ground_truth = new_data["Label"]

# Fraction of rows whose predicted label equals the true label.
accuracy = output_df["Label"].eq(ground_truth).mean()
print(f"Accuracy: {accuracy:.4f}")

# Attach the true labels to the predictions for row-by-row comparison.
comparison_df = output_df.assign(Label_actual=ground_truth)

# Keep only the rows where prediction and ground truth disagree.
is_wrong = comparison_df["Label"].ne(comparison_df["Label_actual"])
misclassified = comparison_df.loc[is_wrong]
print("\nMisclassified Samples:")
print(misclassified)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Accuracy: 0.6500

Misclassified Samples:
    Label  Prediction Label_actual
5   Human    0.007376           AI
6   Human    0.004963           AI
9   Human    0.063584           AI
10  Human    0.259195           AI
11  Human    0.177374           AI
12  Human    0.029857           AI
13  Human    0.001452           AI
18  Human    0.183009           AI
22  Human    0.021208           AI
24  Human    0.133533           AI
25     AI    0.968817        Human
33  Human    0.231999           AI
34  Human    0.198664           AI
36     AI    0.568051        Human
37  Human    0.385023           AI
38  Human    0.326807           AI
39  Human    0.251432           AI
42  Human    0.042118           AI
43  Human    0.401822           AI
48     AI    0.776427        Human
51  Human    0.013412           AI
57  Human    0.143792           AI
59  Human    0.466122           AI
61  Human    0.399194           AI
69  Human   

Test Sentences Manually

In [None]:
# Sample passage to classify by hand; replace the text between the triple
# quotes to try a different input.
hardcoded_sentence = """
The human immune system is the body’s defense mechanism against harmful invaders like bacteria, viruses, fungi, and parasites. It consists of two main parts: the innate immune system and the adaptive immune system. The innate immune system provides a rapid, non-specific response to infections, using barriers like the skin and white blood cells. The adaptive immune system, on the other hand, targets specific pathogens and creates lasting immunity. Key components of the adaptive system include T cells and B cells, which remember previous infections and help the body fight them more effectively in the future. A healthy immune system is essential for preventing diseases, and its dysfunction can lead to conditions like autoimmune disorders or immunodeficiency.
"""

# Encode the passage with the already-fitted tokenizer and pad it to the
# length of 100 used elsewhere in this notebook. The tokenizer takes a list of
# texts, hence the single-element list.
X_hardcoded_seq = tokenizer.texts_to_sequences([hardcoded_sentence])
X_hardcoded = pad_sequences(X_hardcoded_seq, maxlen=100)

# Only one row was passed in, so take the first (and only) score.
hardcoded_scores = model.predict(X_hardcoded).flatten()
prediction = hardcoded_scores[0]

# Scores strictly above 0.5 are reported as "AI", anything else as "Human".
if prediction > 0.5:
    label = "AI"
else:
    label = "Human"

# Echo the input together with the verdict and the raw score.
print(f"Sentence: {hardcoded_sentence}")
print(f"Predicted Label: {label}")
print(f"Prediction Score: {prediction:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Sentence: 
The human immune system is the body’s defense mechanism against harmful invaders like bacteria, viruses, fungi, and parasites. It consists of two main parts: the innate immune system and the adaptive immune system. The innate immune system provides a rapid, non-specific response to infections, using barriers like the skin and white blood cells. The adaptive immune system, on the other hand, targets specific pathogens and creates lasting immunity. Key components of the adaptive system include T cells and B cells, which remember previous infections and help the body fight them more effectively in the future. A healthy immune system is essential for preventing diseases, and its dysfunction can lead to conditions like autoimmune disorders or immunodeficiency.


Predicted Label: AI
Prediction Score: 0.6756
