Imports

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split

Read CSV

In [44]:
# Reproducibility: NumPy, Python's `random` and TensorFlow all get the same seed,
# and the same value is reused as `random_state` for both splits below.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)


# Read the labelled corpus; only the "Text" and "Label" columns are used below.
dataset = pd.read_csv("../../datasets/final_dataset.csv")

# Step 1: hold out 20% of all rows as the test split, stratified on the label.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["Text"],
    dataset["Label"],
    test_size=0.2,
    random_state=SEED,
    stratify=dataset["Label"],
)

# Step 2: take 20% of the remaining rows (roughly 16% of the full dataset) as the
# validation split, again stratified. `train_texts` / `train_labels` are rebound
# to the smaller remainder.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=SEED,
    stratify=train_labels,
)

Process Data

In [45]:
# The vocabulary is fitted on the training split only, so validation and test
# text never influence the word index. `num_words=10000` caps the word ids that
# `texts_to_sequences` will emit.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

# Raw text -> lists of integer word ids, in the order train, val, test.
X_train_seq, X_val_seq, X_test_seq = [
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
]

# Pad/truncate every sequence to 100 ids so each split becomes a 2-D array.
X_train, X_val, X_test = [
    pad_sequences(split_seq, maxlen=100)
    for split_seq in (X_train_seq, X_val_seq, X_test_seq)
]

# Labels as NumPy arrays, matching the order of the feature arrays above.
y_train, y_val, y_test = [
    np.array(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
]



Neural Network

In [46]:
# Training hyperparameters for this run.
learning_rate = 0.0005
epoch = 20
batch_size = 32

# Bag-of-embeddings binary classifier: token ids -> embeddings -> mean over the
# sequence axis -> a stack of regularised Dense layers -> one sigmoid unit.
model = Sequential([
    # The embedding table size is read from the fitted tokenizer rather than
    # repeated as a literal, so it cannot drift from the `num_words` cap that
    # bounds the token ids produced by `texts_to_sequences`.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),

    # Average the per-token embeddings into one fixed-size vector per text
    # (no recurrent layer is used in this model).
    tf.keras.layers.GlobalAveragePooling1D(),

    # Dense block 1: 64 units, L2 weight penalty, dropout, then batch norm.
    Dense(64, activation="relu", kernel_regularizer=l2(0.003)),
    Dropout(0.6),
    BatchNormalization(),

    # Dense block 2: 32 units, same regularisation scheme.
    Dense(32, activation="relu", kernel_regularizer=l2(0.003)),
    Dropout(0.6),
    BatchNormalization(),

    # Dense blocks 3 and 4: 16 and 8 units with heavier dropout and no batch norm.
    Dense(16, activation="relu", kernel_regularizer=l2(0.003)),
    Dropout(0.7),

    Dense(8, activation="relu", kernel_regularizer=l2(0.003)),
    Dropout(0.7),

    # Single sigmoid output, paired with binary cross-entropy below.
    Dense(1, activation="sigmoid")
])

# Adam with the learning rate configured above.
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# Stop once val_loss has not improved for 2 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# After 1 epoch without val_loss improvement, multiply the learning rate by 0.3,
# never going below 5e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=1,
    min_lr=0.00005
)

# Fit on the training split and monitor the validation split; `epoch` is an
# upper bound because early stopping may end training sooner.
history = model.fit(
    X_train, y_train,
    epochs=epoch,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr]
)


Epoch 1/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5100 - loss: 1.5501 - val_accuracy: 0.4823 - val_loss: 1.0940 - learning_rate: 5.0000e-04
Epoch 2/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4911 - loss: 1.3445 - val_accuracy: 0.4823 - val_loss: 1.0767 - learning_rate: 5.0000e-04
Epoch 3/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5102 - loss: 1.2337 - val_accuracy: 0.4823 - val_loss: 1.0605 - learning_rate: 5.0000e-04
Epoch 4/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5329 - loss: 1.1322 - val_accuracy: 0.4823 - val_loss: 1.0459 - learning_rate: 5.0000e-04
Epoch 5/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5250 - loss: 1.1111 - val_accuracy: 0.4823 - val_loss: 1.0324 - learning_rate: 5.0000e-04
Epoch 6/20
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

Results

In [47]:
# Held-out test split: `evaluate` returns the loss followed by the compiled
# metric (accuracy).
test_metrics = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f}\n")

# Validation split, scored the same way for a side-by-side comparison.
val_metrics = model.evaluate(x=X_val, y=y_val)
val_loss, val_acc = val_metrics
print(f"Validation accuracy: {val_acc:.4f}")

[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9605 - loss: 0.5353 
Test accuracy: 0.9618

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9680 - loss: 0.5278 
Validation accuracy: 0.9676


Benchmarking

In [48]:
# Benchmark the trained model on a separate, semicolon-delimited CSV whose
# "Text" and "Label" columns are read below.
new_data = pd.read_csv("../../datasets/validation_dataset.csv", delimiter=";")

# Encode with the tokenizer that was fitted on the training split.
X_new_seq = tokenizer.texts_to_sequences(new_data["Text"])
# Pad to the width of the training matrix instead of repeating the literal, so
# the benchmark input always matches what the model was trained on.
X_new = pad_sequences(X_new_seq, maxlen=X_train.shape[1])

# One sigmoid score per row, flattened from (n, 1) to (n,).
predictions = model.predict(X_new).flatten()

# Scores above 0.5 are reported as "AI", everything else as "Human"; these
# strings are compared directly against the benchmark's "Label" column.
# NOTE(review): the saved output shows several rows with the identical score
# 0.576500 -- presumably texts that encode to the same (possibly empty)
# sequence; worth confirming.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Predicted label and raw score, row-aligned with `new_data`.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Ground-truth labels come from the same benchmark file.
ground_truth = new_data["Label"]

# Fraction of rows whose predicted label equals the ground-truth label.
accuracy = (output_df["Label"] == ground_truth).mean()

print(f"Accuracy: {accuracy:.4f}")

# Put predictions and ground truth side by side for inspection.
comparison_df = output_df.copy()
comparison_df["Label_actual"] = ground_truth

# Rows where the predicted label disagrees with the ground truth.
misclassified = comparison_df[comparison_df["Label"] != comparison_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Accuracy: 0.6500

Misclassified Samples:
    Label  Prediction Label_actual
2   Human    0.402408           AI
5   Human    0.011797           AI
6   Human    0.244139           AI
9   Human    0.018257           AI
11  Human    0.014018           AI
12  Human    0.056686           AI
13  Human    0.192245           AI
16  Human    0.364192           AI
18  Human    0.337724           AI
22  Human    0.059323           AI
24  Human    0.052490           AI
28  Human    0.038217           AI
32     AI    0.576500        Human
35  Human    0.317834           AI
37  Human    0.424016           AI
38  Human    0.471474           AI
39  Human    0.478572           AI
42  Human    0.463638           AI
48     AI    0.576500        Human
50     AI    0.576500        Human
53  Human    0.005264           AI
57  Human    0.188895           AI
59  Human    0.031762           AI
61  Human    0.287380           AI
67  Human   

In [49]:
### These other hyperparameter combinations also provide good results,
### i.e., accuracy >= 0.6 on the professor data

# Epochs: 10, Batch Size: 16, Learning Rate: 0.001
# Epochs: 10, Batch Size: 16, Learning Rate: 0.0002
# Epochs: 10, Batch Size: 32, Learning Rate: 0.0005
# Epochs: 20, Batch Size: 16, Learning Rate: 0.0002
# Epochs: 20, Batch Size: 64, Learning Rate: 0.0002