# Imports

### TensorFlow

In [41]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Processing Data

In [43]:
dataset = pd.read_csv("../datasets/AI_Human.csv")

train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["text"], dataset["generated"], test_size=0.2, random_state=42, stratify=dataset["generated"]
)

In [44]:
vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(train_texts).toarray()
X_test = vectorizer.fit_transform(test_texts).toarray()

y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Neural Network

In [45]:
model = Sequential([
    Dense(64, activation="relu", input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    Dense(32, activation="relu", kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    Dense(16, activation="relu", kernel_regularizer=l2(0.002)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation="sigmoid")
])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [46]:
optimizer = Adam(learning_rate=0.005)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stop])

Epoch 1/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 8ms/step - accuracy: 0.9726 - loss: 0.1827 - val_accuracy: 0.7958 - val_loss: 0.6966
Epoch 2/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 6ms/step - accuracy: 0.9901 - loss: 0.0826 - val_accuracy: 0.7969 - val_loss: 0.6879
Epoch 3/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 6ms/step - accuracy: 0.9904 - loss: 0.0779 - val_accuracy: 0.7079 - val_loss: 1.2545
Epoch 4/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 6ms/step - accuracy: 0.9903 - loss: 0.0755 - val_accuracy: 0.7862 - val_loss: 0.7363
Epoch 5/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 5ms/step - accuracy: 0.9906 - loss: 0.0714 - val_accuracy: 0.7388 - val_loss: 0.8682
Epoch 6/20
[1m12181/12181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 6ms/step - accuracy: 0.9905 - loss: 0.0705 - val_accuracy: 0.7830 - val_loss: 0.755

<keras.src.callbacks.history.History at 0x2170c2e21b0>

In [47]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

[1m3046/3046[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.7972 - loss: 0.6918
Loss: 0.6879, Accuracy: 0.7969


In [48]:
# Load new data
new_data = pd.read_csv("dataset_inputs.csv", delimiter="\t")  # Ensure correct delimiter

# Transform text using the same vectorizer
X_new = vectorizer.transform(new_data["Text"]).toarray()  # Ensure column name matches

# Make predictions
predictions = model.predict(X_new).flatten()  # Get raw prediction values

# Convert predictions to labels
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Create output DataFrame with both predicted labels and raw prediction values
output_df = pd.DataFrame({"ID": new_data["ID"], "Label": labels, "Prediction": predictions})

# Save to TSV (tab-separated file)
output_df.to_csv("predictions_output.csv", index=False, sep="\t")

print("Predictions saved to predictions_output.tsv")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step
Predictions saved to predictions_output.tsv


In [49]:
# Load the correct labels (ground truth)
ground_truth = pd.read_csv("dataset_outputs.csv", delimiter="\t")  # Ensure it's tab-separated

# Load the predictions
predictions = pd.read_csv("predictions_output.csv", delimiter="\t")

# Merge the two datasets on "ID" to align predictions with correct labels
comparison_df = predictions.merge(ground_truth, on="ID", suffixes=("_predicted", "_actual"))

# Calculate accuracy
accuracy = (comparison_df["Label_predicted"] == comparison_df["Label_actual"]).mean()

# Print results
print(f"Accuracy: {accuracy:.4f}")

# Show misclassified samples
misclassified = comparison_df[comparison_df["Label_predicted"] != comparison_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)


Accuracy: 0.6000

Misclassified Samples:
       ID Label_predicted  Prediction Label_actual
4    D1-5              AI    0.984370        Human
6    D1-7              AI    0.592310        Human
8    D1-9              AI    0.999896        Human
9   D1-10           Human    0.150132           AI
11  D1-12           Human    0.326781           AI
12  D1-13              AI    0.790196        Human
18  D1-19              AI    0.995056        Human
20  D1-21              AI    0.709150        Human
22  D1-23              AI    0.998602        Human
26  D1-27              AI    0.973348        Human
28  D1-29              AI    0.999905        Human
29  D1-30           Human    0.494273           AI
