In [None]:
# Imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dropout, Dense, ReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
import pandas as pd
import matplotlib.pyplot as plt
import ast
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [None]:
# Load the labelled corpus and carve out an 80/20 train/test split.
dataset = pd.read_csv("../datasets/AI_Human.csv")

# The split is stratified on the "generated" label column and uses a fixed
# seed so that re-running the notebook reproduces the same partitions.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["text"],
    dataset["generated"],
    stratify=dataset["generated"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Turn the raw texts into dense TF-IDF feature matrices (at most 5000 terms).
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training split only.
X_train = vectorizer.fit_transform(train_texts).toarray()

# Re-use the already fitted vectorizer for the test split. Calling
# fit_transform here would rebuild the vocabulary from the test texts, so the
# test columns would no longer line up with the columns the model was trained
# on, and the vectorizer used later for inference would be the test-fitted one.
X_test = vectorizer.transform(test_texts).toarray()

# Labels as plain NumPy arrays for Keras.
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [None]:
# X_train is a rectangular dense matrix, so every row has the same width.
# Read that width straight from the array shape instead of iterating over
# every row in Python (which also fails on an empty training set).
max_length = X_train.shape[1]
print(max_length)

In [None]:
# Sanity check: show the dimensions of the training feature matrix.
print(np.shape(X_train))

In [None]:
# Build, compile and train a small bidirectional-RNN binary classifier.
#
# NOTE(review): X_train holds TF-IDF weights (floats), whereas an Embedding
# layer is normally fed integer token ids -- confirm that this input
# representation is what was intended before relying on the results.

# Hyperparameters
embedding_dim = 32
vocab_size = 5000
batch_size = 256
epochs = 1

model = keras.models.Sequential()
model.add(keras.Input(shape=(max_length,)))
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(keras.layers.Bidirectional(keras.layers.SimpleRNN(32, activation='tanh')))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(32, activation='relu'))
# Sigmoid output so the model emits a probability in [0, 1]. The "accuracy"
# metric and the 0.5 cut-off applied when turning predictions into labels both
# compare the output against 0.5, which is only correct for probabilities;
# with a raw-logit output that threshold would sit at roughly p = 0.62.
model.add(keras.layers.Dense(1, activation='sigmoid'))

# The loss must agree with the output layer: probabilities in, not logits.
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.RMSprop(learning_rate=0.001)
metrics = ["accuracy"]

model.compile(loss=loss, optimizer=optim, metrics=metrics)

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2)

In [None]:
# Score the trained model on the held-out split. With a single compiled
# metric, evaluate() yields two values: the loss and the accuracy.
test_scores = model.evaluate(x=X_test, y=y_test)
loss, accuracy = test_scores
print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
# Persist the trained model next to the other saved models.
model_path = '../models/model_RNN_tf.h5'
model.save(model_path)

In [None]:
# Score an unlabelled, tab-separated input file and write one row per ID.

# Single source of truth for the output location, so the file that is written
# and the file named in the status message cannot drift apart.
output_path = "predictions_output.csv"

# Load new data (tab-delimited; the code below reads its "ID" and "Text" columns)
new_data = pd.read_csv("dataset_inputs.csv", delimiter="\t")

# Transform text using the same, already fitted vectorizer
X_new = vectorizer.transform(new_data["Text"]).toarray()

# One raw model output per input row, flattened to a 1-D array
predictions = model.predict(X_new).flatten()

# Convert predictions to labels.
# NOTE(review): the 0.5 cut-off is only meaningful when the model's output is
# a probability -- confirm against the model's final layer and loss settings.
labels = ["AI" if pred > 0.5 else "Human" for pred in predictions]

# Output table with both the predicted label and the raw prediction value
output_df = pd.DataFrame({"ID": new_data["ID"], "Label": labels, "Prediction": predictions})

# The content is tab-separated even though the file keeps a .csv extension;
# the evaluation step reads this same file name back with a tab delimiter.
output_df.to_csv(output_path, index=False, sep="\t")

# Report the path that was actually written (the message previously named a
# .tsv file that is never created).
print(f"Predictions saved to {output_path}")


In [None]:
# Score the saved predictions against the reference labels.
# Both files are tab-separated.
ground_truth = pd.read_csv("dataset_outputs.csv", sep="\t")
predictions = pd.read_csv("predictions_output.csv", sep="\t")

# Align the two tables on "ID". Columns present in both frames (here "Label")
# receive the suffixes; IDs found in only one file are dropped by the default
# inner join, so the accuracy below covers matched rows only.
comparison_df = pd.merge(
    predictions,
    ground_truth,
    on="ID",
    suffixes=("_predicted", "_actual"),
)

# Row-wise agreement between predicted and reference label
is_correct = comparison_df["Label_predicted"] == comparison_df["Label_actual"]

# Fraction of matched rows that were labelled correctly
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy:.4f}")

# Everything that was not a match, for manual inspection
misclassified = comparison_df[~is_correct]
print("\nMisclassified Samples:")
print(misclassified)