Imports

In [102]:
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

Read CSV and Split Data

In [103]:
# Pin the global RNG state of both the standard library and NumPy.
random.seed(42)
np.random.seed(42)

# Read the full labelled corpus; the "Text" and "Label" columns are used below.
dataset = pd.read_csv("../../datasets/final_dataset.csv")

# Both splits share the same hold-out fraction and seed.
split_options = dict(test_size=0.2, random_state=42)

# Stage 1: hold out 20% of all rows as the test set, stratified on the label.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset["Text"],
    dataset["Label"],
    stratify=dataset["Label"],
    **split_options,
)

# Stage 2: carve 20% of the remaining rows out as the validation set
# (roughly 64% train / 16% validation / 20% test overall), again stratified.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    stratify=train_labels,
    **split_options,
)

Process Data

In [104]:
# TF-IDF features, capped at a 10,000-term vocabulary.
vectorizer = TfidfVectorizer(max_features=10_000)

# Fit the vocabulary/IDF weights on the training texts only; the validation
# and test texts are merely transformed with the already-fitted vectorizer.
X_train = vectorizer.fit_transform(train_texts)
X_val, X_test = (
    vectorizer.transform(texts) for texts in (val_texts, test_texts)
)

# Labels as plain NumPy arrays, one per split.
y_train, y_val, y_test = (
    np.array(labels) for labels in (train_labels, val_labels, test_labels)
)

Logistic Regression Model

In [105]:
# Two-step pipeline: scale the TF-IDF features, then fit a logistic regression.
# with_mean=False scales without centering (the features come straight from
# the TF-IDF vectorizer, which produces a sparse matrix).
scaling_step = ('scaler', StandardScaler(with_mean=False))
classifier_step = ('log_reg', LogisticRegression(max_iter=1000))
log_reg_model = Pipeline(steps=[scaling_step, classifier_step])

log_reg_model.fit(X_train, y_train)

# Predict on both held-out splits first, then score them.
val_predictions = log_reg_model.predict(X_val)
test_predictions = log_reg_model.predict(X_test)

val_accuracy = accuracy_score(y_val, val_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Validation accuracy: {val_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")

Validation accuracy: 0.9877
Test accuracy: 0.9815


Benchmarking

In [106]:
# Benchmark the trained model on a separate, semicolon-delimited CSV that
# provides its own "Text" and "Label" columns.
new_data = pd.read_csv("../../datasets/validation_dataset.csv", delimiter=";")

# Reuse the vectorizer fitted on the training split so the feature columns
# match what the model was trained on.
X_new = vectorizer.transform(new_data["Text"])

# predict_proba returns one column per class, ordered like the classifier's
# classes_ attribute. Look the "AI" column up by name instead of hard-coding
# index 1: with string training labels, classes_ would be sorted as
# ["AI", "Human"] and column 1 would be P(Human).
# NOTE(review): when the model was not trained on the literal label "AI"
# (e.g. numeric 0/1 labels), this falls back to column 1 as before, which
# assumes class 1 means AI — confirm against final_dataset.csv.
known_classes = log_reg_model.named_steps["log_reg"].classes_.tolist()
ai_column = known_classes.index("AI") if "AI" in known_classes else 1
predictions = log_reg_model.predict_proba(X_new)[:, ai_column]

# A sample is labelled "AI" only when its AI probability is strictly above
# the decision threshold; everything else is labelled "Human".
AI_THRESHOLD = 0.5
labels = ["AI" if pred > AI_THRESHOLD else "Human" for pred in predictions]

# Predicted label next to the probability it was derived from.
output_df = pd.DataFrame({
    "Label": labels,
    "Prediction": predictions
})

# Ground-truth labels come from the same benchmark file.
ground_truth = new_data["Label"]

# Fraction of rows whose predicted label equals the ground truth.
accuracy = (output_df["Label"] == ground_truth).mean()

print(f"Accuracy: {accuracy:.4f}")

# Put predictions and ground truth side by side for inspection.
comparison_df = output_df.copy()
comparison_df["Label_actual"] = ground_truth

# Keep only the rows where prediction and ground truth disagree.
misclassified = comparison_df[comparison_df["Label"] != comparison_df["Label_actual"]]
print("\nMisclassified Samples:")
print(misclassified)


Accuracy: 0.5750

Misclassified Samples:
    Label  Prediction Label_actual
2   Human    0.002311           AI
5   Human    0.000043           AI
6   Human    0.001837           AI
9   Human    0.099504           AI
10  Human    0.135632           AI
11  Human    0.159898           AI
12  Human    0.003541           AI
13  Human    0.002768           AI
18  Human    0.022847           AI
22  Human    0.000088           AI
24  Human    0.014780           AI
28  Human    0.000171           AI
34  Human    0.196442           AI
35  Human    0.159667           AI
37  Human    0.039608           AI
38  Human    0.035902           AI
39  Human    0.099126           AI
45  Human    0.470698           AI
50     AI    0.547938        Human
51  Human    0.029603           AI
53  Human    0.010637           AI
55  Human    0.167861           AI
57  Human    0.002813           AI
59  Human    0.002952           AI
61  Human    0.151567           AI
63  Human    0.009951           AI
65  Human    0