#Install & Imports

In [None]:
!pip install datasets huggingface_hub regex nltk

from dotenv import load_dotenv
import os
from huggingface_hub import login
from datasets import load_dataset
import pandas as pd
import re
import regex as re2
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from collections import Counter
import unicodedata
import numpy as np
import math

#Loading the dataset



In [None]:
# Read variables from a local .env file into the process environment, then
# authenticate this session against the Hugging Face Hub.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")  # None when the variable is not set
login(token=hf_token)

In [None]:
# Load the dataset identified by its Hub repository id.
dataset = load_dataset("KFUPM-JRCAI/arabic-generated-abstracts")
# Bare expression: in a notebook cell this displays the loaded object.
dataset

In [None]:
# Merge the three dataset splits into a single dataframe of source records
import pandas as pd

split_names = ("by_polishing", "from_title", "from_title_and_content")
splits = {name: dataset[name] for name in split_names}

df_list = []
for split_name, split_data in splits.items():
    # Tag each record with the split it came from and with a target label.
    # Label convention used in this notebook: 1 = human-written,
    # 0 = AI-generated. Every record gets 1 here; no 0-labelled rows are
    # created in this cell.
    temp_df = pd.DataFrame(split_data).assign(source_split=split_name, label=1)
    df_list.append(temp_df)

# One frame holding the records of all splits, re-indexed from 0
df_human = pd.concat(df_list, ignore_index=True)

print("Human dataframe shape:", df_human.shape)
df_human.head()

In [None]:
# Turn every AI-generated abstract into its own row

# generated_by tag -> df_human column that holds that model's abstract
ai_columns = {
    "allam": "allam_generated_abstract",
    "jais": "jais_generated_abstract",
    "llama": "llama_generated_abstract",
    "openai": "openai_generated_abstract",
}

# Four rows per source record, emitted record by record in the order of
# ai_columns (row order matters for the seeded split performed later).
ai_rows = [
    {
        "abstract_text": row[column],
        "source_split": row["source_split"],
        "generated_by": model_name,
        "label": 0,  # AI
    }
    for _, row in df_human.iterrows()
    for model_name, column in ai_columns.items()
]

df_ai = pd.DataFrame(ai_rows)

# Human-written abstracts reshaped to the same four columns
df_h = (
    df_human[["original_abstract", "source_split"]]
    .rename(columns={"original_abstract": "abstract_text"})
    .assign(generated_by="human", label=1)
)

# Human rows first, then AI rows, with a fresh 0..n-1 index
df = pd.concat([df_h, df_ai], ignore_index=True)

print("Final unified dataset shape:", df.shape)
df.head(10)

#Data Preprocessing

In [None]:
# Preprocessing utilities
import re
import string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" corpus; stopwords.words("arabic") is read from it
# further down in this notebook.
nltk.download("stopwords")



In [None]:
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritic marks deleted.

    Every code point in the ranges U+0617-U+061A and U+064B-U+0652 is
    removed; all other characters are returned unchanged.
    """
    diacritics_pattern = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    return diacritics_pattern.sub('', text)

def normalize_arabic(text):
    """Unify common Arabic letter variants and blank out foreign characters.

    The substitutions run in the listed order:
      * alef variants (with hamza above/below or madda) become bare alef,
      * alef maqsura and yeh-with-hamza become yeh,
      * waw-with-hamza becomes waw,
      * teh marbuta becomes heh,
      * every run of characters that are neither a space nor inside the
        range used by the last pattern is replaced by ONE space.

    Returns the rewritten string; whitespace is not collapsed or trimmed.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ئ", "ي"),
        ("ة", "ه"),
        ("[^؀-ۿ ]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Stop-word set used by preprocess_text().
# preprocess_text() tests tokens against this set only AFTER the text has been
# through remove_diacritics() and normalize_arabic(). A list entry that
# contains a letter those functions rewrite (hamza-carrying alef, alef
# maqsura, teh marbuta, ...) could therefore never equal a token. Adding the
# normalized spelling of every entry makes the filter compare like with like;
# the original spellings are kept so the set is a superset of the old one.
_raw_arabic_stopwords = stopwords.words("arabic")
arabic_stopwords = set(_raw_arabic_stopwords)
arabic_stopwords.update(
    normalize_arabic(remove_diacritics(word)).strip()
    for word in _raw_arabic_stopwords
)
arabic_stopwords.discard("")  # in case an entry normalizes to nothing

# ISRI stemmer instance applied to each surviving token in preprocess_text()
stemmer = ISRIStemmer()

def preprocess_text(text):
    """Clean one abstract and return it as a space-joined string of stems.

    Steps, in order: coerce the input with ``str()``, strip diacritics,
    normalize letters, split on whitespace, drop tokens found in
    ``arabic_stopwords``, stem the rest with ``stemmer``.
    """
    normalized = normalize_arabic(remove_diacritics(str(text)))
    stems = (
        stemmer.stem(token)
        for token in normalized.split()
        if token not in arabic_stopwords
    )
    return " ".join(stems)

In [None]:
# Apply preprocessing to the dataset: clean_text holds
# preprocess_text(abstract_text) for every row.
df["clean_text"] = df["abstract_text"].apply(preprocess_text)

# Show the first two rows
df.head(2)

#Feature Engineering

In [None]:
#TOKENIZATION FUNCTIONS
def simple_word_tokenize(text):
    """Split ``text`` into a flat list of token strings.

    At each position the first alternative that matches wins: a run of
    Arabic-script characters, otherwise a run of word characters, otherwise a
    single character that is neither whitespace nor a word character.
    Whitespace itself is never returned as a token.
    """
    token_pattern = r"\p{Arabic}+|\w+|[^\s\w]"
    return re2.findall(token_pattern, text, flags=re2.VERSION1)

def sentence_tokenize(text):
    """Split ``text`` into sentences at terminal punctuation.

    A split happens at whitespace that directly follows one of
    ``. ? !``, the Arabic question mark (U+061F) or the Arabic semicolon
    (U+061B). The lookbehind keeps the punctuation mark attached to the
    sentence it ends.

    Parameters
    ----------
    text : str
        Text to split. Any non-string value (e.g. ``None`` or NaN from a
        dataframe cell) yields an empty list, matching ``paragraph_tokenize``.

    Returns
    -------
    list of str
        Stripped, non-empty sentences in their original order.
    """
    # Missing cells arrive as None/NaN; re.split would raise TypeError on them.
    if not isinstance(text, str):
        return []
    parts = re.split(r'(?<=[\.\?\!\u061F\u061B])\s+', text)
    return [p.strip() for p in parts if p.strip()]

def paragraph_tokenize(text):
    """Split ``text`` into paragraphs separated by blank lines.

    A separator is two newlines with optional whitespace before, between and
    after them. Non-string input yields an empty list. The returned
    paragraphs are stripped and empty ones are dropped.
    """
    if isinstance(text, str):
        chunks = re.split(r'\s*\n\s*\n\s*', text.strip())
        trimmed = (chunk.strip() for chunk in chunks)
        return [chunk for chunk in trimmed if chunk]
    return []

In [None]:
# Derive token, word, sentence and paragraph columns.
# tokens/words come from the cleaned text; sentences/paragraphs come from the
# original abstract.
df["tokens"] = df["clean_text"].apply(
    lambda text: [piece for piece in simple_word_tokenize(text) if piece.strip()]
)
# A "word" is any token that contains at least one word character
df["words"] = df["tokens"].apply(
    lambda pieces: [piece for piece in pieces if re.search(r'\w', piece) is not None]
)
df["sentences"] = df["abstract_text"].apply(sentence_tokenize)
df["paragraphs"] = df["abstract_text"].apply(paragraph_tokenize)

In [None]:
# ===== Tokens and words taken from the ORIGINAL (unprocessed) abstract =====
df["tokens_raw"] = df["abstract_text"].apply(
    lambda text: [piece for piece in simple_word_tokenize(text) if piece.strip()]
)

# Keep only the tokens that contain at least one word character
df["words_raw"] = df["tokens_raw"].apply(
    lambda pieces: [piece for piece in pieces if re.search(r"\w", piece) is not None]
)

#Creating the required features

In [None]:
# F3 — Digits / Characters
# Measured on the ORIGINAL abstract. clean_text is unsuitable here:
# normalize_arabic() replaces every character outside its allowed range
# (which includes the ASCII digits 0-9) with a space, and stop-word removal
# plus stemming change the character count, so both the numerator and the
# denominator would be distorted.
# Non-string or empty cells get 0 (avoids a division by zero).
df["f003_digits_over_C"] = df["abstract_text"].apply(
    lambda t: len(re.findall(r'\d', t)) / len(t)
    if isinstance(t, str) and len(t) > 0 else 0
)

In [None]:

#feature 26:Number of Commas
# Arabic prose normally uses the Arabic comma "،" (U+060C) rather than the
# ASCII ",", so both are counted. Non-string cells count as 0.
df["f026_commas"] = df["abstract_text"].apply(
    lambda t: t.count(",") + t.count("\u060C") if isinstance(t, str) else 0
)

In [None]:
#feature 49 :Number of Arabic Particles
# Lookup set of particle spellings; matching below is by exact string equality
arabic_particles = {
    'من','إلى','عن','على','في','ب','ك','ل',
    'و','أو','ثم','بل','لكن',
    'لا','لم','لن','ما',
    'هل','إن','إذا','أين','متى','كيف','كم','أيان',
    'قد','لمّا','حتى','أن','إنّ','إذن'
}


def _count_particles(words):
    """Count the entries of ``words`` found in ``arabic_particles`` (0 if not a list)."""
    if not isinstance(words, list):
        return 0
    return len([w for w in words if w in arabic_particles])


# Counted on words taken from the original (unprocessed) abstract
df["f049_num_particles_raw"] = df["words_raw"].apply(_count_particles)

In [None]:
#Feature 72 : Count of Third-Person Pronouns
# Lookup set of the spellings counted by this feature; matching is by exact
# string equality.
third_person_pronouns = {
    "هو","هي","هم","هما","هن",
    "ذلك","تلك","ذلكم","ذلكما","تلكم"
}


def _count_third_person(words):
    """Count the entries of ``words`` found in ``third_person_pronouns`` (0 if not a list)."""
    if not isinstance(words, list):
        return 0
    return len([w for w in words if w in third_person_pronouns])


# Counted on words taken from the original (unprocessed) abstract
df["f072_third_person_pronouns_raw"] = df["words_raw"].apply(_count_third_person)


In [None]:
!pip install transformers torch


In [None]:
from transformers import pipeline

# Sentiment classifier that polarity_of() calls once per sentence
sentiment = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
)

In [None]:
#create Polarity Function
def polarity_of(sentence, classifier=None):
    """Map a sentence to a sentiment polarity of 1, -1 or 0.

    Parameters
    ----------
    sentence : str
        Sentence to classify. A non-string or blank value returns 0 without
        calling the classifier.
    classifier : callable, optional
        Called with the sentence truncated to its first 512 characters; it
        must return a sequence whose first item is a mapping with a
        ``"label"`` key. Defaults to the module-level ``sentiment`` pipeline.

    Returns
    -------
    int
        1 for a positive label, -1 for a negative label, 0 otherwise.
    """
    if not isinstance(sentence, str) or sentence.strip() == "":
        return 0

    if classifier is None:
        classifier = sentiment

    label = str(classifier(sentence[:512])[0]["label"]).strip().lower()

    # Compare on a case-insensitive prefix so that "POS"/"NEG" as well as
    # spelled-out names such as "positive"/"Negative" are recognised. An exact
    # comparison against "POS"/"NEG" returns 0 for every sentence when the
    # model reports the longer names, which makes the derived feature constant.
    if label.startswith("pos"):
        return 1
    if label.startswith("neg"):
        return -1
    return 0

In [None]:
#create Polarity Shift Frequency function
def polarity_shift_frequency(sentences):
    """Count how often polarity changes between consecutive sentences.

    ``sentences`` must be a list with at least two items; anything else
    returns 0. Each sentence is scored with ``polarity_of`` and every adjacent
    pair with differing scores adds one to the result.
    """
    too_short = not isinstance(sentences, list) or len(sentences) < 2
    if too_short:
        return 0

    polarities = [polarity_of(s) for s in sentences]

    shifts = 0
    for previous, current in zip(polarities, polarities[1:]):
        if previous != current:
            shifts += 1
    return shifts


In [None]:
# Feature 95: apply polarity_shift_frequency to each row's list of sentences
df["f095_polarity_shift"] = df["sentences"].apply(polarity_shift_frequency)


In [None]:
# Show the first two rows, now including the engineered feature columns
df.head(2)

#Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Each source record contributes one human row (label 1) and four AI rows
# (label 0), so the classes are imbalanced. Stratifying on the label keeps the
# same class ratio in the train, validation and test sets.
# NOTE(review): rows are split individually. If the same original abstract
# appears in more than one dataset split, or a human abstract and its AI
# rewrites land on different sides, related texts can straddle train/test —
# confirm whether a group-wise split is needed.

# First split: Train 70%, Temp 30%
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=42,
    shuffle=True,
    stratify=df["label"],
)

# Second split: Temp 30% → 15% Validation, 15% Test
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    shuffle=True,
    stratify=temp_df["label"],
)

# Show sizes
print("TOTAL:", len(df))
print("TRAIN:", len(train_df))
print("VAL:", len(val_df))
print("TEST:", len(test_df))

#TF-IDF Features from Cleaned Text



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over the cleaned text: unigrams and bigrams, vocabulary
# capped at 5000 entries.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    max_features=5000,
)

# Vocabulary and IDF weights are learned from the training set only; the
# validation and test sets are merely projected onto them.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["clean_text"])
X_val_tfidf = tfidf_vectorizer.transform(val_df["clean_text"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["clean_text"])

print("TF-IDF shapes:")
for split_label, matrix in (
    ("Train:", X_train_tfidf),
    ("Validation:", X_val_tfidf),
    ("Test:", X_test_tfidf),
):
    print(split_label, matrix.shape)

#Define X and y

In [None]:
# Targets: the label column of each split
y_train, y_val, y_test = (
    frame["label"] for frame in (train_df, val_df, test_df)
)

# Features: the TF-IDF matrices built from clean_text
X_train, X_val, X_test = X_train_tfidf, X_val_tfidf, X_test_tfidf

print("X and y are ready for ML models.")
for split_label, features, target in (
    ("Train:", X_train, y_train),
    ("Validation:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    print(split_label, features.shape, target.shape)

#Task 4.1 — Baseline Model  — Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline classifier: logistic regression fitted on the training features
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Score the fitted model on the validation set
y_val_pred = lr_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

In [None]:
# Held-out evaluation of the logistic-regression baseline
y_test_pred = lr_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Test Accuracy:", test_accuracy)
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))

# Confusion matrix drawn as an annotated heatmap
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_test_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()

#Task 4.2: Traditional ML Models




In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Fitted estimators, keyed by model name (read again by the evaluation cell)
models = {}


def _fit_and_validate(key, accuracy_caption, estimator):
    """Fit ``estimator`` on the training set, print its validation metrics,
    store it in ``models[key]`` and return its validation predictions."""
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_val)
    print(accuracy_caption, accuracy_score(y_val, predictions))
    print(classification_report(y_val, predictions))
    models[key] = estimator
    return predictions


# Support Vector Machine with a linear kernel
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
y_val_pred_svm = _fit_and_validate('SVM', "SVM Validation Accuracy:", svm_model)

# Random Forest: 200 trees, unlimited depth, all CPU cores
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
y_val_pred_rf = _fit_and_validate(
    'RandomForest', "Random Forest Validation Accuracy:", rf_model
)

# XGBoost: 200 boosting rounds, depth 6, learning rate 0.1
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
y_val_pred_xgb = _fit_and_validate(
    'XGBoost', "XGBoost Validation Accuracy:", xgb_model
)

In [None]:
# Test-set evaluation of every fitted traditional model
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Keys of the `models` dict, in reporting order
model_names = ['SVM', 'RandomForest', 'XGBoost']

for name in model_names:
    model = models[name]
    y_test_pred = model.predict(X_test)

    # Text metrics
    print(f"\n===== {name} Test Evaluation =====")
    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion matrix on a dedicated 5x4 figure
    cm = confusion_matrix(y_test, y_test_pred)
    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

#Task 4.3 — Deep Learning Models

##Feedforward Neural Network on Top of BERT Embeddings

###Step1: Extract BERT Embeddings (Sentence-level)

In [None]:
!pip install sentence-transformers



####We use sentence-transformers because it produces powerful text embeddings and is excellent for classification.

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Multilingual sentence-embedding model used as a fixed feature extractor
bert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# One embedding vector per row of clean_text, for each of the three splits.
# NOTE(review): clean_text is stop-word-filtered and stemmed; a pretrained
# encoder presumably works better on the unprocessed abstract_text — confirm
# before changing, as it alters every downstream number.
X_train_emb, X_val_emb, X_test_emb = (
    bert_model.encode(frame["clean_text"].tolist(), convert_to_numpy=True)
    for frame in (train_df, val_df, test_df)
)

# Targets as plain NumPy arrays (this rebinds y_train / y_val / y_test)
y_train, y_val, y_test = (
    frame["label"].values for frame in (train_df, val_df, test_df)
)

print("Train embedding shape:", X_train_emb.shape)

###Step 2: Build a Feedforward Neural Network

In [None]:
#import tensorflow as tf
from tensorflow.keras import layers
# Imported under an alias on purpose: `from tensorflow.keras import models`
# would rebind the name `models`, which already refers to the dict of fitted
# SVM / RandomForest / XGBoost estimators, and break any later `models[...]`
# lookup (for example when the evaluation cell is re-run).
from tensorflow.keras import models as keras_models

# Feedforward binary classifier on top of the sentence embeddings:
# two ReLU hidden layers with dropout, then a single sigmoid output unit.
ffnn_model = keras_models.Sequential([
    layers.Input(shape=(X_train_emb.shape[1],)),  # one input per embedding dimension
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid")   # binary classification
])

ffnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

ffnn_model.summary()

###Step3: Train the Model

In [None]:
# Train for 10 epochs in mini-batches of 32, reporting validation metrics
# after every epoch; `history` keeps the object returned by fit().
history = ffnn_model.fit(
    x=X_train_emb,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val_emb, y_val),
)

###Step 4: Evaluate on Test Set

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The network ends in a single sigmoid unit, so predict() yields one
# probability per sample as a column. Threshold at 0.5 and flatten to 1-D so
# the predictions have the same shape as y_test (and as the predictions
# produced by the scikit-learn models).
y_test_pred = (ffnn_model.predict(X_test_emb) > 0.5).astype(int).ravel()

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

In [None]:
#Save models
