## Task 1.2: Use the datasets library from Hugging Face to download the arabic-generated-abstracts dataset directly into a Python environment (using Google Colab).

In [None]:
# !pip install datasets
# !pip install python-dotenv


In [None]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load environment variables before the token is read below
# (presumably from a local .env file — confirm where HF_TOKEN is stored).
load_dotenv()

# os.getenv returns None when HF_TOKEN is not set; login() then receives token=None.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# The loaded object is indexed by split name in the rest of this notebook.
dataset = load_dataset("KFUPM-JRCAI/arabic-generated-abstracts")
print(dataset)


In [None]:
# Show the names of the splits held by the loaded dataset
available_splits = list(dataset.keys())
print("\nAvailable splits:", available_splits)

### Task 1.3: Perform initial data exploration:



#### Explore Any Split


In [None]:
import pandas as pd

def explore_split(dataset, split_name):
    """Print an exploratory summary of one split and return it as a DataFrame.

    Args:
        dataset: Mapping from split name to an object exposing ``features``
            and ``to_pandas()``.
        split_name: Key of the split to explore.

    Returns:
        The split converted with ``to_pandas()``. When the frame has at least
        one column other than ``"label"``, a ``"text_length"`` column is added
        holding the character length of the first such column.
    """
    # `display` is only available as a builtin inside IPython/Jupyter sessions;
    # fall back to print so the function does not raise NameError elsewhere.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    print("\n\n==============================================")
    print(f" Exploring Split: {split_name}")
    print("==============================================")

    split = dataset[split_name]

    # Inspect features
    print("\n➡ Column Names and Data Types:")
    print(split.features)

    # Convert to pandas
    df = split.to_pandas()

    print("\n➡ First 5 Rows:")
    display(df.head())

    print("\n➡ Dataset Shape:")
    print(df.shape)

    print("\n➡ Columns and Types:")
    print(df.dtypes)

    # ---- Target variable check ----
    if "label" in df.columns:
        label_counts = df["label"].value_counts()
        print("\n➡ Target Variable Distribution (0 = Human, 1 = AI Generated):")
        print(label_counts)

        print("\n➡ Percentage Distribution (%):")
        print(df["label"].value_counts(normalize=True) * 100)
    else:
        print("\n⚠ No 'label' column found — skipping label distribution.")

    # ---- Text length analysis ----
    # The first non-label column is treated as the text column.
    text_columns = [c for c in df.columns if c != "label"]

    if text_columns:
        text_col = text_columns[0]
        df["text_length"] = df[text_col].astype(str).apply(len)

        print(f"\n➡ Text Length Summary for column: {text_col}")
        print(df["text_length"].describe())

        # An empty split has no first row, so .iloc[0] would raise IndexError.
        if df.empty:
            print(f"\n⚠ Split is empty — no sample text available from {text_col}.")
        else:
            print(f"\n➡ Sample Text Example from {text_col}:")
            print("\n", df[text_col].iloc[0])

    return df

In [None]:
# Splits to explore, in reporting order
splits = ["by_polishing", "from_title", "from_title_and_content"]

# Maps each split name to the DataFrame returned by explore_split
dfs = {}

for split in splits:
    explored_frame = explore_split(dataset, split)
    dfs[split] = explored_frame

#### 2- Check the distribution of the target variable (label: human vs. AI)


In [None]:
# Function to compute distribution for any split
def compute_distribution(split, split_name):
    """Print how many human vs. AI-generated abstracts a split contains.

    A cell counts as an abstract only when it actually holds text: ``None``,
    NaN and blank/whitespace-only values are ignored. Counting rows instead
    would report a fixed 1:4 ratio no matter what the columns contain.

    Args:
        split: Object indexable by column name, where each column is an
            iterable of cell values.
        split_name: Name used in the printed header.

    Returns:
        None. The counts and percentages are printed.
    """
    ai_columns = (
        "allam_generated_abstract",
        "jais_generated_abstract",
        "llama_generated_abstract",
        "openai_generated_abstract",
    )

    def has_text(value):
        # NaN is the only value that is unequal to itself.
        if value is None or value != value:
            return False
        return bool(str(value).strip())

    def count_abstracts(column):
        return sum(1 for value in split[column] if has_text(value))

    print(f"\n===== Distribution for split: {split_name} =====")

    # Human abstracts: at most one per row
    num_human = count_abstracts("original_abstract")

    # AI abstracts: at most four per row, one per generator column
    num_ai = sum(count_abstracts(column) for column in ai_columns)

    # Print raw counts
    print("Number of human abstracts:", num_human)
    print("Number of AI-generated abstracts:", num_ai)

    # Percentages
    total = num_human + num_ai
    if total > 0:
        print("Human %:", round(num_human / total * 100, 2))
        print("AI %:", round(num_ai / total * 100, 2))
    else:
        print("No data available.")


# Apply to the 3 main splits
for main_split in ("by_polishing", "from_title", "from_title_and_content"):
    compute_distribution(dataset[main_split], main_split)

#### 3- Assess data quality: check for missing values, duplicates, and inconsistencies:


Missing values → any None/NaN in columns

Duplicates → same abstract appearing multiple times

Inconsistencies → e.g. empty or whitespace-only strings (" ") or otherwise unusual values

In [None]:
import pandas as pd

splits = ["by_polishing", "from_title", "from_title_and_content"]

# For every split: report missing values, duplicates and blank cells.
# `split` and `df` stay bound after the loop (to the last split processed).
for split_name in splits:
    print("\n========================================")
    print(f"Data Quality Checks — Split: {split_name}")
    print("========================================\n")

    split = dataset[split_name]
    df = split.to_pandas()

    # 1. Missing values (None/NaN) per column
    missing_per_column = df.isna().sum()
    print(" Missing values per column:")
    print(missing_per_column)
    print("--------------------------------------------------")

    # 2. Fully duplicated rows, then repeated values inside each column
    duplicate_row_count = df.duplicated().sum()
    print(f" Number of duplicate rows: {duplicate_row_count}")

    for col in df.columns:
        repeated_values = df[col].duplicated().sum()
        print(f"  Duplicates in column '{col}': {repeated_values}")
    print("--------------------------------------------------")

    # 3. Inconsistencies: cells whose text is empty once whitespace is stripped
    print(" Empty or blank values per column:")
    for col in df.columns:
        blank_cells = sum(1 for value in df[col] if not str(value).strip())
        print(f"  Column '{col}': {blank_cells}")

    print("\n\n")  # space between splits


## Phase 2 - Preprocessing

In [None]:
# task 2.1: Arabic Text Preprocessing

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from datasets import load_dataset


In [None]:
# Download required NLTK resources
# (the 'stopwords' corpus is read later through stopwords.words("arabic"))
nltk.download('stopwords')

In [None]:
# Check columns
# NOTE(review): `df` is not created in this cell; it is whatever DataFrame an
# earlier cell left bound (the data-quality loop's last split when the notebook
# is run top to bottom) — confirm this is the intended frame.
print(df.head())

# Define Arabic text cleaning functions

In [None]:
# Remove tashkeel (diacritics)
def remove_diacritics(text):
    """Return ``text`` with every character in the tashkeel ranges deleted.

    The ranges removed are U+0617–U+061A and U+064B–U+0652.
    """
    tashkeel_pattern = r'[\u0617-\u061A\u064B-\u0652]'
    return re.compile(tashkeel_pattern).sub('', text)

In [None]:
# Normalize Arabic text
def normalize_arabic(text):
    """Apply the letter-unification rules in order, then blank out foreign runs.

    Each (pattern, replacement) pair is applied with ``re.sub`` in the order
    listed; the final pair replaces every run of characters that are neither
    in the pattern's character range nor a space with a single space.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ئ", "ي"),
        ("ة", "ه"),
        ("[^؀-ۿ ]+", " "),  # remove non-Arabic chars
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Initialize stopwords and stemmer
# preprocess_text() compares tokens against this set only AFTER the text has
# gone through remove_diacritics() and normalize_arabic(). The stopword list
# must therefore go through the same two steps; otherwise any entry containing
# a character those functions rewrite or strip can never equal a token.
# The raw forms are kept as well, which is harmless.
_raw_arabic_stopwords = set(stopwords.words("arabic"))
arabic_stopwords = _raw_arabic_stopwords | {
    normalized
    for normalized in (
        normalize_arabic(remove_diacritics(word)).strip()
        for word in _raw_arabic_stopwords
    )
    if normalized  # drop entries that normalize to an empty string
}
stemmer = ISRIStemmer()

In [None]:
# Full preprocessing pipeline
def preprocess_text(text):
    """Clean one text value and return its stemmed tokens joined by spaces.

    Steps, in order: convert to ``str``, strip diacritics, normalize letters,
    split on whitespace, drop tokens found in ``arabic_stopwords``, stem the
    remaining tokens with ``stemmer``.
    """
    cleaned = normalize_arabic(remove_diacritics(str(text)))
    stems = [
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in arabic_stopwords
    ]
    return " ".join(stems)

In [None]:
# Apply preprocessing
# Every listed column gets a sibling "<column>_clean" column on the current `df`.
text_columns = [
    'original_abstract',
    'allam_generated_abstract',
    'jais_generated_abstract',
    'llama_generated_abstract',
    'openai_generated_abstract'
]
for col in text_columns:
    df[f"{col}_clean"] = df[col].apply(preprocess_text)

print(" Preprocessing complete! Here are the new columns:")
print(df.columns)
# Kept as the last expression so the notebook renders the preview
df.head(2)



# Task 2.2: Exploratory Data Analysis (EDA)



In [None]:

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
import numpy as np

## Text Length Distribution Plot



In [None]:
# One histogram of text lengths per explored split.
# The loop variables are named so they do not overwrite the notebook-level `df`.
for explored_name, explored_df in dfs.items():
    # Guard on the column that is actually plotted, so a frame without it is
    # skipped instead of raising KeyError.
    if "text_length" not in explored_df.columns:
        continue

    plt.figure(figsize=(7, 4))
    explored_df["text_length"].hist(bins=40)
    plt.title(f"Text Length Distribution — {explored_name}")
    plt.xlabel("Length (characters)")
    plt.ylabel("Frequency")
    plt.show()

### Text Length per Label

In [None]:
# Choose a split
split_name = "by_polishing"
df = dataset[split_name].to_pandas()

# The first column whose name contains "abstract" is analysed;
# the label column is optional.
abstract_columns = [name for name in df.columns if "abstract" in name.lower()]
text_col = abstract_columns[0]
label_col = "label" if "label" in df.columns else None

# Character length of every text
df["text_length"] = df[text_col].astype(str).map(len)

# Overall summary
print("Overall text length statistics:")
print(df["text_length"].describe())

# Per-label summary, only when a label column exists
if label_col is not None:
    print("\nText length stats per label:")
    lengths_by_label = df.groupby(label_col)["text_length"]
    print(lengths_by_label.describe())

In [None]:
# Histogram of text length over the whole split
plt.figure(figsize=(7, 4))
df["text_length"].hist(bins=50, color="skyblue")
plt.title(f"Text Length Distribution — {split_name}")
plt.xlabel("Characters")
plt.ylabel("Frequency")
plt.show()

# Overlaid histograms, one per label value, when a label column exists
if label_col is not None:
    plt.figure(figsize=(7, 4))
    for lbl in df[label_col].unique():
        lengths_for_label = df.loc[df[label_col] == lbl, "text_length"]
        plt.hist(lengths_for_label, bins=50, alpha=0.6, label=f"Label {lbl}")
    plt.title(f"Text Length Distribution by Label — {split_name}")
    plt.xlabel("Characters")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

### Word Count per Text

In [None]:
# Number of whitespace-separated tokens in every text
df["word_count"] = [len(text.split()) for text in df[text_col].astype(str)]

print("Word count statistics overall:")
print(df["word_count"].describe())

if label_col is not None:
    print("\nWord count stats per label:")
    word_counts_by_label = df.groupby(label_col)["word_count"]
    print(word_counts_by_label.describe())

In [None]:
from collections import Counter

# Overall: count whitespace-separated tokens across every text of the column
overall_counts = Counter(
    word for text in df[text_col].astype(str) for word in text.split()
)
most_common = overall_counts.most_common(20)
print("\nMost common words overall:")
print(most_common)

# Per label: same count restricted to the rows of each label value
if label_col is not None:
    for lbl in df[label_col].unique():
        label_texts = df.loc[df[label_col] == lbl, text_col].astype(str)
        label_counts = Counter(
            word for text in label_texts for word in text.split()
        )
        most_common_lbl = label_counts.most_common(15)
        print(f"\nMost common words for label {lbl}:")
        print(most_common_lbl)

### Top 20 Most Frequent Words

In [None]:
from collections import Counter

# Bar chart of the 20 most frequent whitespace-separated tokens, per split.
# NOTE(review): Arabic tick labels may need shaping/bidi handling to render
# correctly in matplotlib — verify the output visually.
for split_name in splits:
    df = dataset[split_name].to_pandas()

    # The first column whose name contains "abstract" is analysed; skip the
    # split instead of raising IndexError when there is none.
    abstract_columns = [c for c in df.columns if "abstract" in c.lower()]
    if not abstract_columns:
        print(f"Skipping {split_name}: no abstract column found.")
        continue
    text_col = abstract_columns[0]

    all_words = " ".join(df[text_col].astype(str)).split()
    most_common = Counter(all_words).most_common(20)

    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError for a split without any words.
    if not most_common:
        print(f"Skipping {split_name}: no words to plot.")
        continue

    words, counts = zip(*most_common)

    plt.figure(figsize=(10,5))
    plt.bar(words, counts, color="skyblue")
    plt.xticks(rotation=45, ha="right")
    plt.title(f"Top 20 Most Frequent Words — {split_name}")
    plt.show()