<a href="https://colab.research.google.com/github/mennasherif14/AI-Project/blob/main/Ai_preprocessing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install and authenticate Kaggle**

In [18]:
# Run once per session: make the Kaggle API token available WITHOUT storing it in the notebook.
# The token is taken from the environment when it is already set; otherwise it is requested
# interactively with hidden input, so the secret never ends up in the saved .ipynb file.
# Any token that was ever pasted into a shared copy of this notebook should be revoked on
# Kaggle and regenerated.
import os
from getpass import getpass

if not os.environ.get('KAGGLE_API_TOKEN'):
    os.environ['KAGGLE_API_TOKEN'] = getpass('Kaggle API token: ')

In [19]:
!pip install -q kaggle

In [20]:

import os
import json

KAGGLE_TOKEN = "KGAT_03a979fa3915a8c94a2c0fbec2b19355"   # ← change only if yours is different

kaggle_json = {
    "username": "YOUR_KAGGLE_USERNAME",   # ← replace with your actual Kaggle username (e.g. "menna123")
    "key": KAGGLE_TOKEN
}

!mkdir -p ~/.kaggle
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_json, f)

!chmod 600 ~/.kaggle/kaggle.json

# 3. Install kaggle if not already done
!pip install -q kaggle

# 4. Download CelebA (will work now)
!kaggle datasets download -d jessicali9530/celeba-dataset

# 5. Unzip
!unzip -q celeba-dataset.zip
!unzip -q img_align_celeba.zip -d celeba_faces

print("All done! CelebA is ready in 'celeba_faces' folder")

Dataset URL: https://www.kaggle.com/datasets/jessicali9530/celeba-dataset
License(s): other
Downloading celeba-dataset.zip to /content/AI-Project
100% 1.33G/1.33G [00:08<00:00, 246MB/s]
100% 1.33G/1.33G [00:08<00:00, 166MB/s]
unzip:  cannot find or open img_align_celeba.zip, img_align_celeba.zip.zip or img_align_celeba.zip.ZIP.
All done! CelebA is ready in 'celeba_faces' folder


In [21]:
# FINAL & 100% WORKING VERSION — RUN THIS ONLY
!rm -rf celeba_faces img_align_celeba   # clean start

# Unzip the main file
!unzip -q -o celeba-dataset.zip

# The images are already inside a folder called "img_align_celeba" — no second zip anymore!
!mv img_align_celeba celeba_faces   # just rename it to our folder name

# Check everything is perfect
import os
print("Total images:", len(os.listdir('celeba_faces')))
print("Sample image path:", os.listdir('celeba_faces')[0])
print("Everything is ready!")

Total images: 1
Sample image path: img_align_celeba
Everything is ready!


In [22]:
# FINAL FIX — RUN THIS ONLY
!rm -rf celeba_faces                  # clean
!unzip -q -o celeba-dataset.zip       # extract everything again
!mv img_align_celeba/img_align_celeba celeba_faces   # move the real images up

# Confirm — this time it will show 202599
import os
print("Total celebrity face images:", len(os.listdir('celeba_faces')))
print("Example:", os.listdir('celeba_faces')[:5])

Total celebrity face images: 202599
Example: ['075296.jpg', '034812.jpg', '152611.jpg', '149536.jpg', '116432.jpg']


# **Load identity labels, pick the top 200 celebrities, and build train/val/test splits**

In [25]:
import pandas as pd
import os
from tqdm import tqdm
import shutil
import numpy as np
import urllib.request

# Step 1: Download identity_CelebA.txt (image file name -> celebrity id) from a GitHub mirror.
# The file is fetched only when it is not already present, and it is written under a
# temporary name first so an interrupted download cannot leave a truncated
# identity_CelebA.txt behind for the next run to parse.
url = 'https://raw.githubusercontent.com/Golbstein/keras-face-recognition/master/identity_CelebA.txt'
identity_path = 'identity_CelebA.txt'
if not os.path.exists(identity_path):
    print("Downloading identity_CelebA.txt... (3 seconds)")
    partial_path = identity_path + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, identity_path)   # atomic rename once the download completed
print("Downloaded! ")

# Step 2: Read the space-separated identity file and normalise both columns
# (file names stripped of surrounding whitespace, celebrity ids cast to int).
identity = (
    pd.read_csv('identity_CelebA.txt', sep=" ", header=None, names=["image_id", "celebrity_id"])
    .assign(
        image_id=lambda frame: frame['image_id'].str.strip(),
        celebrity_id=lambda frame: frame['celebrity_id'].astype(int),
    )
)

# Step 3: Keep only the N_CELEBS celebrities that have the most images
N_CELEBS = 200
images_per_celeb = identity['celebrity_id'].value_counts()   # sorted by count, descending
top_celebs = images_per_celeb.index[:N_CELEBS]
is_top_celeb = identity['celebrity_id'].isin(top_celebs)
filtered = identity.loc[is_top_celeb].copy()

print(f"Selected {N_CELEBS} celebrities → {len(filtered)} images total")
print("Average images per celebrity:", len(filtered) // N_CELEBS)

# Step 4: Create the directory tree  <base_dir>/<split>/<celebrity_id>/
base_dir = 'celeba_dataset'
for split in ['train', 'val', 'test']:
    os.makedirs(f'{base_dir}/{split}', exist_ok=True)

for celeb_id in tqdm(top_celebs, desc="Creating folders"):
    for split in ['train', 'val', 'test']:
        os.makedirs(f'{base_dir}/{split}/{celeb_id}', exist_ok=True)


def _copy_images(image_ids, dest_dir):
    """Copy the given image files from celeba_faces/ into dest_dir.

    Args:
        image_ids: iterable of image file names (as listed in the identity file).
        dest_dir: existing directory that receives the copies.

    Returns:
        Number of image_ids whose source file was not found (those are skipped).
    """
    missing = 0
    for img in image_ids:
        src = f'celeba_faces/{img}'
        if os.path.exists(src):
            shutil.copy(src, f'{dest_dir}/{img}')
        else:
            missing += 1
    return missing


# Step 5: Copy images (80% train, 10% val, 10% test) — missing files are skipped and counted
np.random.seed(42)   # fixed seed so the shuffle, and therefore the split, is reproducible
n_missing = 0
for celeb_id in tqdm(top_celebs, desc="Copying images"):
    celeb_images = filtered[filtered['celebrity_id'] == celeb_id]['image_id'].tolist()
    np.random.shuffle(celeb_images)
    n = len(celeb_images)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)

    # One slice per split; the three slices are disjoint and together cover the whole list.
    images_by_split = {
        'train': celeb_images[:train_end],
        'val': celeb_images[train_end:val_end],
        'test': celeb_images[val_end:],
    }
    for split, split_images in images_by_split.items():
        n_missing += _copy_images(split_images, f'{base_dir}/{split}/{celeb_id}')

# Make skipped files visible instead of silently producing a smaller dataset.
if n_missing:
    print(f"Warning: {n_missing} listed images were not found in celeba_faces and were skipped")

print("All folders ready! ")

# Step 6: Build the Keras directory generators.
# Only the training generator applies augmentation; validation and test only rescale pixels.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Random transformations applied to training images.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest',
)

train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by all three directory iterators.
flow_options = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

train_generator = train_datagen.flow_from_directory(
    'celeba_dataset/train', shuffle=True, **flow_options
)

# Validation keeps flow_from_directory's default shuffle setting.
val_generator = val_datagen.flow_from_directory(
    'celeba_dataset/val', **flow_options
)

# Shuffling is disabled for the test split.
test_generator = test_datagen.flow_from_directory(
    'celeba_dataset/test', shuffle=False, **flow_options
)

# Step 7: Persist the class-index lookup for the GUI.
# class_indices maps folder name (celebrity id) -> class index; the saved file holds the
# inverse mapping, with the celebrity id stored as a string.
import json

class_indices = train_generator.class_indices
indices_to_class = {index: str(label) for label, index in class_indices.items()}

with open('class_indices.json', 'w') as f:
    json.dump(indices_to_class, f)

# Final summary of what the generators found on disk.
summary_lines = [
    "\n PREPROCESSING COMPLETE! ",
    f"Training images: {train_generator.samples}",
    f"Validation images: {val_generator.samples}",
    f"Test images: {test_generator.samples}",
    f"Total celebrities: {train_generator.num_classes}",
    "\nFiles are ready",
]
for line in summary_lines:
    print(line)


Downloading identity_CelebA.txt... (3 seconds)
Downloaded! 
Selected 200 celebrities → 6038 images total
Average images per celebrity: 30


Creating folders: 100%|██████████| 200/200 [00:00<00:00, 3148.65it/s]
Copying images: 100%|██████████| 200/200 [00:06<00:00, 31.56it/s]


All folders ready! 
Found 4821 images belonging to 200 classes.
Found 600 images belonging to 200 classes.
Found 617 images belonging to 200 classes.

 PREPROCESSING COMPLETE! 
Training images: 4821
Validation images: 600
Test images: 617
Total celebrities: 200

Files are ready
