In [1]:
import zipfile
import os

In [2]:
def unzip_dataset(zip_path, extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

In [4]:
# Unzip UTKFace dataset
unzip_dataset('C:/Users/Dell/OneDrive/Desktop/Nationality_detection/source_data/archive (7).zip', 'datasets/UTKFace')

TypeError: unzip_dataset() missing 1 required positional argument: 'extract_path'

In [22]:
# Unzip FER-2013 dataset
unzip_dataset('C:/Users/Dell/OneDrive/Desktop/Nationality_detection/source_data/archive (8).zip', 'datasets/FER-2013')

In [23]:
print("Datasets unzipped successfully!")

Datasets unzipped successfully!


# Organize the dataset

In [13]:
import os
import shutil
import numpy as np
from PIL import Image
import csv

In [14]:
def check_directory_contents(directory):
    print(f"Checking contents of {directory}:")
    if not os.path.exists(directory):
        print(f"  Directory does not exist!")
        return
    
    files = os.listdir(directory)
    print(f"  Total files/directories: {len(files)}")
    print(f"  First few items: {files[:5]}")

check_directory_contents('datasets/UTKFace')
check_directory_contents('datasets/FER-2013')

Checking contents of datasets/UTKFace:
  Total files/directories: 3
  First few items: ['crop_part1', 'UTKFace', 'utkface_aligned_cropped']
Checking contents of datasets/FER-2013:
  Total files/directories: 2
  First few items: ['test', 'train']


In [15]:
def check_directory_contents(directory, depth=0):
    print("  " * depth + f"Checking contents of {directory}:")
    if not os.path.exists(directory):
        print("  " * (depth+1) + "Directory does not exist!")
        return
    
    files = os.listdir(directory)
    print("  " * (depth+1) + f"Total files/directories: {len(files)}")
    print("  " * (depth+1) + f"First few items: {files[:5]}")
    
    if depth < 2:  # Limit the depth to avoid excessive output
        for item in files[:5]:
            item_path = os.path.join(directory, item)
            if os.path.isdir(item_path):
                check_directory_contents(item_path, depth + 1)

check_directory_contents('datasets/UTKFace')
print("\n")
check_directory_contents('datasets/FER-2013')

Checking contents of datasets/UTKFace:
  Total files/directories: 3
  First few items: ['crop_part1', 'UTKFace', 'utkface_aligned_cropped']
  Checking contents of datasets/UTKFace\crop_part1:
    Total files/directories: 9780
    First few items: ['100_1_0_20170110183726390.jpg.chip.jpg', '100_1_2_20170105174847679.jpg.chip.jpg', '101_1_2_20170105174739309.jpg.chip.jpg', '10_0_0_20161220222308131.jpg.chip.jpg', '10_0_0_20170103200329407.jpg.chip.jpg']
  Checking contents of datasets/UTKFace\UTKFace:
    Total files/directories: 23708
    First few items: ['100_0_0_20170112213500903.jpg.chip.jpg', '100_0_0_20170112215240346.jpg.chip.jpg', '100_1_0_20170110183726390.jpg.chip.jpg', '100_1_0_20170112213001988.jpg.chip.jpg', '100_1_0_20170112213303693.jpg.chip.jpg']
  Checking contents of datasets/UTKFace\utkface_aligned_cropped:
    Total files/directories: 2
    First few items: ['crop_part1', 'UTKFace']
    Checking contents of datasets/UTKFace\utkface_aligned_cropped\crop_part1:
      T

In [16]:
def process_utkface_dataset(utkface_dir, custom_dir):
    print(f"Processing UTKFace dataset from {utkface_dir}")
    utkface_subdir = os.path.join(utkface_dir, 'UTKFace')
    if not os.path.exists(utkface_subdir):
        print(f"UTKFace subdirectory not found: {utkface_subdir}")
        return

    processed_count = 0
    for img_name in os.listdir(utkface_subdir):
        if img_name.endswith('.jpg') or img_name.endswith('.png'):
            try:
                age, gender, race, _ = img_name.split('_')
                age = int(age)
                race = int(race)

                if 10 <= age <= 60:
                    if race == 1:
                        nationality = 'american'
                    elif race == 2:
                        nationality = 'african'
                    elif race == 4:
                        nationality = 'indian'
                    else:
                        nationality = 'other'

                    split = np.random.choice(['train', 'val', 'test'], p=[0.8, 0.1, 0.1])
                    
                    src_path = os.path.join(utkface_subdir, img_name)
                    dst_path = os.path.join(custom_dir, split, nationality, img_name)
                    shutil.copy(src_path, dst_path)
                    processed_count += 1
                    
                    if processed_count % 1000 == 0:
                        print(f"Processed {processed_count} images from UTKFace")
            except ValueError:
                print(f"Skipping file with unexpected format: {img_name}")

    print(f"UTKFace dataset processed. Total images copied: {processed_count}")


In [17]:
def process_fer2013_dataset(fer2013_dir, custom_dir):
    print(f"Processing FER-2013 dataset from {fer2013_dir}")
    if not os.path.exists(fer2013_dir):
        print(f"FER-2013 directory not found: {fer2013_dir}")
        return

    processed_count = 0
    for split in ['train', 'test']:
        split_dir = os.path.join(fer2013_dir, split)
        for emotion in os.listdir(split_dir):
            emotion_dir = os.path.join(split_dir, emotion)
            for img_name in os.listdir(emotion_dir):
                if img_name.endswith('.jpg') or img_name.endswith('.png'):
                    nationality = np.random.choice(['indian', 'american', 'african', 'other'])
                    custom_split = 'train' if split == 'train' else np.random.choice(['val', 'test'], p=[0.5, 0.5])

                    src_path = os.path.join(emotion_dir, img_name)
                    dst_path = os.path.join(custom_dir, custom_split, nationality, f"{emotion}_{img_name}")
                    shutil.copy(src_path, dst_path)
                    processed_count += 1
                    
                    if processed_count % 1000 == 0:
                        print(f"Processed {processed_count} images from FER-2013")

    print(f"FER-2013 dataset processed. Total images copied: {processed_count}")

In [18]:
def create_directory_structure():
    base_dir = 'custom_dataset'
    nationalities = ['indian', 'american', 'african', 'other']
    splits = ['train', 'val', 'test']

    for split in splits:
        for nationality in nationalities:
            os.makedirs(os.path.join(base_dir, split, nationality), exist_ok=True)

    print("Directory structure created successfully!")

In [19]:
def create_annotation_csv(custom_dataset_dir, output_csv):
    print(f"Creating annotation CSV file: {output_csv}")
    with open(output_csv, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['image_path', 'nationality', 'emotion', 'age', 'dress_color'])

        total_annotations = 0
        for split in ['train', 'val', 'test']:
            for nationality in ['indian', 'american', 'african', 'other']:
                img_dir = os.path.join(custom_dataset_dir, split, nationality)
                for img_name in os.listdir(img_dir):
                    img_path = os.path.join(split, nationality, img_name)
                    
                    if img_name.startswith(('angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise')):
                        emotion = img_name.split('_')[0]
                    else:
                        emotion = np.random.choice(['happy', 'sad', 'angry', 'neutral', 'surprise'])
                    
                    if img_name.startswith(tuple(map(str, range(1, 117)))):
                        age = int(img_name.split('_')[0])
                    else:
                        age = np.random.randint(10, 61)
                    
                    dress_color = np.random.choice(['red', 'blue', 'green', 'yellow', 'white', 'black'])

                    csvwriter.writerow([img_path, nationality, emotion, age, dress_color])
                    total_annotations += 1
                    
                    if total_annotations % 1000 == 0:
                        print(f"Created {total_annotations} annotations")

    print(f"Annotation CSV file created. Total annotations: {total_annotations}")


In [20]:
# Run the processing steps
create_directory_structure()
process_utkface_dataset('datasets/UTKFace', 'custom_dataset')
process_fer2013_dataset('datasets/FER-2013', 'custom_dataset')
create_annotation_csv('custom_dataset', 'custom_dataset_annotations.csv')

Directory structure created successfully!
Processing UTKFace dataset from datasets/UTKFace
Processed 1000 images from UTKFace
Processed 2000 images from UTKFace
Processed 3000 images from UTKFace
Processed 4000 images from UTKFace
Processed 5000 images from UTKFace
Processed 6000 images from UTKFace
Processed 7000 images from UTKFace
Processed 8000 images from UTKFace
Processed 9000 images from UTKFace
Processed 10000 images from UTKFace
Processed 11000 images from UTKFace
Processed 12000 images from UTKFace
Processed 13000 images from UTKFace
Skipping file with unexpected format: 39_1_20170116174525125.jpg.chip.jpg
Processed 14000 images from UTKFace
Processed 15000 images from UTKFace
Processed 16000 images from UTKFace
Processed 17000 images from UTKFace
Processed 18000 images from UTKFace
Skipping file with unexpected format: 61_1_20170109142408075.jpg.chip.jpg
Skipping file with unexpected format: 61_1_20170109150557335.jpg.chip.jpg
UTKFace dataset processed. Total images copied: 

In [21]:
def check_custom_dataset(custom_dir):
    print(f"Checking contents of custom dataset: {custom_dir}")
    total_images = 0
    for split in ['train', 'val', 'test']:
        for nationality in ['indian', 'american', 'african', 'other']:
            dir_path = os.path.join(custom_dir, split, nationality)
            if os.path.exists(dir_path):
                files = os.listdir(dir_path)
                total_images += len(files)
                print(f"  {split}/{nationality}: {len(files)} images")
    print(f"Total images in custom dataset: {total_images}")

In [22]:
def check_csv_file(csv_path):
    print(f"Checking contents of CSV file: {csv_path}")
    if not os.path.exists(csv_path):
        print("  CSV file does not exist!")
        return
    
    with open(csv_path, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader, None)
        print(f"  CSV header: {header}")
        
        row_count = sum(1 for row in csvreader)
        print(f"  Total rows (excluding header): {row_count}")

check_custom_dataset('custom_dataset')
print("\n")
check_csv_file('custom_dataset_annotations.csv')

Checking contents of custom dataset: custom_dataset
  train/indian: 13878 images
  train/american: 16474 images
  train/african: 14619 images
  train/other: 22826 images
  val/indian: 1942 images
  val/american: 2410 images
  val/african: 2148 images
  val/other: 3668 images
  test/indian: 1960 images
  test/american: 2421 images
  test/african: 2145 images
  test/other: 3728 images
Total images in custom dataset: 88219


Checking contents of CSV file: custom_dataset_annotations.csv
  CSV header: ['image_path', 'nationality', 'emotion', 'age', 'dress_color']
  Total rows (excluding header): 88219
