In [64]:
import os
import io
import json
import time
import platform
import requests
import h5py
import psutil
import imagehash
import numpy as np
import pandas as pd

from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedGroupKFold


import albumentations as A
from albumentations.pytorch import ToTensorV2
from concurrent.futures import ThreadPoolExecutor

In [65]:
# --- Paths --------------------------------------------------------------------
# Root folder of the input images; expected layout is species_name/*.jpg.
DATA_DIR = "full_image_dataset"
# Name of the folder that receives the augmented images.
AUGMENTED_DIR = "augmented_dataset"

# --- Cleaning thresholds ------------------------------------------------------
# Minimum resolution in pixels (compared against the shorter image side).
IMG_SIZE_THRESHOLD = 200
# Duplicate threshold: phash distances below this value count as duplicates.
HASH_THRESHOLD = 8

# Species name -> numeric key. Only the names (and their order, which defines
# the label index) are used by the code in this notebook section.
# NOTE(review): the numbers look like external taxon identifiers -- confirm.
species_keys = {
    "Carduelis carduelis": 2494686,
    "Ciconia ciconia": 2481912,
    "Columba livia": 2495414,
    "Delichon urbicum": 2489214,
    "Emberiza calandra": 7634625,
    "Hirundo rustica": 7192162,
    "Passer domesticus": 5231190,
    "Serinus serinus": 2494200,
    "Streptopelia decaocto": 2495696,
    "Sturnus unicolor": 2489104,
    "Turdus merula": 6171845,
}

# Dataset-building options. The whole dict is also written to the run metadata.
CONFIG = {
    # Target size every image is resized to before being stored.
    'IMG_SIZE': (224, 224),
    # Fraction of samples held out as the test set.
    'TEST_SIZE': 0.15,
    # NOTE(review): TRAIN_SIZE / VAL_SIZE / SAVE_AS_JPEG are not read by the
    # code visible in this notebook section -- confirm they are still needed.
    'TRAIN_SIZE': 0.7,
    'VAL_SIZE': 0.15,
    # Number of cross-validation folds.
    'N_SPLITS': 5,
    # HDF5 dataset compression filter and level.
    'COMPRESSION': 'gzip',
    'COMPRESSION_LEVEL': 6,
    'SAVE_AS_JPEG': True,
    # JPEG quality used when saving augmented images.
    'JPEG_QUALITY': 80,
    # Augmentation pipeline: one entry per transform, applied in list order.
    'AUGMENTATION': {
        'train': [
            {
                'name': 'RandomResizedCrop',
                'size': (224, 224),
                'scale': (0.8, 1.0),
            },
            {
                'name': 'HorizontalFlip',
                'p': 0.5,
            },
            {
                'name': 'ShiftScaleRotate',
                'shift_limit': 0.05,
                'scale_limit': 0.1,
                'rotate_limit': 20,
                'p': 0.7,
            },
            {
                'name': 'ColorJitter',
                'brightness': 0.1,
                'contrast': 0.1,
                'saturation': 0.1,
                'hue': 0.05,
                'p': 0.8,
            },
            {
                'name': 'CoarseDropout',
                'max_holes': 1,
                'max_height': 48,
                'max_width': 48,
                'p': 0.4,
            },
        ],
    },
}

In [66]:
def getSystemInfo():
    """Describe the machine the notebook is running on.

    Returns:
        dict: current timestamp (ISO format), OS name and release, processor
        string, physical CPU core count, total and available RAM in GiB
        (rounded to two decimals) and the Python version.
    """
    bytes_per_gib = 1024 ** 3
    memory = psutil.virtual_memory()

    info = {}
    info["timestamp"] = datetime.now().isoformat()
    info["os"] = platform.system()
    info["os_version"] = platform.release()
    info["cpu"] = platform.processor()
    # logical=False asks psutil for physical cores only.
    info["cpu_cores"] = psutil.cpu_count(logical=False)
    info["ram_total_gb"] = round(memory.total / bytes_per_gib, 2)
    info["ram_available_gb"] = round(memory.available / bytes_per_gib, 2)
    info["python_version"] = platform.python_version()
    return info

def initLogging(output_dir):
    """Create a fresh run-metadata JSON file and return its path.

    The file is named ``dataset_prep_<YYYYMMDD>.json`` and holds the current
    CONFIG, a system snapshot and empty sections for each pipeline stage.
    An existing file with the same name is overwritten.

    Args:
        output_dir: Directory that receives the metadata file; created if it
            does not exist. An empty string means the current directory.

    Returns:
        str: Path of the metadata file that was written.
    """
    metadata = {
        "config": CONFIG,
        "system": getSystemInfo(),
        "download": {},
        "cleaning": {},
        "augmentation": {},
        "dataset_stats": {},
        "processing_times": {}
    }
    # os.makedirs('') raises, and '' is what os.path.dirname() yields for a
    # bare file name, so only create the directory when one was given.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Build the date tag outside the f-string: reusing the enclosing quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    date_tag = datetime.now().strftime('%Y%m%d')
    metadata_path = os.path.join(output_dir, f"dataset_prep_{date_tag}.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    return metadata_path

def _mergeNested(target, updates):
    """Recursively merge ``updates`` into ``target`` in place and return ``target``."""
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(target.get(key), dict):
            _mergeNested(target[key], value)
        else:
            target[key] = value
    return target


def updateLogging(metadata_path, updates):
    """Merge ``updates`` into the run-metadata JSON file.

    Nested dictionaries are merged key by key rather than replaced, so
    per-species entries such as ``{"augmentation": {species: stats}}``
    accumulate across calls instead of overwriting each other.

    Args:
        metadata_path: Path of the metadata JSON file. If it does not exist,
            a fresh file is created with initLogging() in the same directory
            and the updates are applied to that file.
        updates: Dictionary of (possibly nested) values to merge in.

    Returns:
        str: Path of the metadata file that was written.
    """
    if not os.path.exists(metadata_path):
        # initLogging chooses the file name itself, so continue with the path
        # it reports instead of dropping the updates.
        metadata_path = initLogging(os.path.dirname(metadata_path))

    with open(metadata_path, 'r') as f:
        metadata = json.load(f)

    _mergeNested(metadata, updates)
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    return metadata_path

In [67]:
def getAugmentation():
    """Build the training augmentation pipeline described by CONFIG.

    Every entry of ``CONFIG['AUGMENTATION']['train']`` names an albumentations
    transform (``'name'``) and carries its keyword arguments; the transforms
    are instantiated in list order. Looking transforms up by name instead of
    by list position keeps the pipeline correct when entries are reordered,
    added or removed.

    Returns:
        The ``A.Compose`` pipeline wrapping the configured transforms.

    Raises:
        ValueError: If an entry names a transform albumentations does not have.
    """
    # NOTE(review): the recorded notebook output shows warnings raised at the
    # A.CoarseDropout(...) call -- confirm that the keyword names in CONFIG
    # match the installed albumentations version.
    transforms = []
    for spec in CONFIG['AUGMENTATION']['train']:
        transform_cls = getattr(A, spec['name'], None)
        if transform_cls is None:
            raise ValueError(f"Unknown albumentations transform in CONFIG: {spec['name']!r}")
        # Everything except the name is passed through as keyword arguments.
        params = {key: value for key, value in spec.items() if key != 'name'}
        transforms.append(transform_cls(**params))
    return A.Compose(transforms)

def processImage(img_path, output_dir, transform, save_augmented=True):
    """Augment a single image and optionally save the result as a JPEG.

    Args:
        img_path: Path of the source image.
        output_dir: Directory that receives ``<stem>_aug.jpg``.
        transform: Callable invoked as ``transform(image=array)`` that returns
            a mapping whose ``'image'`` entry is the augmented array.
        save_augmented: When False the image is only run through the
            transform and nothing is written.

    Returns:
        bool: True when the image was processed (and saved, if requested),
        False when any step raised; the error is printed, not re-raised.
    """
    try:
        # The context manager closes the file handle even if decoding fails.
        with Image.open(img_path) as img:
            img_np = np.array(img.convert("RGB"))

        # Apply augmentation
        augmented = transform(image=img_np)['image']

        if save_augmented:
            aug_name = f"{Path(img_path).stem}_aug.jpg"
            aug_path = os.path.join(output_dir, aug_name)
            Image.fromarray(augmented).save(aug_path, quality=CONFIG['JPEG_QUALITY'], optimize=True)
        return True
    except Exception as e:
        print(f"Error processing {img_path}: {e}")
        return False


def transformImagesFromDirectory(species_name, data_dir, metadata_path=None, save_augmented=True, max_workers=4):
    """Augment every image of one species and record the run statistics.

    Source images are read from ``<data_dir>/<species_name with underscores>``.
    Augmented copies are written to ``<data_dir>/<AUGMENTED_DIR>/<species>``
    when ``save_augmented`` is True.

    Args:
        species_name: Species name; spaces are replaced by underscores to
            obtain the folder name.
        data_dir: Root directory holding one folder per species.
        metadata_path: Run-metadata JSON file; a new one is created in
            ``data_dir`` when omitted.
        save_augmented: Whether the augmented images are written to disk.
        max_workers: Number of worker threads used to process the images.

    Returns:
        int: Number of images processed successfully (0 when the species
        folder does not exist).
    """
    start_time = time.time()
    species_folder = species_name.replace(" ", "_")
    species_dir = os.path.join(data_dir, species_folder)
    if metadata_path is None:
        metadata_path = initLogging(data_dir)

    # A missing species folder is skipped instead of crashing in os.listdir,
    # the same way cleanData and createDataset skip missing folders.
    if not os.path.isdir(species_dir):
        print(f"No image directory for {species_name} at {species_dir}; skipping")
        return 0

    print(f"\nApplying transformations to images for: {species_name}")
    stats = {
        'species': species_name,
        'original_count': 0,
        'augmented_saved': 0,
        'start_time': datetime.now().strftime('%Y%m%d_%H%M%S'),
    }

    # Create output directory
    if save_augmented:
        output_dir = os.path.join(data_dir, AUGMENTED_DIR, species_folder)
        os.makedirs(output_dir, exist_ok=True)
    else:
        output_dir = species_dir

    # Get augmentation pipeline
    transform = getAugmentation()

    # Every regular file except JSON side-cars is treated as an image; sorted
    # so the processing order does not depend on the filesystem.
    image_paths = [
        os.path.join(species_dir, name)
        for name in sorted(os.listdir(species_dir))
        if os.path.isfile(os.path.join(species_dir, name)) and not name.endswith(".json")
    ]
    stats['original_count'] = len(image_paths)

    # Process images in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(
            executor.map(
                lambda p: processImage(p, output_dir, transform, save_augmented),
                image_paths
            ),
            total=len(image_paths),
            desc=f"Augmenting {species_name}"
        ))

    # processImage returns True/False, so the sum is the success count.
    stats['augmented_saved'] = sum(results)
    stats.update({
        'end_time': datetime.now().isoformat(),
        'time_seconds': time.time() - start_time
    })

    updateLogging(metadata_path, {"augmentation": {species_name: stats}})
    print(f"Transformations completed for {species_name}. Augmented saved: {stats['augmented_saved']}")
    return stats['augmented_saved']


In [68]:
def isValidImage(path):
    """Check that an image can be opened and is large enough.

    Args:
        path: Path of the image file.

    Returns:
        bool: True when the file opens, converts to RGB and its shorter side
        is at least IMG_SIZE_THRESHOLD pixels; False otherwise, including when
        opening or converting raises (the error is printed).
    """
    try:
        # The context manager closes the file handle even when conversion
        # fails; the RGB conversion is kept so undecodable files still end up
        # in the except branch.
        with Image.open(path) as img:
            return min(img.convert("RGB").size) >= IMG_SIZE_THRESHOLD
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return False

def getPhash(path):
    """Compute the perceptual hash of an image.

    Args:
        path: Path of the image file.

    Returns:
        The value returned by ``imagehash.phash`` for the RGB-converted image,
        or None when the file cannot be opened or hashed (the error is
        printed).
    """
    try:
        # The context manager closes the file handle even when hashing fails.
        with Image.open(path) as img:
            return imagehash.phash(img.convert("RGB"))
    except Exception as e:
        print(f"Error generating hash for {path}: {e}")
        return None

def cleanData(species_name, metadata_path=None):
    """Delete invalid and near-duplicate images from a species' augmented folder.

    Works on ``<DATA_DIR>/<AUGMENTED_DIR>/<species_name with underscores>``.
    A file is deleted when isValidImage() rejects it, when getPhash() returns
    None, or when its hash is closer than HASH_THRESHOLD to the hash of a
    file that was already kept.

    Args:
        species_name: Species name; spaces are replaced by underscores to
            obtain the folder name.
        metadata_path: Run-metadata JSON file; a new one is created in
            DATA_DIR when omitted.

    Returns:
        int: Number of files removed (0 when the folder does not exist).
    """
    start_time = time.time()
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)

    species_path = os.path.join(DATA_DIR, AUGMENTED_DIR, species_name.replace(" ", "_"))
    if not os.path.exists(species_path):
        return 0

    # Regular files only ("*.*" also matches directories, which os.remove
    # cannot delete), in sorted order so that the copy kept from a group of
    # duplicates does not depend on filesystem listing order.
    image_paths = sorted(p for p in Path(species_path).glob("*.*") if p.is_file())

    # Validate and hash the images in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(tqdm(
            executor.map(
                lambda p: (p, isValidImage(p), getPhash(p)),
                image_paths
            ),
            total=len(image_paths),
            desc=f"Cleaning {species_name}"
        ))

    kept_hashes = []
    total_removed = 0
    for img_path, is_valid, phash in results:
        unusable = not is_valid or phash is None
        # Each candidate is compared against every hash kept so far.
        if unusable or any(phash - existing < HASH_THRESHOLD for existing in kept_hashes):
            os.remove(img_path)
            total_removed += 1
        else:
            kept_hashes.append(phash)

    stats = {
        'species': species_name,
        'total_removed': total_removed,
        'remaining_images': len(kept_hashes),
        'time_seconds': time.time() - start_time,
        'timestamp': datetime.now().isoformat()
    }

    updateLogging(metadata_path, {"cleaning": {species_name: stats}})
    print(f"Finished cleaning {species_name}. Removed {total_removed} images")
    return total_removed


In [69]:
def createDataset(metadata_path=None):
    """Build the HDF5 dataset and its CSV index from the augmented images.

    Images are read from ``<DATA_DIR>/<AUGMENTED_DIR>/<species>`` for every
    species in ``species_keys`` (missing folders are skipped), converted to
    RGB and resized to ``CONFIG['IMG_SIZE']``. The samples are split into a
    stratified train/test partition, and the training part is further divided
    into ``CONFIG['N_SPLITS']`` stratified cross-validation folds.

    Outputs, all written into DATA_DIR:
        * ``dataset_<YYYYMMDD>.h5`` with group ``test`` (``X_test``,
          ``y_test``) and groups ``cross_validation/fold_<k>`` (``X_train``,
          ``y_train``, ``X_val``, ``y_val``), plus descriptive attributes.
        * ``dataset_metadata_<YYYYMMDD>.csv`` with one row per image: species,
          label index, file path relative to DATA_DIR, ``split``
          (``train``/``test``) and ``fold`` (0-based index of the fold in
          which the row is used for validation, -1 for test rows; fold ``k``
          corresponds to HDF5 group ``fold_<k+1>``).

    Args:
        metadata_path: Run-metadata JSON file; a new one is created in
            DATA_DIR when omitted.

    Raises:
        ValueError: If no image could be loaded.
    """
    # Imported here because the notebook's import cell only provides
    # StratifiedGroupKFold. That splitter partitions by a `groups` array,
    # which this dataset does not have.
    # NOTE(review): called without `groups` it appears to yield near-empty
    # validation folds (or to raise, depending on the scikit-learn version)
    # -- confirm against the installed version.
    from sklearn.model_selection import StratifiedKFold

    start_time = time.time()
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)

    # One date tag shared by every output file name.
    timestamp = datetime.now().strftime("%Y%m%d")
    species_names = list(species_keys)

    # Collect all images, labels and the files they came from
    all_images = []
    all_labels = []
    all_paths = []
    species_counts = defaultdict(int)

    for species_idx, species_name in enumerate(species_names):
        species_dir = os.path.join(DATA_DIR, AUGMENTED_DIR, species_name.replace(" ", "_"))
        if not os.path.isdir(species_dir):
            continue

        # Sorted so the sample order (and therefore the seeded split) does
        # not depend on filesystem listing order.
        for img_name in sorted(os.listdir(species_dir)):
            img_path = os.path.join(species_dir, img_name)
            try:
                with Image.open(img_path) as img:
                    pixels = np.array(img.convert('RGB').resize(CONFIG['IMG_SIZE']))
            except Exception as e:
                # Unreadable files are skipped, but no longer silently.
                print(f"Skipping unreadable image {img_path}: {e}")
                continue
            all_images.append(pixels)
            all_labels.append(species_idx)
            all_paths.append(os.path.relpath(img_path, DATA_DIR))
            species_counts[species_name] += 1

    if not all_images:
        raise ValueError(f"No images found under {os.path.join(DATA_DIR, AUGMENTED_DIR)}")

    # Convert to numpy arrays
    X = np.array(all_images)
    y = np.array(all_labels)

    # Split row indices rather than the arrays so the CSV can record which
    # rows ended up in which split.
    sample_idx = np.arange(len(y))
    train_idx, test_idx = train_test_split(
        sample_idx, test_size=CONFIG['TEST_SIZE'], stratify=y, random_state=42
    )
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Compute the folds once so the CSV and the HDF5 file agree.
    skf = StratifiedKFold(n_splits=CONFIG['N_SPLITS'], shuffle=True, random_state=42)
    folds = list(skf.split(X_train, y_train))

    # Save CSV metadata
    csv_path = os.path.join(DATA_DIR, f"dataset_metadata_{timestamp}.csv")
    df = pd.DataFrame({
        'species': [species_names[i] for i in y],
        'label_idx': y,
        'filepath': all_paths,
    })
    df['split'] = 'train'
    df.loc[test_idx, 'split'] = 'test'
    df['fold'] = -1
    for fold_idx, (_, val_idx) in enumerate(folds):
        # val_idx indexes into the training subset; map back to dataset rows.
        df.loc[train_idx[val_idx], 'fold'] = fold_idx
    df.to_csv(csv_path, index=False)

    # Save HDF5 dataset
    h5_path = os.path.join(DATA_DIR, f"dataset_{timestamp}.h5")

    def _store(group, name, data):
        """Write one compressed dataset using the CONFIG compression settings."""
        group.create_dataset(name, data=data,
                             compression=CONFIG['COMPRESSION'],
                             compression_opts=CONFIG['COMPRESSION_LEVEL'])

    with h5py.File(h5_path, 'w') as hf:
        # Save test set
        test_group = hf.create_group('test')
        _store(test_group, 'X_test', X_test)
        _store(test_group, 'y_test', y_test)

        # Save cross-validation folds
        cv_group = hf.create_group('cross_validation')
        for fold, (fold_train_idx, fold_val_idx) in enumerate(folds):
            fold_group = cv_group.create_group(f'fold_{fold+1}')
            _store(fold_group, 'X_train', X_train[fold_train_idx])
            _store(fold_group, 'y_train', y_train[fold_train_idx])
            _store(fold_group, 'X_val', X_train[fold_val_idx])
            _store(fold_group, 'y_val', y_train[fold_val_idx])

        # Save metadata
        hf.attrs['species'] = json.dumps(species_names)
        hf.attrs['image_size'] = json.dumps(CONFIG['IMG_SIZE'])
        hf.attrs['augmentation'] = json.dumps(CONFIG['AUGMENTATION'])
        hf.attrs['creation_time'] = timestamp
        hf.attrs['csv_reference'] = csv_path

    # Update metadata
    dataset_stats = {
        'total_images': len(all_images),
        'species_counts': dict(species_counts),
        'h5_path': h5_path,
        'csv_path': csv_path,
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'processing_time_seconds': time.time() - start_time,
        'timestamp': timestamp
    }

    updateLogging(metadata_path, {"dataset_stats": dataset_stats})
    print(f"Dataset created with {len(all_images)} images")
    print(f"- HDF5: {h5_path}")
    print(f"- CSV: {csv_path}")


In [70]:
# Full preparation run: start a fresh metadata file, then augment and clean
# each species in turn, and finally assemble the dataset files.
print("Initiating dataset transformation...")
metadata_path = initLogging(DATA_DIR)

for species in species_keys:
    transformImagesFromDirectory(species, DATA_DIR, metadata_path=metadata_path)
    cleanData(species, metadata_path=metadata_path)

createDataset(metadata_path=metadata_path)
print("Tasks completed")


  original_init(self, **validated_kwargs)
  A.CoarseDropout(


Initiating dataset transformation...

Applying transformations to images for: Carduelis carduelis


Augmenting Carduelis carduelis: 100%|██████████| 600/600 [00:04<00:00, 125.87it/s]


Transformations completed for Carduelis carduelis. Augmented saved: 600


Cleaning Carduelis carduelis: 100%|██████████| 600/600 [00:02<00:00, 296.89it/s]


Finished cleaning Carduelis carduelis. Removed 0 images

Applying transformations to images for: Ciconia ciconia


Augmenting Ciconia ciconia: 100%|██████████| 600/600 [00:05<00:00, 110.97it/s]


Transformations completed for Ciconia ciconia. Augmented saved: 600


Cleaning Ciconia ciconia: 100%|██████████| 600/600 [00:01<00:00, 301.39it/s]


Finished cleaning Ciconia ciconia. Removed 0 images

Applying transformations to images for: Columba livia


Augmenting Columba livia: 100%|██████████| 600/600 [00:10<00:00, 57.04it/s] 


Transformations completed for Columba livia. Augmented saved: 600


Cleaning Columba livia: 100%|██████████| 600/600 [00:01<00:00, 357.04it/s] 


Finished cleaning Columba livia. Removed 0 images

Applying transformations to images for: Delichon urbicum


Augmenting Delichon urbicum: 100%|██████████| 600/600 [00:07<00:00, 79.65it/s] 


Transformations completed for Delichon urbicum. Augmented saved: 600


Cleaning Delichon urbicum: 100%|██████████| 600/600 [00:02<00:00, 291.21it/s]


Finished cleaning Delichon urbicum. Removed 0 images

Applying transformations to images for: Emberiza calandra


Augmenting Emberiza calandra: 100%|██████████| 600/600 [00:03<00:00, 154.25it/s]


Transformations completed for Emberiza calandra. Augmented saved: 600


Cleaning Emberiza calandra: 100%|██████████| 600/600 [00:01<00:00, 343.46it/s]


Finished cleaning Emberiza calandra. Removed 0 images

Applying transformations to images for: Hirundo rustica


Augmenting Hirundo rustica: 100%|██████████| 600/600 [00:05<00:00, 115.04it/s]


Transformations completed for Hirundo rustica. Augmented saved: 600


Cleaning Hirundo rustica: 100%|██████████| 600/600 [00:02<00:00, 293.00it/s]


Finished cleaning Hirundo rustica. Removed 0 images

Applying transformations to images for: Passer domesticus


Augmenting Passer domesticus: 100%|██████████| 600/600 [00:05<00:00, 107.02it/s]


Transformations completed for Passer domesticus. Augmented saved: 600


Cleaning Passer domesticus: 100%|██████████| 600/600 [00:01<00:00, 306.55it/s]


Finished cleaning Passer domesticus. Removed 0 images

Applying transformations to images for: Serinus serinus


Augmenting Serinus serinus: 100%|██████████| 600/600 [00:10<00:00, 55.46it/s] 


Transformations completed for Serinus serinus. Augmented saved: 600


Cleaning Serinus serinus: 100%|██████████| 600/600 [00:01<00:00, 312.30it/s]


Finished cleaning Serinus serinus. Removed 0 images

Applying transformations to images for: Streptopelia decaocto


Augmenting Streptopelia decaocto: 100%|██████████| 600/600 [00:10<00:00, 56.87it/s]


Transformations completed for Streptopelia decaocto. Augmented saved: 600


Cleaning Streptopelia decaocto: 100%|██████████| 600/600 [00:01<00:00, 343.43it/s] 


Finished cleaning Streptopelia decaocto. Removed 0 images

Applying transformations to images for: Sturnus unicolor


Augmenting Sturnus unicolor: 100%|██████████| 600/600 [00:04<00:00, 142.77it/s]


Transformations completed for Sturnus unicolor. Augmented saved: 600


Cleaning Sturnus unicolor: 100%|██████████| 600/600 [00:01<00:00, 307.21it/s]


Finished cleaning Sturnus unicolor. Removed 0 images

Applying transformations to images for: Turdus merula


Augmenting Turdus merula: 100%|██████████| 600/600 [00:07<00:00, 79.97it/s] 


Transformations completed for Turdus merula. Augmented saved: 600


Cleaning Turdus merula: 100%|██████████| 600/600 [00:01<00:00, 360.17it/s] 


Finished cleaning Turdus merula. Removed 2 images
Dataset created with 6598 images
- HDF5: full_image_dataset/dataset_20250519.h5
- CSV: full_image_dataset/dataset_metadata_20250519.csv
Tasks completed
