In [8]:
import os
import numpy as np
import h5py
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [9]:



# --- Paths ---------------------------------------------------------------
DATA_DIR = 'dataset'  # Replace with your dataset path (one sub-folder per species)
OUTPUT_FILE = 'bird_dataset_pytorch.h5'  # HDF5 file written by this notebook

# --- Preprocessing / split -----------------------------------------------
IMG_SIZE = (224, 224)  # Target image size; a common input size for CNNs
TEST_SIZE = 0.1        # Fraction of the samples held out as the test set

# --- HDF5 storage --------------------------------------------------------
COMPRESSION = 'gzip'   # Compression filter applied to every stored dataset
COMPRESSION_LEVEL = 7  # gzip level (higher = smaller file, slower write)

# Species list (as per the project proposal).
# The position of a name in this list is the integer class label it receives.
species = [
    'Ciconia_ciconia',
    'Columba_livia',
    'Streptopelia_decaocto',
    'Emberiza_calandra',
    'Carduelis_carduelis',
    'Serinus_serinus',
    'Delichon_urbicum',
    'Hirundo_rustica',
    'Passer_domesticus',
    'Sturnus_unicolor',
    'Turdus_merula',
]

# Transform pipelines.
#
# Both pipelines end with the same ToTensor + Normalize steps; only the
# training pipeline adds random augmentation (flip, rotation, colour jitter).
# The mean/std values match the per-channel ImageNet statistics commonly used
# with torchvision models -- presumably chosen for a pretrained backbone;
# confirm against the model actually used.

# Training: resize -> random augmentation -> tensor -> normalise
train_transforms = transforms.Compose(
    [
        transforms.Resize(size=IMG_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=20),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Evaluation: deterministic -- resize -> tensor -> normalise, no augmentation
test_transforms = transforms.Compose(
    [
        transforms.Resize(size=IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

def load_and_preprocess_images(data_dir, species_list, img_size):
    """Load every image under ``data_dir/<species>/`` as a resized RGB array.

    Pixels are kept as returned by ``np.array`` on the resized PIL image
    (no float conversion or normalisation) to keep the stored data small.

    Args:
        data_dir: Root directory containing one sub-directory per species.
        species_list: Species (sub-directory) names. The index of a name in
            this list is used as the integer label of its images.
        img_size: Target size passed to ``PIL.Image.Image.resize``
            (PIL interprets it as ``(width, height)``).

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays: ``images`` stacks one
        array per successfully loaded file, ``labels`` holds the matching
        species indices. Both are empty when nothing could be loaded.

    Raises:
        OSError: If a species directory cannot be listed (e.g. it does not
            exist). Failures on individual files are NOT raised: they are
            printed and the file is skipped.
    """
    images = []
    labels = []

    for idx, specie in enumerate(species_list):
        specie_dir = os.path.join(data_dir, specie)

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # sample order (and any seeded split built on it) reproducible.
        for img_name in sorted(os.listdir(specie_dir)):
            img_path = os.path.join(specie_dir, img_name)

            try:
                # The context manager closes the underlying file as soon as
                # the pixels have been copied out, instead of leaking one
                # open handle per image until garbage collection.
                with Image.open(img_path) as img:
                    rgb = img if img.mode == 'RGB' else img.convert('RGB')
                    pixels = np.array(rgb.resize(img_size))
                images.append(pixels)
                labels.append(idx)
            except Exception as e:
                # Deliberate best-effort: report unreadable / non-image
                # entries and carry on with the rest of the dataset.
                print(f"Error processing {img_path}: {e}")

    return np.array(images), np.array(labels)

# Load and preprocess images
print("Loading and preprocessing images...")
X, y = load_and_preprocess_images(DATA_DIR, species, IMG_SIZE)

# Fixed seed for the split: without it every "Restart & Run All" would write
# a different train/test partition into the HDF5 file.
SPLIT_SEED = 42

# Split into train and test sets.
# stratify=y keeps the per-species proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=SPLIT_SEED
)

# Save data to HDF5 file ('w' mode overwrites any existing file of that name)
print(f"Saving data to {OUTPUT_FILE}...")
with h5py.File(OUTPUT_FILE, 'w') as hf:
    # Save the four arrays as compressed datasets
    hf.create_dataset('X_train', data=X_train, compression=COMPRESSION, compression_opts=COMPRESSION_LEVEL)
    hf.create_dataset('y_train', data=y_train, compression=COMPRESSION, compression_opts=COMPRESSION_LEVEL)
    hf.create_dataset('X_test', data=X_test, compression=COMPRESSION, compression_opts=COMPRESSION_LEVEL)
    hf.create_dataset('y_test', data=y_test, compression=COMPRESSION, compression_opts=COMPRESSION_LEVEL)

    # Save metadata as file-level attributes: the species names (index in the
    # list == integer label stored in y_*) and the image size used for resizing
    hf.attrs['species'] = np.array(species, dtype=h5py.string_dtype())
    hf.attrs['image_size'] = IMG_SIZE

print("Process completed successfully!")
print(f"Data saved to {OUTPUT_FILE} with {COMPRESSION} compression level {COMPRESSION_LEVEL}")


Loading and preprocessing images...


Saving data to bird_dataset_pytorch.h5...


Process completed successfully!
Data saved to bird_dataset_pytorch.h5 with gzip compression level 7


In [None]:

def get_dataloaders(h5_file, batch_size=32):
    """Build the training and test ``DataLoader`` objects for an HDF5 dataset.

    NOTE(review): ``BirdDataset`` is not defined in the visible cells; it is
    assumed to accept ``(h5_file, train=..., transform=...)`` and to expose a
    ``species`` attribute -- confirm against its definition.

    Args:
        h5_file: Path of the HDF5 file, forwarded to ``BirdDataset``.
        batch_size: Batch size used by both loaders (default 32).

    Returns:
        ``(train_loader, test_loader, species)`` where ``species`` is the
        ``species`` attribute of the training dataset.
    """
    # Training data gets the augmenting pipeline, test data the plain one.
    train_set = BirdDataset(h5_file, train=True, transform=train_transforms)
    eval_set = BirdDataset(h5_file, train=False, transform=test_transforms)

    # Only the training loader shuffles; the test loader keeps dataset order.
    loaders = (
        DataLoader(train_set, batch_size=batch_size, shuffle=True),
        DataLoader(eval_set, batch_size=batch_size, shuffle=False),
    )

    return (*loaders, train_set.species)