In [0]:
import os
import numpy as np
import h5py
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

First, the list of chosen bird species is defined:

In [None]:
# Class names, one per sub-directory of the dataset.
# The position of a name in this list is used as its integer label.
species = [
    'Ciconia_ciconia',
    'Columba_livia',
    'Streptopelia_decaocto',
    'Emberiza_calandra',
    'Carduelis_carduelis',
    'Serinus_serinus',
    'Delichon_urbicum',
    'Hirundo_rustica',
    'Passer_domesticus',
    'Sturnus_unicolor',
    'Turdus_merula',
]

And some settings are defined for pre-processing the images.

In [None]:
# --- Input / output locations ---
# Root folder of the raw images; replace with your dataset path.
DATA_DIR = 'dataset'
# HDF5 file the preprocessed arrays are written to.
OUTPUT_FILE = 'bird_dataset_pytorch.h5'

# --- Image and split settings ---
# Every image is resized to this size, a standard input size for CNNs.
IMG_SIZE = (224, 224)
# Proportion of the data held out as the test set.
TEST_SIZE = 0.1

# --- HDF5 compression ---
# Compression type.
COMPRESSION = 'gzip'
# Compression level (1-9).
COMPRESSION_LEVEL = 7

Before the bird images can be fed to the models they must be transformed. This is done with torchvision's `transforms.Compose()`, which chains several transformations into a single pipeline. The training pipeline includes data augmentation, while the test pipeline applies only basic preprocessing. But what is data augmentation?

Data Augmentation is a technique used to expand a training dataset by creating modified versions of existing images through random but realistic transformations. It helps improve model generalization by exposing it to varied examples without collecting new data. Common transformations include flipping, rotating, scaling, changing brightness/contrast, adding noise, or cropping. These variations simulate different real-world scenarios, making the model more robust to changes in viewpoint, lighting, or orientation.

Data augmentation is applied only during training—validation and test data remain unmodified to reflect real-world performance. It is especially useful for small datasets, reducing overfitting and improving accuracy. 

Let's now break down each component and explain the hyperparameters:

- `transforms.Resize(IMG_SIZE)` – Resizes the image to a fixed size. This size is typically chosen based on the model architecture; in this case it is 224x224.

- `transforms.RandomHorizontalFlip()` – Randomly flips the image horizontally, with a default probability of 0.5.

- `transforms.RandomRotation(20)` – Rotates the image by a random angle of up to ±20 degrees.

- `transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1)` – Randomly adjusts brightness, contrast, and saturation by up to ±10%.

- `transforms.ToTensor()` – Converts the image to a PyTorch tensor (values scaled to [0, 1]).

- `transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])` – Normalizes each channel using the precomputed mean and standard deviation of ImageNet.




In [None]:
# Per-channel statistics used by the Normalize step of both pipelines
# (the precomputed ImageNet mean and std).
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps; both pipelines end with these."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
    ]


# Random augmentations, applied to training images only.
_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
]

# Training pipeline: resize -> augment -> tensor -> normalize.
train_transforms = transforms.Compose(
    [transforms.Resize(IMG_SIZE), *_augmentations, *_tensor_and_normalize()]
)

# Test pipeline: resize -> tensor -> normalize (no randomness).
test_transforms = transforms.Compose(
    [transforms.Resize(IMG_SIZE), *_tensor_and_normalize()]
)

In [11]:

# Build the image / label arrays from DATA_DIR/<species name>/<image file>,
# split them into train and test sets and store everything in one HDF5 file.
# The label of an image is the index of its species in `species`.

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

print("Loading and preprocessing images...")
images = []
labels = []

for idx, specie in enumerate(species):
    specie_dir = os.path.join(DATA_DIR, specie)

    # os.listdir returns entries in arbitrary order; sorting keeps the array
    # order (and therefore the seeded split) reproducible.
    for img_name in sorted(os.listdir(specie_dir)):
        img_path = os.path.join(specie_dir, img_name)

        try:
            # The context manager closes the file handle once the pixels
            # have been copied into the numpy array.
            with Image.open(img_path) as img:
                rgb = img if img.mode == 'RGB' else img.convert('RGB')
                pixels = np.array(rgb.resize(IMG_SIZE))  # Keep as uint8 [0,255]
        except Exception as e:
            # Best effort: report unreadable files and carry on with the rest.
            print(f"Error processing {img_path}: {e}")
            continue

        # Appended together so images and labels always stay aligned.
        images.append(pixels)
        labels.append(idx)

if not images:
    raise RuntimeError(f"No images could be loaded from {DATA_DIR!r}")

X = np.array(images)
y = np.array(labels)

# Split into train and test sets, keeping the class proportions in both
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=SPLIT_SEED
)

# Save data to HDF5 file
print(f"Saving data to {OUTPUT_FILE}...")
with h5py.File(OUTPUT_FILE, 'w') as hf:
    # Save every array with the same compression settings
    arrays = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    for dataset_name, array in arrays.items():
        hf.create_dataset(
            dataset_name,
            data=array,
            compression=COMPRESSION,
            compression_opts=COMPRESSION_LEVEL,
        )

    # Save metadata needed to interpret the arrays later
    hf.attrs['species'] = np.array(species, dtype=h5py.string_dtype())
    hf.attrs['image_size'] = IMG_SIZE

print("Process completed successfully!")
print(f"Data saved to {OUTPUT_FILE} with {COMPRESSION} compression level {COMPRESSION_LEVEL}")


Loading and preprocessing images...


Saving data to bird_dataset_pytorch.h5...


Process completed successfully!
Data saved to bird_dataset_pytorch.h5 with gzip compression level 7


In [None]:

# 1. Load the image arrays and integer labels from HDF5
with h5py.File('bird_dataset_pytorch.h5', 'r') as hf:
    X_train = hf['X_train'][:]
    y_train = hf['y_train'][:]
    X_test  = hf['X_test'][:]
    y_test  = hf['y_test'][:]

# 2. Hold out 10% of the training data for validation.
#    Stratify on y_train: it is the label array that matches X_train row by row.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

# 3. Labels as int64 tensors
y_train_tensor = torch.from_numpy(y_train).long()
y_val_tensor = torch.from_numpy(y_val).long()
y_test_tensor = torch.from_numpy(y_test).long()


class BirdDataset(Dataset):
    """Map-style dataset that applies a transform pipeline to stored image arrays.

    Args:
        images: array of images, indexed along the first axis. Each item is
            turned into a PIL image before the transform is applied.
            # assumes uint8 HxWx3 arrays as written by the preprocessing cell
        labels: 1-D tensor with one class index per image.
        transform: callable applied to the PIL image; it must return a tensor
            so that the DataLoader can batch the samples.
    """

    def __init__(self, images, labels, transform):
        if len(images) != len(labels):
            raise ValueError("images and labels must have the same length")
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # The transforms are applied lazily, so random augmentations differ
        # every time a sample is drawn.
        image = Image.fromarray(self.images[index])
        return self.transform(image), self.labels[index]


# 4. Augmentation only for training; validation and test use the plain pipeline
train_dataset = BirdDataset(X_train, y_train_tensor, train_transforms)
val_dataset = BirdDataset(X_val, y_val_tensor, test_transforms)
test_dataset = BirdDataset(X_test, y_test_tensor, test_transforms)

# 5. Only the training loader is shuffled
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:

# Example: skeleton of a training loop over the loaders built in the previous cell.
# Distinct loop-variable names are used so the `images` / `labels` lists of the
# preprocessing cell are not overwritten.
num_epochs = 1  # placeholder value for this skeleton

for epoch in range(num_epochs):
    for batch_images, batch_labels in train_loader:
        # Your training code here
        pass

    # Validation (gradients are not needed when only evaluating)
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            # Your evaluation code here
            pass

In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import models, transforms
import h5py
from PIL import Image
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix
import matplotlib.pyplot as plt
import time
import os

# Settings
# Paths: input HDF5 file and the folder that receives the trained models.
H5_PATH = 'bird_dataset_pytorch.h5'
MODEL_SAVE_DIR = 'saved_models'

# Hyperparameters.
NUM_CLASSES = 11  # 11 bird species
NUM_EPOCHS = 30
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Create the output folder up front; it is fine if it already exists.
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)


# Load a pre-trained model
def get_model(model_name, num_classes):
    """Build a pre-trained network whose final linear layer has `num_classes` outputs.

    Args:
        model_name: 'resnet18' or 'efficientnet'.
        num_classes: number of output units of the replacement classifier layer.

    Returns:
        The model, moved to DEVICE.

    Raises:
        ValueError: if `model_name` is not one of the supported names.
    """
    # Reject unknown names before any weights are loaded.
    if model_name not in ('resnet18', 'efficientnet'):
        raise ValueError("Modelo não suportado")

    if model_name == 'resnet18':
        net = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # The classification head of this model is the `fc` attribute.
        net.fc = nn.Linear(net.fc.in_features, num_classes)
    else:
        net = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
        # Here the linear head is the element at index 1 of `classifier`.
        old_head = net.classifier[1]
        net.classifier[1] = nn.Linear(old_head.in_features, num_classes)

    return net.to(DEVICE)

# Run a single pass over a data loader
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader`: a training pass if `optimizer` is given, else evaluation.

    Returns:
        (mean_loss, accuracy, predictions, targets), where the first two are
        Python floats averaged over `len(loader.dataset)` samples and the last
        two are lists of class indices.
    """
    training = optimizer is not None
    model.train(training)  # train(False) puts the model in evaluation mode

    total_loss = 0.0
    total_correct = 0
    predictions, targets = [], []

    # Gradients are only tracked during the training pass.
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # The loss is a batch mean; weight it by the batch size so the
            # epoch average is exact even when the last batch is smaller.
            total_loss += loss.item() * inputs.size(0)
            # .item() keeps the running totals as plain Python numbers, so the
            # metric history can be plotted even when training runs on a GPU.
            total_correct += torch.sum(preds == labels).item()
            predictions.extend(preds.cpu().tolist())
            targets.extend(labels.cpu().tolist())

    num_samples = len(loader.dataset)
    return total_loss / num_samples, total_correct / num_samples, predictions, targets


# Train a model
def train_model(model, criterion, optimizer, scheduler=None):
    """Train `model` for NUM_EPOCHS epochs and save the best checkpoint.

    Each epoch trains on `train_loader` and evaluates on `val_loader`; the
    test loader is deliberately not used here, so it stays untouched for a
    final evaluation. Whenever the weighted validation F1 improves, the
    state dict is saved to MODEL_SAVE_DIR as `best_<ModelClass>.pth`.
    After training, the loss and accuracy curves are saved to
    `training_metrics_<ModelClass>.png`, one file per model class so that
    training a second architecture does not overwrite the first plot.

    Args:
        model: network to train, already on DEVICE.
        criterion: loss called as `criterion(outputs, labels)`.
        optimizer: optimizer that updates the parameters of `model`.
        scheduler: optional scheduler; it is stepped once per epoch with the
            validation loss.

    Returns:
        The same `model` object with the weights of the last epoch (the best
        weights are the ones stored on disk).
    """
    model_name = model.__class__.__name__
    best_f1 = 0.0
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(NUM_EPOCHS):
        print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
        print('-' * 10)

        # Training phase
        epoch_loss, epoch_acc, _, _ = _run_epoch(model, train_loader, criterion, optimizer)
        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)

        # Validation phase
        val_epoch_loss, val_epoch_acc, all_preds, all_labels = _run_epoch(
            model, val_loader, criterion
        )
        val_f1 = f1_score(all_labels, all_preds, average='weighted')
        val_losses.append(val_epoch_loss)
        val_accs.append(val_epoch_acc)

        print(f'Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
        print(f'Val Loss: {val_epoch_loss:.4f} Acc: {val_epoch_acc:.4f} F1: {val_f1:.4f}')

        # Update the learning rate
        if scheduler is not None:
            scheduler.step(val_epoch_loss)

        # Save the best model so far
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), os.path.join(MODEL_SAVE_DIR, f'best_{model_name}.pth'))
            print(f'Novo melhor modelo salvo com F1: {best_f1:.4f}')

    # Plot the metric history
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.plot(train_losses, label='Train Loss')
    loss_ax.plot(val_losses, label='Val Loss')
    loss_ax.set(title=f'{model_name} loss', xlabel='Epoch', ylabel='Loss')
    loss_ax.legend()

    acc_ax.plot(train_accs, label='Train Acc')
    acc_ax.plot(val_accs, label='Val Acc')
    acc_ax.set(title=f'{model_name} accuracy', xlabel='Epoch', ylabel='Accuracy')
    acc_ax.legend()

    fig.savefig(os.path.join(MODEL_SAVE_DIR, f'training_metrics_{model_name}.png'))
    plt.close(fig)

    return model

# Train every architecture with the same recipe: cross-entropy loss, Adam, and
# a ReduceLROnPlateau scheduler with a patience of 3 epochs.
# Each entry is (name shown in the log, key understood by get_model).
MODELS_TO_TRAIN = (
    ("ResNet18", "resnet18"),
    ("EfficientNet", "efficientnet"),
)

trained_models = {}
for display_name, model_key in MODELS_TO_TRAIN:
    print(f"\nTreinando {display_name}...")
    net = get_model(model_key, NUM_CLASSES)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    # `verbose=True` is no longer passed: the argument is deprecated in recent
    # PyTorch releases. The current learning rate can be read with
    # `scheduler.get_last_lr()`.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

    trained_models[model_key] = train_model(net, criterion, optimizer, scheduler)

# Keep the per-model names available for later cells.
resnet = trained_models["resnet18"]
efficientnet = trained_models["efficientnet"]

print("Treino concluído! Modelos salvos em:", MODEL_SAVE_DIR)