In [15]:
import os
import io
import json
import time
import platform
import requests
import h5py
import psutil
import imagehash
import numpy as np
import pandas as pd

from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from torchvision import transforms


from pygbif import occurrences
from pyinaturalist.node_api import get_observations

In [16]:
DATA_DIR = "full_image_dataset"          # Expected input dir: species_name/*.jpg
IMG_SIZE_THRESHOLD = 200              # Min resolution (px)
HASH_THRESHOLD = 8                    # Duplicate threshold using phash

species_keys = {
    "Carduelis carduelis": 2494686,
    "Ciconia ciconia": 2481912,
    "Columba livia": 2495414,
    "Delichon urbicum": 2489214,
    "Emberiza calandra":7634625,
    "Hirundo rustica": 7192162,
    "Passer domesticus": 5231190,
    "Serinus serinus":2494200,
    "Streptopelia decaocto": 2495696,
    "Sturnus unicolor":2489104,
    "Turdus merula": 6171845   
}

CONFIG = {
    'IMG_SIZE': (224, 224),
    'TEST_SIZE': 0.15,
    'TRAIN_SIZE': 0.7,
    'VAL_SIZE': 0.15,
    'N_SPLITS': 5,
    'COMPRESSION': 'gzip',
    'COMPRESSION_LEVEL': 6,
    'SAVE_AS_JPEG': True,
    'JPEG_QUALITY': 80,
    'AUGMENTATION': {
        'train': [
            {'name': 'RandomResizedCrop', 'size': (224, 224), 'scale': (0.8, 1.0)},
            {'name': 'RandomHorizontalFlip', 'p': 0.5},
            {'name': 'RandomRotation', 'degrees': 15},
            {'name': 'ColorJitter', 'brightness': 0.1, 'contrast': 0.1, 'saturation': 0.1, 'hue': 0.05, 'p': 0.8},
        ]
    }
}

In [17]:
def getSystemInfo():
    return {
        "timestamp": datetime.now().strftime('%Y%m%d_%H%M%S'),
        "os": platform.system(),
        "os_version": platform.release(),
        "cpu": platform.processor(),
        "ram_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "python_version": platform.python_version()
    }

def initLogging(output_dir):
    metadata = {
        "config": CONFIG,
        "system": getSystemInfo(),
        "download": {},
        "cleaning": {},
        "augmentation": {},
        "dataset_stats": {},
        "processing_times": {}
    }
    os.makedirs(output_dir, exist_ok=True)
    metadata_path = os.path.join(output_dir, f"dataset_prep_{datetime.now().strftime('%Y%m%d')}.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    
    return metadata_path

def updateLogging(metadata_path, updates):
    if not os.path.exists(metadata_path):
        return initLogging(os.path.dirname(metadata_path))
    
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    
    metadata.update(updates)
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

In [18]:
def apply_augmentation(img):
    transforms_list = []
    for aug in CONFIG['AUGMENTATION']['train']:
        if aug['name'] == 'RandomResizedCrop':
            transforms_list.append(transforms.RandomResizedCrop(size=aug['size'], scale=aug['scale']))
        elif aug['name'] == 'RandomHorizontalFlip':
            transforms_list.append(transforms.RandomHorizontalFlip(p=aug['p']))
        elif aug['name'] == 'RandomRotation':
            transforms_list.append(transforms.RandomRotation(degrees=aug['degrees']))
        elif aug['name'] == 'ColorJitter':
            transforms_list.append(transforms.ColorJitter(brightness=aug['brightness'], contrast=aug['contrast'],
                                                          saturation=aug['saturation'], hue=aug['hue']))
    return transforms.Compose(transforms_list)(img)


def downloadImages(species_name, output_dir, limit=500, metadata_path=None):
    start_time = time.time()
    output_dir = os.path.join(DATA_DIR, species_name.replace(" ", "_"))
    os.makedirs(output_dir, exist_ok=True)
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)
    
    print(f"\nDownloading images for: {species_name}")
    stats = {
        'iNaturalist': 0,
        'GBIF': 0,
        'start_time': datetime.now().strftime('%Y%m%d_%H%M%S')
    }
    
    try:
        #iNaturalist download
        stats['iNaturalist'] = downloadImages_INaturalist(species_name, output_dir, limit)
        
        # GBIF download
        current_count = stats['iNaturalist']
        stats['GBIF'] = downloadImages_GBIF(species_name, current_count, output_dir, limit - current_count)
        
    except Exception as e:
        print(f"Error during download: {e}")
    
    # Update metadata
    stats.update({
        'end_time': datetime.now().strftime('%Y%m%d_%H%M%S'),
        'total_downloaded': stats['iNaturalist'] + stats['GBIF'],
        'time_seconds': time.time() - start_time
    })
    
    updateLogging(metadata_path, {"download": {species_name: stats}})
    print(f"Total images downloaded for {species_name}: {stats['total_downloaded']}")
    return stats['total_downloaded']

def downloadImages_INaturalist(species_name, output_dir, limit=500):
    results = get_observations(
        taxon_name=species_name,
        per_page=limit,
        quality_grade="research",
        media_type="photo",
        license=["CC-BY","CC-BY-NC"] 
    )

    images_downloaded = 0
    seen_urls = set()

    for obs in tqdm(results.get("results", [])):
        for photo in obs.get("photos",[]):
            url = photo.get("url", "").replace("square", "original")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                img = Image.open(io.BytesIO(response.content)).convert('RGB')
                augmented_img = apply_augmentation(img)
                image_ext = url.split(".")[-1].split("?")[0]
                filename = f"{species_name.replace(' ', '_')}_{images_downloaded}.{image_ext}"
                
                augmented_img.save(os.path.join(output_dir, filename), quality=CONFIG['JPEG_QUALITY'])
                images_downloaded += 1
            except requests.exceptions.RequestException as e:
                print(f"Download error for {url}: {e}")
            except Exception as e:
                print(f"Error processing or saving {url}: {e}")

            if images_downloaded >= limit:
                break
        if images_downloaded >= limit:
            break

    print(f"Downloaded {images_downloaded} images from iNaturalist for {species_name}")
    return images_downloaded

def downloadImages_GBIF(species_name, downloadedValue, output_dir, limit=500):
    images_downloaded = 0
    try:
        result = occurrences.search(
            taxonKey=species_keys[species_name],
            mediaType="StillImage",
            limit=limit
        )
        
        for idx, occ in enumerate(result.get("results", [])):
            for media in occ.get("media",[]):
                if media.get("type") != "StillImage":
                    continue
                    
                imgURL = media.get("identifier")
                if not imgURL:
                    continue
                    
                try:
                    response = requests.get(imgURL, timeout=10)
                    response.raise_for_status()

                    img = Image.open(io.BytesIO(response.content)).convert('RGB')
                    augmented_img = apply_augmentation(img)
                    ext = 'jpg' if 'jpeg' in response.headers.get('content-type', '') else 'png'
                    filename = f"{species_name.replace(' ', '_')}_{downloadedValue + images_downloaded}.{ext}"

                    augmented_img.save(os.path.join(output_dir, filename), quality=CONFIG['JPEG_QUALITY'])
                    images_downloaded += 1
                    print(f"Downloaded image {images_downloaded}/{limit}", end='\r')

                except requests.exceptions.RequestException as e:
                    print(f"\nDownload error for image {idx} for {species_name}: {e}")
                    continue
                except Exception as e:
                    print(f"\nError processing or saving image {idx} for {species_name}: {e}")
                    continue
                if images_downloaded >= limit:
                    break
            if images_downloaded >= limit:
                break 
    except Exception as e:
        print(f"\nError fetching occurrences for {species_name}: {e}")
    print(f"\nDownloaded {images_downloaded} images from GBIF for {species_name}")
    return images_downloaded

In [19]:
def isValidImage(path):
    try:
        img = Image.open(path).convert("RGB")
        return min(img.size) >= IMG_SIZE_THRESHOLD
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return False

def getPhash(path):
    try:
        img = Image.open(path).convert("RGB")
        return imagehash.phash(img)
    except Exception as e:
        print(f"Error generating hash for {path}: {e}")
        return None

def cleanData(species_name, metadata_path=None):
    start_time = time.time()
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)

    # Initialize hash_db as a dictionary instead of a list
    hash_db = defaultdict(list)
    total_removed = 0
    imgs_to_remove = []
    valid_images_count = 0

    print(f"Starting image cleaning for: {species_name}")

    species_path = Path(DATA_DIR) / species_name.replace(" ", "_")
    if not species_path.is_dir():
        print(f"Directory {species_path} does not exist.")
        return 0

    # Get all image paths first
    img_paths = list(species_path.glob("*.*"))
    
    for img_path in tqdm(img_paths, desc=f"Cleaning {species_name}"):
        if not isValidImage(img_path):
            imgs_to_remove.append(img_path)
            continue

        phash = getPhash(img_path)
        if phash is None:
            imgs_to_remove.append(img_path)
            continue

        # Check against existing hashes for this species
        is_duplicate = any(phash - existing < HASH_THRESHOLD for existing in hash_db[species_name])
        if is_duplicate:
            imgs_to_remove.append(img_path)
        else:
            hash_db[species_name].append(phash)
            valid_images_count += 1

    # Remove invalid/duplicate files
    for img_path in imgs_to_remove:
        try:
            os.remove(img_path)
            total_removed += 1
        except OSError as e:
            print(f"Error deleting {img_path}: {e}")

    stats = {
        'species': species_name,
        'total_removed': total_removed,
        'remaining_images': valid_images_count,
        'time_seconds': time.time() - start_time,
        'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S')
    }

    updateLogging(metadata_path, {"cleaning": {species_name: stats}})
    print(f"Finished cleaning {species_name}. Removed: {total_removed}, Remaining: {valid_images_count}")
    return total_removed


In [None]:
def createDataset(metadata_path=None):
    start_time = time.time()
    
    # Initialize logging
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)
    
    # Data collection structures
    images = []
    filepaths = []
    labels = []
    groups = []
    species_counts = defaultdict(int)
    
    # Process images with progress bar
    print("Processing images...")
    for idx, specie in tqdm(enumerate(species_keys), total=len(species_keys), desc="Species"):
        specie_dir = os.path.join(DATA_DIR, specie.replace(" ", "_"))
        if not os.path.exists(specie_dir):
            print(f"\nDirectory {specie_dir} does not exist. Skipping...")
            continue
        
        for img_name in os.listdir(specie_dir):
            img_path = os.path.join(specie_dir, img_name)
            try:
                with Image.open(img_path) as img:
                    img = img.convert('RGB').resize(CONFIG['IMG_SIZE'])
                    img_array = np.array(img)
                
                images.append(img_array)
                filepaths.append(img_path)
                labels.append(idx)
                groups.append(specie)
                species_counts[specie] += 1
            except Exception as e:
                print(f"\nError processing {img_path}: {e}")

    # Convert to numpy arrays
    X = np.array(images)
    y = np.array(labels)
    groups = np.array(groups)
    
    # Create DataFrame
    df = pd.DataFrame({
        'filepath': filepaths,
        'label_idx': y,
        'species': groups,
        'split': '',
        'fold': -1
    })
    
    # First split: 85% (train+val) vs 15% test
    train_val_idx, test_idx = train_test_split(
        X,
        test_size=CONFIG['TEST_SIZE'],
        stratify=y,
        random_state=42
    )
    df.loc[test_idx, 'split'] = 'test'
    
    # Cross-validation folds on train+val data (85%)
    sgkf = StratifiedGroupKFold(
        n_splits=CONFIG['N_SPLITS'],
        shuffle=True,
        random_state=42
    )
    
    # For each fold, we'll have:
    # - 70% train (of total)
    # - 15% val (of total)
    # - 15% test (fixed)
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(
        df.loc[train_val_idx],
        df.loc[train_val_idx, 'label_idx'],
        df.loc[train_val_idx, 'species']
    )):
        # Get the original indices
        original_train_idx = df.index[train_val_idx[train_idx]]
        original_val_idx = df.index[train_val_idx[val_idx]]
        
        # Assign splits for this fold
        df.loc[original_train_idx, 'split'] = 'train'
        df.loc[original_train_idx, 'fold'] = fold
        df.loc[original_val_idx, 'split'] = 'val'
        df.loc[original_val_idx, 'fold'] = fold
    
    # Save CSV metadata
    csv_path = os.path.join(DATA_DIR, f"dataset_metadata_{datetime.now().strftime('%Y%m%d')}.csv")
    df.to_csv(csv_path, index=False)
    print(f"\nSaved metadata to: {csv_path}")

    # Create HDF5 dataset
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    h5_path = os.path.join(DATA_DIR, f"dataset_{datetime.now().strftime('%Y%m%d')}.h5")
    
    with h5py.File(h5_path, 'w') as hf:
        # Store all images and labels
        hf.create_dataset('X', data=X, 
                         compression=CONFIG['COMPRESSION'],
                         compression_opts=CONFIG['COMPRESSION_LEVEL'])
        hf.create_dataset('y', data=y,
                         compression=CONFIG['COMPRESSION'],
                         compression_opts=CONFIG['COMPRESSION_LEVEL'])
        
        # Create groups for each fold
        cv_group = hf.create_group('cross_validation')
        
        for fold in range(CONFIG['N_SPLITS']):
            fold_group = cv_group.create_group(f'fold_{fold}')
            
            # Get indices for this fold
            train_mask = (df['fold'] == fold) & (df['split'] == 'train')
            val_mask = (df['fold'] == fold) & (df['split'] == 'val')
            test_mask = (df['split'] == 'test')
            
            # Store data for this fold
            fold_group.create_dataset('X_train', data=X[train_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('y_train', data=y[train_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('X_val', data=X[val_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('y_val', data=y[val_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('X_test', data=X[test_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('y_test', data=y[test_mask],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
        
        # Store metadata
        hf.attrs.update({
            'species': json.dumps(list(species_keys.keys())),
            'image_size': json.dumps(CONFIG['IMG_SIZE']),
            'augmentation': json.dumps(CONFIG['AUGMENTATION']),
            'creation_time': timestamp,
            'csv_reference': os.path.basename(csv_path),
            'n_folds': CONFIG['N_SPLITS'],
            'split_ratio': '70/15/15'
        })
    
    # Update metadata log
    dataset_stats = {
        'total_images': len(df),
        'species_counts': species_counts,
        'split_counts': {
            'train': len(df[df['split'] == 'train']) // CONFIG['N_SPLITS'],  
            'val': len(df[df['split'] == 'val']) // CONFIG['N_SPLITS'],      
            'test': len(df[df['split'] == 'test'])                           
        },
        'h5_path': h5_path,
        'csv_path': csv_path,
        'processing_time_seconds': round(time.time() - start_time, 2),
        'timestamp': timestamp,
        'memory_usage_mb': round(X.nbytes / (1024**2), 2)
    }
    
    updateLogging(metadata_path, {"dataset_stats": dataset_stats})

    print(f"\nDataset successfully created:")
    print(f"- HDF5 file: {h5_path}")
    print(f"- Metadata CSV: {csv_path}")
    print(f"\nSplit counts per fold:")
    print(f"Train: {dataset_stats['split_counts']['train']}")
    print(f"Validation: {dataset_stats['split_counts']['val']}")
    print(f"Test: {dataset_stats['split_counts']['test']}")
    print(f"\nProcessing time: {time.time() - start_time:.2f} seconds")

In [21]:
print("Initiating dataset creation...")
metadata_path = initLogging(DATA_DIR)
for species in species_keys.keys():
    downloadImages(species, DATA_DIR, limit=600, metadata_path=metadata_path)
    cleanData(species, metadata_path=metadata_path)
createDataset(metadata_path)
print("Tasks completed")


INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Carduelis+carduelis&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Initiating dataset creation...

Downloading images for: Carduelis carduelis


100%|██████████| 200/200 [07:27<00:00,  2.24s/it]


Downloaded 320 images from iNaturalist for Carduelis carduelis
Downloaded image 280/280
Downloaded 280 images from GBIF for Carduelis carduelis
Total images downloaded for Carduelis carduelis: 600
Starting image cleaning for: Carduelis carduelis


Cleaning Carduelis carduelis: 100%|██████████| 600/600 [00:01<00:00, 535.49it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Ciconia+ciconia&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Carduelis carduelis. Removed: 1, Remaining: 599

Downloading images for: Ciconia ciconia


100%|██████████| 200/200 [06:11<00:00,  1.86s/it]


Downloaded 313 images from iNaturalist for Ciconia ciconia
Downloaded image 287/287
Downloaded 287 images from GBIF for Ciconia ciconia
Total images downloaded for Ciconia ciconia: 600
Starting image cleaning for: Ciconia ciconia


Cleaning Ciconia ciconia: 100%|██████████| 600/600 [00:01<00:00, 535.53it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Columba+livia&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Ciconia ciconia. Removed: 4, Remaining: 596

Downloading images for: Columba livia


 18%|█▊        | 35/200 [01:04<04:42,  1.71s/it]

Download error for https://inaturalist-open-data.s3.amazonaws.com/photos/506132405/original.jpg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out.


 24%|██▎       | 47/200 [01:59<13:23,  5.25s/it]

Download error for https://inaturalist-open-data.s3.amazonaws.com/photos/506121502/original.jpg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


100%|██████████| 200/200 [12:16<00:00,  3.68s/it]


Downloaded 324 images from iNaturalist for Columba livia
Downloaded image 5/276
Download error for image 2 for Columba livia: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)
Downloaded image 276/276
Downloaded 276 images from GBIF for Columba livia
Total images downloaded for Columba livia: 600
Starting image cleaning for: Columba livia


Cleaning Columba livia: 100%|██████████| 600/600 [00:01<00:00, 555.87it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Delichon+urbicum&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Columba livia. Removed: 0, Remaining: 600

Downloading images for: Delichon urbicum


100%|██████████| 200/200 [08:05<00:00,  2.43s/it]


Downloaded 352 images from iNaturalist for Delichon urbicum
Downloaded image 248/248
Downloaded 248 images from GBIF for Delichon urbicum
Total images downloaded for Delichon urbicum: 600
Starting image cleaning for: Delichon urbicum


Cleaning Delichon urbicum: 100%|██████████| 600/600 [00:01<00:00, 582.76it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Emberiza+calandra&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Delichon urbicum. Removed: 34, Remaining: 566

Downloading images for: Emberiza calandra


100%|██████████| 200/200 [05:56<00:00,  1.78s/it]


Downloaded 299 images from iNaturalist for Emberiza calandra
Downloaded image 301/301
Downloaded 301 images from GBIF for Emberiza calandra
Total images downloaded for Emberiza calandra: 600
Starting image cleaning for: Emberiza calandra


Cleaning Emberiza calandra: 100%|██████████| 600/600 [00:01<00:00, 511.45it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Hirundo+rustica&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Emberiza calandra. Removed: 7, Remaining: 593

Downloading images for: Hirundo rustica


100%|██████████| 200/200 [07:01<00:00,  2.11s/it]


Downloaded 348 images from iNaturalist for Hirundo rustica
Downloaded image 252/252
Downloaded 252 images from GBIF for Hirundo rustica
Total images downloaded for Hirundo rustica: 600
Starting image cleaning for: Hirundo rustica


Cleaning Hirundo rustica: 100%|██████████| 600/600 [00:01<00:00, 576.02it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Passer+domesticus&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Hirundo rustica. Removed: 12, Remaining: 588

Downloading images for: Passer domesticus


100%|██████████| 200/200 [07:18<00:00,  2.19s/it]


Downloaded 326 images from iNaturalist for Passer domesticus
Downloaded image 274/274
Downloaded 274 images from GBIF for Passer domesticus
Total images downloaded for Passer domesticus: 600
Starting image cleaning for: Passer domesticus


Cleaning Passer domesticus: 100%|██████████| 600/600 [00:01<00:00, 551.71it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Serinus+serinus&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Passer domesticus. Removed: 6, Remaining: 594

Downloading images for: Serinus serinus


100%|██████████| 200/200 [05:20<00:00,  1.60s/it]


Downloaded 282 images from iNaturalist for Serinus serinus
Downloaded image 318/318
Downloaded 318 images from GBIF for Serinus serinus
Total images downloaded for Serinus serinus: 600
Starting image cleaning for: Serinus serinus


Cleaning Serinus serinus: 100%|██████████| 600/600 [00:01<00:00, 541.50it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Streptopelia+decaocto&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Serinus serinus. Removed: 2, Remaining: 598

Downloading images for: Streptopelia decaocto


100%|██████████| 200/200 [07:35<00:00,  2.28s/it]


Downloaded 357 images from iNaturalist for Streptopelia decaocto
Downloaded image 243/243
Downloaded 243 images from GBIF for Streptopelia decaocto
Total images downloaded for Streptopelia decaocto: 600
Starting image cleaning for: Streptopelia decaocto


Cleaning Streptopelia decaocto: 100%|██████████| 600/600 [00:01<00:00, 548.44it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Sturnus+unicolor&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Streptopelia decaocto. Removed: 5, Remaining: 595

Downloading images for: Sturnus unicolor


100%|██████████| 200/200 [05:58<00:00,  1.79s/it]


Downloaded 302 images from iNaturalist for Sturnus unicolor
Downloaded image 298/298
Downloaded 298 images from GBIF for Sturnus unicolor
Total images downloaded for Sturnus unicolor: 600
Starting image cleaning for: Sturnus unicolor


Cleaning Sturnus unicolor: 100%|██████████| 600/600 [00:01<00:00, 529.53it/s]
INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Turdus+merula&per_page=600&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Finished cleaning Sturnus unicolor. Removed: 3, Remaining: 597

Downloading images for: Turdus merula


100%|██████████| 200/200 [06:07<00:00,  1.84s/it]


Downloaded 263 images from iNaturalist for Turdus merula
Downloaded image 337/337
Downloaded 337 images from GBIF for Turdus merula
Total images downloaded for Turdus merula: 600
Starting image cleaning for: Turdus merula


Cleaning Turdus merula: 100%|██████████| 600/600 [00:01<00:00, 535.57it/s]


Finished cleaning Turdus merula. Removed: 28, Remaining: 572
Processing images...


Species: 100%|██████████| 11/11 [00:02<00:00,  3.81it/s]



Saved metadata to: full_image_dataset/dataset_metadata_20250517.csv

Dataset successfully created:
- HDF5 file: full_image_dataset/dataset_20250517_213505.h5
- Metadata CSV: full_image_dataset/dataset_metadata_20250517.csv

Split counts per fold:
Train: 907
Validation: 197
Test: 975

Processing time: 99.67 seconds
Tasks completed
