In [23]:
import os
import io
import json
import time
import platform
import requests
import h5py
import psutil
import imagehash
import numpy as np

from PIL import Image
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from torchvision import transforms


from pygbif import occurrences
from pyinaturalist.node_api import get_observations
from ebird.api import get_observations as ebird_get_observations

In [None]:
DATA_DIR = "new_dataset"          # Expected input dir: species_name/*.jpg
IMG_SIZE_THRESHOLD = 200              # Min resolution (px)
HASH_THRESHOLD = 8                    # Duplicate threshold using phash
EBIRD_API_KEY = "r2qmi9gi3gpg"

species_keys = {
    "Carduelis carduelis": 2494686,
    "Ciconia ciconia": 2481912,
    "Columba livia": 2495414,
    "Delichon urbicum": 2489214,
    "Emberiza calandra":7634625,
    "Hirundo rustica": 7192162,
    "Passer domesticus": 5231190,
    "Serinus serinus":2494200,
    "Streptopelia decaocto": 2495696,
    "Sturnus unicolor":2489104,
    "Turdus merula": 6171845   
}

CONFIG = {
    'IMG_SIZE': (224, 224),
    'TEST_SIZE': 0.15,
    'TRAIN_SIZE': 0.7,
    'VAL_SIZE': 0.1,
    'N_SPLITS': 5,
    'COMPRESSION': 'gzip',
    'COMPRESSION_LEVEL': 9,
    'AUGMENTATION': {
        'train': [
            {'name': 'RandomHorizontalFlip', 'p': 0.5},
            {'name': 'RandomRotation', 'degrees': 20},
            {'name': 'ColorJitter', 'brightness': 0.1, 'contrast': 0.1, 'saturation': 0.1}
        ]
    }
}

In [25]:
def getSystemInfo():
    return {
        "timestamp": datetime.now().isoformat(),
        "os": platform.system(),
        "os_version": platform.release(),
        "cpu": platform.processor(),
        "ram_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "python_version": platform.python_version()
    }

def initLogging(output_dir):
    metadata = {
        "config": CONFIG,
        "system": getSystemInfo(),
        "download": {},
        "cleaning": {},
        "augmentation": {},
        "dataset_stats": {},
        "processing_times": {}
    }
    os.makedirs(output_dir, exist_ok=True)
    metadata_path = os.path.join(output_dir, "dataset_prep.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    
    return metadata_path

def updateLogging(metadata_path, updates):
    if not os.path.exists(metadata_path):
        return initLogging(os.path.dirname(metadata_path))
    
    with open(metadata_path, 'r') as f:
        metadata = json.load(f)
    
    metadata.update(updates)
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

In [26]:
def apply_augmentation(img):
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(20),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    ])
    return transform(img)


def downloadImages(species_name, output_dir, limit=500, metadata_path=None):
    start_time = time.time()
    output_dir = os.path.join(DATA_DIR, species_name.replace(" ", "_"))
    os.makedirs(output_dir, exist_ok=True)
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)
    
    print(f"\nDownloading images for: {species_name}")
    stats = {
        'iNaturalist': 0,
        'GBIF': 0,
        'start_time': datetime.now().isoformat()
    }
    
    try:
        #iNaturalist download
        stats['iNaturalist'] = downloadImages_INaturalist(species_name, output_dir, limit)
        
        # GBIF download
        current_count = stats['iNaturalist']
        stats['GBIF'] = downloadImages_GBIF(species_name, current_count, output_dir, limit - current_count)
        
    except Exception as e:
        print(f"Error during download: {e}")
    
    # Update metadata
    stats.update({
        'end_time': datetime.now().isoformat(),
        'total_downloaded': stats['iNaturalist'] + stats['GBIF'],
        'time_seconds': time.time() - start_time
    })
    
    updateLogging(metadata_path, {"download": {species_name: stats}})
    print(f"Total images downloaded for {species_name}: {stats['total_downloaded']}")
    return stats['total_downloaded']

def downloadImages_INaturalist(species_name, output_dir, limit=500):
    results = get_observations(
        taxon_name=species_name,
        per_page=limit,
        quality_grade="research",
        media_type="photo",
        license=["CC-BY","CC-BY-NC"] 
    )

    images_downloaded = 0
    seen_urls = set()

    for obs in tqdm(results.get("results", [])):
        for photo in obs.get("photos",[]):
            url = photo.get("url", "").replace("square", "original")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                # Full-size image (not thumbnail)
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    # Apply augmentation before saving
                    img = Image.open(io.BytesIO(response.content)).convert('RGB')
                    img = apply_augmentation(img)  # Apply augmentation
                    image_ext = url.split(".")[-1].split("?")[0]
                    filename = f"{species_name.replace(' ', '_')}_{images_downloaded}.{image_ext}"
                    
                    # Save augmented image
                    img.save(os.path.join(output_dir, filename))
                    images_downloaded += 1
            except Exception as e:
                print(f"Error: {e}")

            if images_downloaded >= limit:
                break
        if images_downloaded >= limit:
            break

    print(f"Downloaded {images_downloaded} images from iNaturalist for {species_name}")
    return images_downloaded

def downloadImages_GBIF(species_name, downloadedValue, output_dir, limit=500):
    images_downloaded = 0
    try:
        result = occurrences.search(
            taxonKey=species_keys[species_name],
            mediaType="StillImage",
            limit=limit
        )
        
        for idx, occ in enumerate(result.get("results", [])):
            for media in occ.get("media",[]):
                if media.get("type") != "StillImage":
                    continue
                    
                imgURL = media.get("identifier")
                if not imgURL:
                    continue
                    
                try:
                    response = requests.get(imgURL, timeout=10)
                    response.raise_for_status()
                    
                    # Apply augmentation before saving
                    img = Image.open(io.BytesIO(response.content)).convert('RGB')
                    img = apply_augmentation(img)
                    ext = 'jpg' if 'jpeg' in response.headers.get('content-type', '') else 'png'
                    filename = f"{species_name.replace(' ', '_')}_{downloadedValue + images_downloaded}.{ext}"
                    
                    #Save
                    img.save(os.path.join(output_dir, filename))
                    images_downloaded += 1
                    print(f"Downloaded image {images_downloaded}/{limit}", end='\r')
                    
                except Exception as e:
                    print(f"\nError downloading image {idx} for {species_name}: {e}")
                    continue
                if images_downloaded >= limit:
                    break
            if images_downloaded >= limit:
                break   
    except Exception as e:
        print(f"\nError fetching occurrences for {species_name}: {e}")
    print(f"\nDownloaded {images_downloaded} images from GBIF for {species_name}")
    return images_downloaded

In [27]:
def isValidImage(path):
    try:
        img = Image.open(path).convert("RGB")
        return min(img.size) >= IMG_SIZE_THRESHOLD
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return False

def getPhash(path):
    try:
        img = Image.open(path).convert("RGB")
        return imagehash.phash(img)
    except Exception as e:
        print(f"Error generating hash for {path}: {e}")
        return None

def cleanData(species_name, metadata_path=None):
    start_time = time.time()
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)

    hash_db = []
    total_removed = 0
    total_valid_images = 0
    print("Starting image extraction and cleaning...")

    for species in os.listdir(DATA_DIR):
        if species_name and species not in species_name:
            continue
        species_path = Path(DATA_DIR) / species
        if not species_path.is_dir():
            continue

        imgsRemove = []
        valid_images = 0
        for img_path in species_path.glob("*.*"):
            if not isValidImage(img_path):
                imgsRemove.append(img_path)
                continue

            phash = getPhash(img_path)
            if phash is None:
                imgsRemove.append(img_path)
                continue

            # Check for duplicates
            is_duplicate = any(phash - existing < HASH_THRESHOLD for existing in hash_db[species])
            if is_duplicate:
                imgsRemove.append(img_path)
            else:
                hash_db[species].append(phash)
                valid_images += 1

        # Remove invalid/duplicate files
        for img_path in imgsRemove:
            os.remove(img_path)
            total_removed += 1

        total_valid_images += valid_images 

    stats = {
        'species': species_name,
        'total_removed': total_removed,
        'remaining_images': total_valid_images,
        'time_seconds': time.time() - start_time,
        'timestamp': datetime.now().isoformat()
    }

    updateLogging(metadata_path, {"cleaning": {species_name: stats}})
    print(f"Finished cleaning. Total Images Removed: {total_removed}")
    return total_removed


In [None]:
def createDataset(metadata_path=None):
    start_time = time.time()
    
    # Initialize log if not provided
    if metadata_path is None:
        metadata_path = initLogging(DATA_DIR)
    
    images = []
    labels = []
    species_counts = defaultdict(int)
        
    for idx, specie in enumerate(species_keys):
        specie_dir = os.path.join(DATA_DIR, specie.replace(" ", "_"))  
        for img_name in os.listdir(specie_dir):
            img_path = os.path.join(specie_dir, img_name)   
            try:
                img = Image.open(img_path).convert('RGB').resize(CONFIG['IMG_SIZE'])
                images.append(np.array(img))
                labels.append(idx)
                species_counts[specie] += 1
            except Exception as e:
                print(f"Error processing {img_path}: {e}")

    # Split into train and test sets
    X = np.array(images)
    y = np.array(labels)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=CONFIG['TEST_SIZE'], stratify=y
    )

    # Cross Validation with Stratified K-Folds
    cv = StratifiedShuffleSplit(
        n_splits=CONFIG['N_SPLITS'], 
        test_size=CONFIG['TEST_SIZE'], 
        train_size=CONFIG['TRAIN_SIZE'], 
        random_state=42
    )

    # Save to HDF5 with versioning
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"dataset_{timestamp}.h5"
    output_path = os.path.join(DATA_DIR, output_file)
    
    with h5py.File(output_path, 'w') as hf:
        # Test set
        test_group = hf.create_group('test')
        test_group.create_dataset('X_test', data=X_test, 
                                compression=CONFIG['COMPRESSION'], 
                                compression_opts=CONFIG['COMPRESSION_LEVEL'])
        test_group.create_dataset('y_test', data=y_test,
                                compression=CONFIG['COMPRESSION'],
                                compression_opts=CONFIG['COMPRESSION_LEVEL'])

        # Cross-validation splits
        cv_group = hf.create_group('cross_validation')
        for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            fold_group = cv_group.create_group(f'fold_{fold + 1}')
            fold_group.create_dataset('X_train', data=X_train[train_idx],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('y_train', data=y_train[train_idx],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('X_val', data=X_train[val_idx],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])
            fold_group.create_dataset('y_val', data=y_train[val_idx],
                                    compression=CONFIG['COMPRESSION'],
                                    compression_opts=CONFIG['COMPRESSION_LEVEL'])

        # Save metadata
        hf.attrs['species'] = json.dumps(list(species_keys.keys()))
        hf.attrs['image_size'] = json.dumps(CONFIG['IMG_SIZE'])
        hf.attrs['augmentation'] = json.dumps(CONFIG['AUGMENTATION'])
        hf.attrs['creation_time'] = timestamp
    
    # Update metadata log
    dataset_stats = {
        'total_images': len(images),
        'species_counts': dict(species_counts),
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        'output_path': output_path,
        'compression': CONFIG['COMPRESSION'],
        'compression_level': CONFIG['COMPRESSION_LEVEL'],
        'processing_time_seconds': time.time() - start_time,
        'timestamp': timestamp
    }
    
    updateLogging(metadata_path, {
        "dataset_stats": dataset_stats
    })

    print(f"Dataset created at {output_path}")
    print(f"Total processing time: {time.time() - start_time:.2f} seconds")

In [29]:
print("Initiating dataset creation...")
metadata_path = initLogging(DATA_DIR)
for species in species_keys.keys():
    downloadImages(species, DATA_DIR, limit=2000, metadata_path=metadata_path)
    cleanData(species, metadata_path=metadata_path)
createDataset(metadata_path)
print("Tasks completed")


INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Carduelis+carduelis&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Initiating dataset creation...

Downloading images for: Carduelis carduelis


100%|██████████| 200/200 [07:31<00:00,  2.26s/it]


Downloaded 292 images from iNaturalist for Carduelis carduelis
Downloaded image 426/1708

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Ciconia+ciconia&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 427/1708
Downloaded 427 images from GBIF for Carduelis carduelis
Total images downloaded for Carduelis carduelis: 719
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Ciconia ciconia


100%|██████████| 200/200 [07:24<00:00,  2.22s/it]


Downloaded 313 images from iNaturalist for Ciconia ciconia
Downloaded image 482/1687

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Columba+livia&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 483/1687
Downloaded 483 images from GBIF for Ciconia ciconia
Total images downloaded for Ciconia ciconia: 796
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Columba livia


100%|██████████| 200/200 [07:36<00:00,  2.28s/it]


Downloaded 302 images from iNaturalist for Columba livia
Downloaded image 473/1698

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Delichon+urbicum&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 474/1698
Downloaded 474 images from GBIF for Columba livia
Total images downloaded for Columba livia: 776
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Delichon urbicum


100%|██████████| 200/200 [08:22<00:00,  2.51s/it]


Downloaded 391 images from iNaturalist for Delichon urbicum
Downloaded image 490/1609

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Emberiza+calandra&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 491/1609
Downloaded 491 images from GBIF for Delichon urbicum
Total images downloaded for Delichon urbicum: 882
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Emberiza calandra


100%|██████████| 200/200 [06:58<00:00,  2.09s/it]


Downloaded 325 images from iNaturalist for Emberiza calandra
Downloaded image 523/1675

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Hirundo+rustica&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 524/1675
Downloaded 524 images from GBIF for Emberiza calandra
Total images downloaded for Emberiza calandra: 849
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Hirundo rustica


100%|██████████| 200/200 [08:16<00:00,  2.48s/it]


Downloaded 370 images from iNaturalist for Hirundo rustica
Downloaded image 526/1630

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Passer+domesticus&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 527/1630
Downloaded 527 images from GBIF for Hirundo rustica
Total images downloaded for Hirundo rustica: 897
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Passer domesticus


100%|██████████| 200/200 [07:23<00:00,  2.22s/it]


Downloaded 309 images from iNaturalist for Passer domesticus
Downloaded image 545/1691

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Serinus+serinus&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 546/1691
Downloaded 546 images from GBIF for Passer domesticus
Total images downloaded for Passer domesticus: 855
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Serinus serinus


100%|██████████| 200/200 [07:48<00:00,  2.34s/it]


Downloaded 334 images from iNaturalist for Serinus serinus
Downloaded image 507/1666

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Streptopelia+decaocto&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 508/1666
Downloaded 508 images from GBIF for Serinus serinus
Total images downloaded for Serinus serinus: 842
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Streptopelia decaocto


100%|██████████| 200/200 [07:38<00:00,  2.29s/it]


Downloaded 335 images from iNaturalist for Streptopelia decaocto
Downloaded image 445/1665

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Sturnus+unicolor&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 446/1665
Downloaded 446 images from GBIF for Streptopelia decaocto
Total images downloaded for Streptopelia decaocto: 781
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Sturnus unicolor


100%|██████████| 200/200 [07:12<00:00,  2.16s/it]


Downloaded 300 images from iNaturalist for Sturnus unicolor
Downloaded image 372/1700

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Turdus+merula&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded image 373/1700
Downloaded 373 images from GBIF for Sturnus unicolor
Total images downloaded for Sturnus unicolor: 673
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Turdus merula


100%|██████████| 200/200 [08:05<00:00,  2.43s/it]


Downloaded 332 images from iNaturalist for Turdus merula
Downloaded image 569/1668
Downloaded 569 images from GBIF for Turdus merula
Total images downloaded for Turdus merula: 901
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0
Dataset created at new_dataset/dataset_20250506_024257.h5
Total processing time: 518.53 seconds
Tasks completed
