In [1]:
import pygbif
import requests
import os
from pathlib import Path
import shutil
import imagehash


from tqdm import tqdm
from PIL import Image
from collections import defaultdict
from pyinaturalist.node_api import get_observations
from ebird.api import get_observations as ebird_get_observations

  from pyinaturalist.node_api import get_observations


In [2]:
RAW_DATA_DIR = "new_dataset"          # Expected input dir: species_name/*.jpg
CLEAN_DATA_DIR = "clean_dataset"      # Output cleaned dir
IMG_SIZE_THRESHOLD = 200              # Min resolution (px)
HASH_THRESHOLD = 8                    # Duplicate threshold using phash
EBIRD_API_KEY = "r2qmi9gi3gpg"

species_keys = {
    "Carduelis carduelis": 2494686,
    "Ciconia ciconia": 2481912,
    "Columba livia": 2495414,
    "Delichon urbicum": 2489214,
    "Emberiza calandra":7634625,
    "Hirundo rustica": 7192162,
    "Passer domesticus": 5231190,
    "Serinus serinus":2494200,
    "Streptopelia decaocto": 2495696,
    "Sturnus unicolor":2489104,
    "Turdus merula": 6171845   
}

In [3]:
def isValidImage(path):
    try:
        img = Image.open(path).convert("RGB")
        if min(img.size) < IMG_SIZE_THRESHOLD:
            return False
        img.verify()
        return True
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return False

def getPhash(path):
    try:
        img = Image.open(path).convert("RGB")
        return imagehash.phash(img)
    except Exception as e:
        print(f"Error generating hash for {path}: {e}")
        return None

def downloadImages_INaturalist(species_name, output_dir, limit=500):
    results = get_observations(
        taxon_name=species_name,
        per_page=limit,
        quality_grade="research",
        media_type="photo",
        license=["CC-BY","CC-BY-NC"] 
    )

    images_downloaded = 0
    seen_urls = set()

    for obs in tqdm(results["results"]):
        if "photos" not in obs:
            continue
        for photo in obs["photos"]:
            url = photo.get("url", "")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)
            try:
                # Full-size image (not thumbnail)
                full_url = url.replace("square", "original")
                response = requests.get(full_url, timeout=10)
                if response.status_code == 200:
                    image_ext = full_url.split(".")[-1].split("?")[0]
                    filename = f"{species_name.replace(' ', '_')}_{images_downloaded}.{image_ext}"
                    file_path = os.path.join(output_dir, filename)
                    with open(file_path, "wb") as f:
                        f.write(response.content)
                    images_downloaded += 1
            except Exception as e:
                print(f"Error: {e}")

            if images_downloaded >= limit:
                break
        if images_downloaded >= limit:
            break

    print(f"Downloaded {images_downloaded} images for {species_name}")
    return images_downloaded

def downloadImages_GBIF(species_name, downloadedValue, output_dir, limit=500):
    taxon_key = species_keys[species_name]
    images_downloaded = downloadedValue+1
    
    try:
        # Fetch occurrences from GBIF
        occurrences = pygbif.occurrences.search(
            taxonKey=taxon_key,
            mediaType="StillImage",
            limit=limit
        )
        
        for idx, occ in enumerate(occurrences["results"]):
            if images_downloaded >= limit:
                break
                
            if "media" not in occ:
                continue
                
            # Process each media item that is a still image
            for media in occ["media"]:
                if media.get("type") != "StillImage":
                    continue
                    
                imgURL = media.get("identifier")
                if not imgURL:
                    continue
                    
                try:
                    # Download the image
                    response = requests.get(imgURL, timeout=10)
                    response.raise_for_status()
                    
                    # Determine file extension from content type
                    content_type = response.headers.get('content-type', 'image/jpeg')
                    ext = 'jpg' if 'jpeg' in content_type else 'png'
                    
                    # Save the image
                    filename = f"{species_name.replace(' ', '_')}_{images_downloaded}.{ext}"
                    filepath = os.path.join(output_dir, filename)
                    
                    with open(filepath, "wb") as f:
                        f.write(response.content)
                        
                    images_downloaded += 1
                    print(f"Downloaded image {images_downloaded}/{limit}", end='\r')
                    
                except Exception as e:
                    print(f"\nError downloading image {idx} for {species_name}: {e}")
                    continue
                    
    except Exception as e:
        print(f"\nError fetching occurrences for {species_name}: {e}")
    
    print(f"Downloaded {images_downloaded - downloadedValue} GBIF images for {species_name}")
    return images_downloaded


def downloadImages_eBird(species_name , downloadedValue , output_dir, limit=500):
    ## Get the species code from the species name
    headers = {'X-eBirdApiToken': EBIRD_API_KEY}
    region = 'world'
    # Step 1: Get species code (with 403 error handling)
    try:
        # Try direct taxonomy lookup first
        taxon_url = "https://api.ebird.org/v2/ref/taxon/find"
        response = requests.get(taxon_url, headers=headers, params={'species': species_name})
        response.raise_for_status()
        specie_code = response.json()[0]['speciesCode']
    except requests.HTTPError as e:
        if e.response.status_code == 403:
            # Fallback: Search recent observations for the species code
            print("Taxonomy endpoint blocked, using observation search fallback...")
            obs_search_url = f"https://api.ebird.org/v2/data/obs/{region}/recent"
            response = requests.get(obs_search_url, headers=headers, params={'species': species_name})
            response.raise_for_status()
            if not response.json():
                raise ValueError(f"No observations found for {species_name}")
            specie_code = response.json()[0]['speciesCode']
        else:
            raise
    
    # Step 2: Download images
    images_downloaded = 0
    obs_url = f"https://api.ebird.org/v2/data/obs/{region}/recent?speciesCode={specie_code}"
    observations = requests.get(obs_url, headers=headers).json()
    
    with tqdm(total=limit, desc=f"Downloading {species_name}") as pbar:
        for obs in observations:
            if images_downloaded >= limit:
                break
                
            if obs.get('hasMedia'):
                try:
                    # Get media details
                    media_url = f"https://api.ebird.org/v2/observation/{obs['subId']}"
                    media_data = requests.get(media_url, headers=headers).json()
                    
                    for media in media_data.get('media', []):
                        if media['type'] == 'photo':
                            try:
                                img_url = media['url']
                                response = requests.get(img_url, stream=True, timeout=15)
                                ext = 'jpg' if 'jpeg' in response.headers.get('content-type','') else 'png'
                                filename = f"{specie_code}_{images_downloaded}.{ext}"
                                
                                with open(os.path.join(output_dir, filename), 'wb') as f:
                                    for chunk in response.iter_content(8192):
                                        f.write(chunk)
                                
                                images_downloaded += 1
                                pbar.update(1)
                            except Exception as e:
                                print(f"Image download failed: {e}")
                                continue
                except Exception as e:
                    print(f"Observation processing failed: {e}")
                    continue
    
    print(f"\nDownloaded {images_downloaded} images for {species_name} ({specie_code})")
    return images_downloaded



def downloadImages(species_name, output_dir, limit=500):
    output_dir = os.path.join(RAW_DATA_DIR, species_name.replace(" ", "_"))
    os.makedirs(output_dir, exist_ok=True)
    print(f"\nDownloading images for: {species_name}")
    downloadedValue = downloadImages_INaturalist(species_name, output_dir, limit)
    downloadedValue = downloadImages_GBIF(species_name, downloadedValue, output_dir, limit)
    #downloadedValue = downloadImages_eBird(species_name, downloadedValue, output_dir, limit)
    print(f"Total images downloaded for {species_name}: {downloadedValue}")

def cleanData(species_name):
    hash_db = defaultdict(list)
    total_removed = 0
    print("Starting image extraction and cleaning...")

    for species in os.listdir(RAW_DATA_DIR):
        if species_name and species not in species_name:
            continue
        species_path = os.path.join(RAW_DATA_DIR, species)
        
        if not os.path.isdir(species_path):
            continue

        imgsRemove = []
        for img_path in species_path.glob("*.*"):
            if not isValidImage(img_path):
                imgsRemove.append(img_path)
                continue

            phash = getPhash(img_path)
            if phash is None:
                imgsRemove.append(img_path)
                continue

            # Check for duplicates
            is_duplicate = any(phash - existing < HASH_THRESHOLD 
                             for existing in hash_db[species])
            if is_duplicate:
                imgsRemove.append(img_path)
            else:
                hash_db[species].append(phash)

        # Remove invalid/duplicate files
        for img_path in imgsRemove:
            os.remove(img_path)
            total_removed += 1


    print(f"Finished cleaning. Total Images Removed: {total_removed}")


In [4]:
for species in species_keys.keys():
    downloadImages(species, RAW_DATA_DIR, limit=2000)
    cleanData(species)

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Carduelis+carduelis&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive




Downloading images for: Carduelis carduelis


100%|██████████| 200/200 [08:36<00:00,  2.58s/it]


Downloaded 266 images for Carduelis carduelis
Downloaded image 693/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Ciconia+ciconia&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 428 GBIF images for Carduelis carduelis
Total images downloaded for Carduelis carduelis: 694
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Ciconia ciconia


100%|██████████| 200/200 [07:39<00:00,  2.30s/it]


Downloaded 255 images for Ciconia ciconia
Downloaded image 738/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Columba+livia&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 484 GBIF images for Ciconia ciconia
Total images downloaded for Ciconia ciconia: 739
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Columba livia


100%|██████████| 200/200 [14:50<00:00,  4.45s/it]


Downloaded 300 images for Columba livia
Downloaded image 774/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Delichon+urbicum&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 475 GBIF images for Columba livia
Total images downloaded for Columba livia: 775
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Delichon urbicum


100%|██████████| 200/200 [11:19<00:00,  3.40s/it]


Downloaded 398 images for Delichon urbicum
Downloaded image 889/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Emberiza+calandra&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 492 GBIF images for Delichon urbicum
Total images downloaded for Delichon urbicum: 890
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Emberiza calandra


100%|██████████| 200/200 [08:28<00:00,  2.54s/it]


Downloaded 294 images for Emberiza calandra
Downloaded image 818/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Hirundo+rustica&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 525 GBIF images for Emberiza calandra
Total images downloaded for Emberiza calandra: 819
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Hirundo rustica


100%|██████████| 200/200 [10:32<00:00,  3.16s/it]


Downloaded 343 images for Hirundo rustica
Downloaded image 874/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Passer+domesticus&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 532 GBIF images for Hirundo rustica
Total images downloaded for Hirundo rustica: 875
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Passer domesticus


100%|██████████| 200/200 [10:40<00:00,  3.20s/it]


Downloaded 290 images for Passer domesticus
Downloaded image 836/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Serinus+serinus&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 547 GBIF images for Passer domesticus
Total images downloaded for Passer domesticus: 837
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Serinus serinus


100%|██████████| 200/200 [09:25<00:00,  2.83s/it]


Downloaded 279 images for Serinus serinus
Downloaded image 787/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Streptopelia+decaocto&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 509 GBIF images for Serinus serinus
Total images downloaded for Serinus serinus: 788
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Streptopelia decaocto


100%|██████████| 200/200 [11:01<00:00,  3.31s/it]


Downloaded 321 images for Streptopelia decaocto
Downloaded image 767/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Sturnus+unicolor&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 447 GBIF images for Streptopelia decaocto
Total images downloaded for Streptopelia decaocto: 768
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Sturnus unicolor


100%|██████████| 200/200 [05:10<00:00,  1.55s/it]


Downloaded 261 images for Sturnus unicolor
Downloaded image 634/2000

INFO:Request:
GET https://api.inaturalist.org/v1/observations?license=CC-BY%2CCC-BY-NC&quality_grade=research&taxon_name=Turdus+merula&per_page=2000&media_type=photo
User-Agent: python-requests/2.32.3 pyinaturalist/0.20.1
Accept-Encoding: gzip, deflate, br
Accept: application/json
Connection: keep-alive



Downloaded 374 GBIF images for Sturnus unicolor
Total images downloaded for Sturnus unicolor: 635
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0

Downloading images for: Turdus merula


100%|██████████| 200/200 [06:28<00:00,  1.94s/it]


Downloaded 267 images for Turdus merula
Downloaded image 600/2000
Error downloading image 162 for Turdus merula: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out.
Downloaded 569 GBIF images for Turdus merula
Total images downloaded for Turdus merula: 836
Starting image extraction and cleaning...
Finished cleaning. Total Images Removed: 0
