<a href="https://colab.research.google.com/github/henry5250/SpotCheck/blob/main/scripts/1_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests
from time import sleep

def download_inat_images(species_name, max_images=100, delay=1, timeout=30):
    """
    Downloads images from iNaturalist for a given species name.
    Only includes research-grade observations with photos.

    The first photo of each observation is fetched at "medium" size and
    written as a sequentially numbered file (0001.jpg, 0002.jpg, ...) into a
    folder named after the species (spaces replaced by underscores), relative
    to the current working directory.

    Args:
        species_name: Taxon name sent to the API as ``taxon_name``.
        max_images: Stop once this many images have been saved.
        delay: Seconds to sleep after each saved image.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and the final count are printed.
    """
    folder_name = species_name.replace(" ", "_")
    os.makedirs(folder_name, exist_ok=True)

    base_url = "https://api.inaturalist.org/v1/observations"
    params = {
        "taxon_name": species_name,
        "quality_grade": "research",
        "has": "photos",
        "per_page": 100,   # observations requested per page
        "page": 1
    }

    downloaded = 0
    seen_urls = set()

    while downloaded < max_images:
        print(f"Fetching page {params['page']} for {species_name}...")
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print("Error fetching data:", response.status_code)
            break

        data = response.json().get("results", [])
        if not data:
            print("No more results found.")
            break

        for obs in data:
            photos = obs.get("photos")
            if not photos:
                continue
            thumb_url = photos[0].get("url")
            if not thumb_url:
                # A photo record without a URL cannot be downloaded; skip it
                # rather than aborting the whole run with a KeyError.
                continue
            img_url = thumb_url.replace("square", "medium")
            if img_url in seen_urls:
                continue
            seen_urls.add(img_url)

            try:
                img_response = requests.get(img_url, timeout=timeout)
                # Reject 4xx/5xx so an error page is never saved as a .jpg.
                img_response.raise_for_status()
                img_data = img_response.content
                if not img_data:
                    raise ValueError(f"empty response body for {img_url}")
                img_path = os.path.join(folder_name, f"{downloaded+1:04}.jpg")
                with open(img_path, "wb") as f:
                    f.write(img_data)
            except Exception as e:
                # Best effort: report the failure and move on to the next photo.
                print("Failed to download image:", e)
                continue

            downloaded += 1
            print(f"Saved: {img_path}")
            if downloaded >= max_images:
                break
            sleep(delay)  # polite delay to avoid rate limits

        params["page"] += 1

    print(f"\n✅ Downloaded {downloaded} images for {species_name}\n")

# Example usage: request up to 120 images for each species in turn.
for species in ("Harmonia axyridis", "Coccinella novemnotata"):
    download_inat_images(species, max_images=120)

Fetching page 1 for Harmonia axyridis...
Saved: Harmonia_axyridis/0001.jpg
Saved: Harmonia_axyridis/0002.jpg
Saved: Harmonia_axyridis/0003.jpg
Saved: Harmonia_axyridis/0004.jpg
Saved: Harmonia_axyridis/0005.jpg
Saved: Harmonia_axyridis/0006.jpg
Saved: Harmonia_axyridis/0007.jpg
Saved: Harmonia_axyridis/0008.jpg
Saved: Harmonia_axyridis/0009.jpg
Saved: Harmonia_axyridis/0010.jpg
Saved: Harmonia_axyridis/0011.jpg
Saved: Harmonia_axyridis/0012.jpg
Saved: Harmonia_axyridis/0013.jpg
Saved: Harmonia_axyridis/0014.jpg
Saved: Harmonia_axyridis/0015.jpg
Saved: Harmonia_axyridis/0016.jpg
Saved: Harmonia_axyridis/0017.jpg
Saved: Harmonia_axyridis/0018.jpg
Saved: Harmonia_axyridis/0019.jpg
Saved: Harmonia_axyridis/0020.jpg
Saved: Harmonia_axyridis/0021.jpg
Saved: Harmonia_axyridis/0022.jpg
Saved: Harmonia_axyridis/0023.jpg
Saved: Harmonia_axyridis/0024.jpg
Saved: Harmonia_axyridis/0025.jpg
Saved: Harmonia_axyridis/0026.jpg
Saved: Harmonia_axyridis/0027.jpg
Saved: Harmonia_axyridis/0028.jpg
Saved: 

In [2]:
import os
import shutil

folder_harmonia = "Harmonia_axyridis"
folder_coccinella = "Coccinella_novemnotata"

# Create zip files
# shutil.make_archive is used instead of shelling out to `zip` so the cell
# does not depend on an external binary, and so the summary below only lists
# archives that were actually written.
created_archives = []
for folder in (folder_harmonia, folder_coccinella):
    if not os.path.isdir(folder):
        print(f"Skipping {folder}: directory not found")
        continue
    # base_dir keeps the folder name as the top-level entry inside the archive.
    shutil.make_archive(folder, "zip", root_dir=os.curdir, base_dir=folder)
    created_archives.append(f"{folder}.zip")

if created_archives:
    print(f"Created {' and '.join(created_archives)}")

Created Harmonia_axyridis.zip and Coccinella_novemnotata.zip
