# In [None]:  (notebook cell marker left over from the export)
# PRIMERO MONTAMOS NUESTRO DRIVE PARA PODER ALMACENAR LAS IMAGENES SCRAPEADAS
from google.colab import drive
drive.mount('/content/drive')
import os
import time
import requests
import hashlib
import csv
import random
from tqdm.notebook import tqdm
from math import floor


# ===================== CONFIG (adjust to taste) =====================
BASE_DIR = "/content/drive/MyDrive/PLANTS_MEDICAL"   # base folder on the mounted Drive
# Name of the dataset's root folder, created inside BASE_DIR
DATASET_NAME = "dataset_split_anidado"
OUTPUT_ROOT_DIR = os.path.join(BASE_DIR, DATASET_NAME)


# Image sources to use: 'inaturalist', 'wikimedia'
SOURCES = ["inaturalist", "wikimedia"]


MAX_PER_SPECIES = 1200  # maximum number of images saved per species
INAT_PER_PAGE = 50  # value sent as per_page to the iNaturalist API
PAUSE_BETWEEN_REQS = 0.5  # seconds slept between HTTP requests
TIMEOUT = 15  # timeout (seconds) passed to every requests call


# Split fractions (must add up to 1.0; percentages such as 70/20/10 are NOT accepted)
TRAIN_PERCENT = 0.70 # 70%
VAL_PERCENT = 0.20   # 20%
TEST_PERCENT = 0.10  # 10%


# Compare with a tolerance instead of ==: in binary floating point
# 0.70 + 0.20 + 0.10 evaluates to 0.9999999999999999, so an exact equality
# test reports a configuration error for a perfectly valid split.
if abs((TRAIN_PERCENT + VAL_PERCENT + TEST_PERCENT) - 1.0) > 1e-9:
    print("¡ERROR DE CONFIGURACIÓN! Los porcentajes de división no suman 1.0 (100%).")


# Species to download (common_name: scientific_name)
SPECIES = {
    "Eucalipto": "Eucalyptus globulus",
    "Manzanilla": "Matricaria chamomilla",
    "Puna Salvia": "Lepechinia meyenii",
    "Romero": "Rosmarinus officinalis",
    "Ruda": "Ruta graveolens",
    "Muña": "Minthostachys mollis",
    "Berro": "Nasturtium officinale",
    "Ortiga": "Urtica urens",
    "Llantén": "Plantago lanceolata",
    "Wira Wira": "Gnaphalium glandulosum",
    "Mullaka": "Muehlenbeckia volcanica",
    "Ajenjo": "Artemisia absinthium"
}


# ===================== UTILIDADES ADAPTADAS =====================
def md5_of_bytes(b):
    """Return the hexadecimal MD5 digest of the bytes-like object ``b``."""
    return hashlib.md5(b).hexdigest()


def safe_filename(name):
    """Return ``name`` with every unsafe character replaced by ``_``.

    Alphanumeric characters (as judged by ``str.isalnum``, so accented
    letters are kept) and the punctuation ``.``, ``_`` and ``-`` pass through
    unchanged; anything else, including whitespace, becomes an underscore.
    """
    allowed_punctuation = "._-"
    cleaned = []
    for ch in name:
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else "_")
    return "".join(cleaned).strip()


def ensure_dir(path):
    """Create directory ``path`` and any missing parents; no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)


def get_split_dir(common_name):
    """Pick a random split for one image and return its destination folder.

    A single value is drawn from ``random.random()`` and compared against the
    cumulative ``TRAIN_PERCENT`` and ``TRAIN_PERCENT + VAL_PERCENT``
    thresholds; a draw above both lands in "test". The folder
    ``OUTPUT_ROOT_DIR/<split>/<common_name>_<split>`` (folder name sanitised
    with ``safe_filename``) is created if it does not exist yet.

    Returns:
        A ``(directory_path, split)`` tuple, where ``split`` is one of
        "train", "val" or "test".
    """
    draw = random.random()  # uniform in [0.0, 1.0)

    # Walk the cumulative thresholds; "test" is whatever remains above them.
    chosen = "test"
    for upper_bound, label in (
        (TRAIN_PERCENT, "train"),
        (TRAIN_PERCENT + VAL_PERCENT, "val"),
    ):
        if draw < upper_bound:
            chosen = label
            break

    # Nested species folder, e.g. "Eucalipto_train" inside "train".
    species_folder = safe_filename(f"{common_name}_{chosen}")
    destination = os.path.join(OUTPUT_ROOT_DIR, chosen, species_folder)

    ensure_dir(destination)
    return destination, chosen


# ===================== PREPARAR CSV DE METADATOS =====================
# Make sure the dataset root and its three (initially empty) split folders exist.
ensure_dir(OUTPUT_ROOT_DIR)
for _split_folder in ("train", "val", "test"):
    ensure_dir(os.path.join(OUTPUT_ROOT_DIR, _split_folder))


# Metadata CSV: opened in "w" mode, so an existing file is OVERWRITTEN.
csv_path = os.path.join(OUTPUT_ROOT_DIR, "descargas_metadata_split_anidado.csv")
csv_file = open(csv_path, mode="w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
# Header row; the last column records which split each image was assigned to.
csv_writer.writerow([
    "common_name",
    "scientific_name",
    "source",
    "source_id",
    "image_url",
    "saved_path",
    "md5",
    "split",
])


# ===================== DESCARGAR DESDE iNATURALIST =====================
# La lógica interna de descarga es la misma, solo cambia cómo se llama a get_split_dir
def download_from_inaturalist(common_name, scientific_name, max_images):
    """Download research-grade iNaturalist photos for one species.

    Pages through the iNaturalist observations endpoint, downloads every
    photo of every returned observation, stores each image in a randomly
    chosen train/val/test folder (see ``get_split_dir``) and appends one row
    per saved image to the module-level ``csv_writer``.

    Args:
        common_name: Common name; used for the destination folder and the CSV.
        scientific_name: Sent as ``taxon_name`` to the API and used to build
            the file names.
        max_images: Maximum number of images to save.

    Returns:
        The number of images actually saved (0..max_images).
    """
    count = 0
    page = 1
    base_api = "https://api.inaturalist.org/v1/observations"
    params = {
        "taxon_name": scientific_name,
        "quality_grade": "research",
        # requests would serialise the Python bool True as "True"; send the
        # lowercase literal that the API documents for boolean parameters.
        "photos": "true",
        "per_page": INAT_PER_PAGE,
        "page": page
    }

    while count < max_images:
        params["page"] = page
        try:
            r = requests.get(base_api, params=params, timeout=TIMEOUT)
            # Surface HTTP errors (rate limiting, 5xx) instead of treating an
            # error payload as an empty result page.
            r.raise_for_status()
            data = r.json()
        except Exception as e:
            print("Error iNaturalist API:", e)
            break

        results = data.get("results", [])
        if not results:
            break

        for obs in results:
            # "or []" also covers an explicit null value for "photos".
            photos = obs.get("photos") or []
            for ph in photos:
                if count >= max_images:
                    break

                url = ph.get("url")
                if not url:
                    continue
                # The API returns the "square" thumbnail URL; ask for the large rendition.
                url = url.replace("square", "large")
                try:
                    resp = requests.get(url, timeout=TIMEOUT)
                    if resp.status_code == 200:
                        b = resp.content
                        h = md5_of_bytes(b)

                        # Destination folder for the randomly assigned split
                        target_dir, split_name = get_split_dir(common_name)

                        # Save the image; count keeps file names unique per species
                        fname = safe_filename(f"{scientific_name.replace(' ','_')}_inat_{count}.jpg")
                        outp = os.path.join(target_dir, fname)
                        with open(outp, "wb") as fh:
                            fh.write(b)
                        # Record the metadata row
                        csv_writer.writerow([common_name, scientific_name, "iNaturalist", obs.get("id"), url, outp, h, split_name])
                        count += 1
                    else:
                        print("Error iNaturalist image: HTTP", resp.status_code, url)
                except Exception as e:
                    # Best effort: report the failure and move on to the next photo.
                    print("Error iNaturalist image:", url, e)
                # Pause after every attempt, failed ones included, so errors
                # never turn into a burst of back-to-back requests.
                time.sleep(PAUSE_BETWEEN_REQS)
            if count >= max_images:
                break

        # Pagination: stop once the pages requested so far cover total_results
        total_results = data.get("total_results", 0)
        if (page * INAT_PER_PAGE) >= total_results:
            break
        page += 1
        time.sleep(PAUSE_BETWEEN_REQS)
    return count


# ===================== DESCARGAR DESDE WIKIMEDIA COMMONS =====================
# La lógica interna de descarga es la misma, solo cambia cómo se llama a get_split_dir
def download_from_wikimedia(common_name, scientific_name, max_images, start_count):
    """Download Wikimedia Commons images for one species.

    Runs a single Commons search for ``scientific_name``, downloads the
    800px thumbnail (or the original file when no thumbnail URL is given) of
    each result, stores it in a randomly chosen train/val/test folder (see
    ``get_split_dir``) and appends one row per saved image to the
    module-level ``csv_writer``.

    Args:
        common_name: Common name; used for the destination folder and the CSV.
        scientific_name: Search text and base of the file names.
        max_images: Upper bound for the running image counter (not for the
            number downloaded by this call).
        start_count: Initial value of the running counter, i.e. the number of
            images already saved for this species; it also offsets the file
            numbering.

    Returns:
        The number of images saved by this call. Returns 0 if the search
        request itself fails.
    """
    count = start_count
    search_api = "https://commons.wikimedia.org/w/api.php"
    params_search = {
        "action": "query",
        "format": "json",
        "generator": "search",
        "gsrsearch": scientific_name,
        # Search the File namespace (6). The API default is the main
        # namespace, whose pages carry no imageinfo and would all be skipped.
        "gsrnamespace": 6,
        "gsrlimit": 50,
        "prop": "imageinfo",
        "iiprop": "url",
        "iiurlwidth": 800
    }
    # NOTE(review): only this first batch of up to 50 results is requested;
    # API continuation is not followed.
    # NOTE(review): Wikimedia asks API clients for a descriptive User-Agent;
    # the requests default is sent here - confirm it is not being rejected.

    # One session for the search and all image downloads (connection reuse),
    # closed automatically when the block exits.
    with requests.Session() as session:
        try:
            r = session.get(search_api, params=params_search, timeout=TIMEOUT)
            r.raise_for_status()
            data = r.json()
            pages = data.get("query", {}).get("pages", {})
        except Exception as e:
            print("Wikimedia search error:", e)
            return 0

        for pid, page in pages.items():
            if count >= max_images:
                break

            imageinfo = page.get("imageinfo")
            if not imageinfo:
                continue
            for info in imageinfo:
                if count >= max_images:
                    break

                # Prefer the scaled thumbnail; fall back to the original file URL.
                url = info.get("thumburl") or info.get("url")
                if not url:
                    continue
                try:
                    resp = session.get(url, timeout=TIMEOUT)
                    if resp.status_code == 200:
                        b = resp.content
                        h = md5_of_bytes(b)

                        # Destination folder for the randomly assigned split
                        target_dir, split_name = get_split_dir(common_name)

                        # Save the image; count keeps file names unique per species
                        fname = safe_filename(f"{scientific_name.replace(' ','_')}_wm_{count}.jpg")
                        outp = os.path.join(target_dir, fname)
                        with open(outp, "wb") as fh:
                            fh.write(b)
                        # Record the metadata row
                        csv_writer.writerow([common_name, scientific_name, "Wikimedia", pid, url, outp, h, split_name])
                        count += 1
                    else:
                        print("Wikimedia image error: HTTP", resp.status_code, url)
                except Exception as e:
                    # Best effort: report the failure and move on to the next image.
                    print("Wikimedia image error:", url, e)
                # Pause after every attempt, failed ones included.
                time.sleep(PAUSE_BETWEEN_REQS)

    return count - start_count


# ===================== MAIN: ejecutar descargas por especie =====================
# (common_name, scientific_name, images_obtained) per processed species
summary = []


# Seed the RNG so the sequence of random split draws is repeatable
random.seed(42)


try:
    for common, sci in SPECIES.items():
        print("\n==============================")
        print(f"Procesando: {common}  ({sci})")
        print("=============================")


        got = 0
        # iNaturalist takes priority
        if "inaturalist" in SOURCES:
            need = MAX_PER_SPECIES - got
            if need > 0:
                n = download_from_inaturalist(common, sci, need)
                got += n
                print(f"  iNaturalist: +{n}")


        # Fall back to Wikimedia when images are still missing. The function
        # takes the overall cap plus the running count (not the remainder).
        if got < MAX_PER_SPECIES and "wikimedia" in SOURCES:
            n = download_from_wikimedia(common, sci, MAX_PER_SPECIES, got)
            got += n
            print(f"  Wikimedia: +{n}")


        summary.append((common, sci, got))
        print(f"Total obtenidas para {common}: {got}/{MAX_PER_SPECIES}")
finally:
    # Always close (and thereby flush) the metadata CSV, even when a download
    # raises or the run is interrupted, so rows already written are not lost.
    csv_file.close()


print("\n====== RESUMEN FINAL ======")
for s in summary:
    print(f"- {s[0]} ({s[1]}): {s[2]} imágenes")


print("\nMetadatos guardados en:", csv_path)
print("Carpeta destino:", OUTPUT_ROOT_DIR)
