## Crawl ảnh

In [1]:
import requests
import urllib.parse
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tabulate import tabulate  # pip install tabulate

# ---- 1. Load the list of scientific names to verify from the CSV ----
csv_path = "flower.csv"
df_names = pd.read_csv(csv_path, encoding="utf-8")

# The species names are stored in the "ten_khoa_hoc" column.
name_column = df_names["ten_khoa_hoc"]
scientific_names = name_column.tolist()

# Report how many species will be checked.
species_count = len(scientific_names)
print("T·ªïng s·ªë lo√†i c·∫ßn ki·ªÉm tra:", species_count)

# ---- 2. H√†m ki·ªÉm tra tr√™n iNaturalist v·ªõi retry ----
def check_inaturalist(scientific_name, max_retries=3):
    """Look up a species name through the iNaturalist taxa search endpoint.

    The request is attempted up to ``max_retries`` times, pausing one second
    between attempts (but not after the last one, where the pause would only
    delay the error result).

    Args:
        scientific_name: Name to search for; it is URL-quoted before use.
        max_retries: Maximum number of attempts. With a value <= 0 no request
            is made and the error tuple is returned immediately.

    Returns:
        A 4-tuple ``(scientific_name, exists, matched_name, error)``:

        * results found  -> ``(name, True, <"name" of first result>, "")``
        * no results     -> ``(name, False, "", "")``
        * all attempts failed -> ``(name, "<error label>: <last error>", "",
          <last error>)``; note that ``exists`` is a string in this case.
    """
    # NOTE(review): `q=` is a search query, so the first result is presumably
    # the best match rather than a guaranteed exact match -- compare
    # `matched_name` with the input if exact matching matters.
    last_error = ""

    for attempt in range(max_retries):
        if attempt:
            # Back off only *between* attempts.
            time.sleep(1)

        try:
            name_encoded = urllib.parse.quote(scientific_name)
            url = f"https://api.inaturalist.org/v1/taxa?q={name_encoded}&rank=species"

            response = requests.get(url, timeout=10)

            if response.status_code != 200:
                last_error = f"HTTP {response.status_code}"
                continue

            try:
                data = response.json()
            except ValueError as e:
                last_error = f"JSONDecodeError: {e}"
                continue

            results = data.get("results", [])
            if not results:
                return scientific_name, False, "", ""

            matched_name = results[0].get("name", "")
            return scientific_name, True, matched_name, ""

        except Exception as e:
            # Best-effort lookup: any failure (network error, unexpected
            # payload shape, ...) is recorded and the request is retried.
            last_error = str(e)

    return scientific_name, f"L·ªói: {last_error}", "", last_error

# ---- 3. Run the lookups through a thread pool ----
# Column headers, in the same order as the tuple returned by check_inaturalist.
result_columns = (
    "T√™n khoa h·ªçc",
    "T·ªìn t·∫°i tr√™n iNaturalist",
    "T√™n kh·ªõp",
    "L·ªói",
)

max_threads = 1
with ThreadPoolExecutor(max_workers=max_threads) as executor:
    future_to_name = {
        executor.submit(check_inaturalist, name): name
        for name in scientific_names
    }

    # Rows are collected in completion order.
    results = [
        dict(zip(result_columns, future.result()))
        for future in as_completed(future_to_name)
    ]

# ---- 4. Convert to a DataFrame and print it as a table ----
df_result = pd.DataFrame(results)

# Render the results on the console as a grid table.
print("\n=== K·∫øt qu·∫£ ki·ªÉm tra iNaturalist ===")
table_text = tabulate(df_result, headers='keys', tablefmt='grid', showindex=False)
print(table_text)

T·ªïng s·ªë lo√†i c·∫ßn ki·ªÉm tra: 113

=== K·∫øt qu·∫£ ki·ªÉm tra iNaturalist ===
+---------------------------+----------------------------+---------------------------+-------+
| T√™n khoa h·ªçc              | T·ªìn t·∫°i tr√™n iNaturalist   | T√™n kh·ªõp                  | L·ªói   |
| Aconitum napellus         | True                       | Aconitum napellus         |       |
+---------------------------+----------------------------+---------------------------+-------+
| Adenium obesum            | True                       | Adenium obesum            |       |
+---------------------------+----------------------------+---------------------------+-------+
| Alpinia purpurata         | True                       | Alpinia purpurata         |       |
+---------------------------+----------------------------+---------------------------+-------+
| Alstroemeria aurea        | True                       | Alstroemeria aurea        |       |
+---------------------------+-------------------

In [None]:
import os
import requests
import pandas as pd
from time import sleep
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Crawl settings: per-species image quota, input CSV and output root folder.
MAX_IMAGES = 300
csv_path = "1.csv"
root_folder = "Vietnam_flower"

# Make sure the output root exists, then load the species table.
os.makedirs(root_folder, exist_ok=True)
df = pd.read_csv(csv_path)

In [None]:
def download_image(photo_url, file_path):
    """Download one image and save it to ``file_path``.

    The response status is checked before anything is written, so an HTTP
    error page is never stored as an image. The bytes are first written to a
    sibling ``<file_path>.part`` file and then moved into place, so a failed
    or interrupted write does not leave a truncated file at ``file_path``.

    Args:
        photo_url: URL of the image to fetch.
        file_path: Destination path of the image file.

    Returns:
        True if the image was fetched and saved, False on any failure (the
        error is printed, not raised).
    """
    tmp_path = f"{file_path}.part"
    try:
        response = requests.get(photo_url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of saving their body.
        response.raise_for_status()

        with open(tmp_path, "wb") as f:
            f.write(response.content)
        # Publish the file only once it has been written completely.
        os.replace(tmp_path, file_path)
        return True
    except Exception as e:
        # Best-effort download: report the problem and let the caller go on.
        print(f"  L·ªói t·∫£i ·∫£nh: {e}")
        try:
            # Drop a leftover partial file, if one was created.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        return False

In [None]:
# Crawl photos for every species listed in the CSV, up to MAX_IMAGES per
# species, storing them under <root_folder>/<species with "_" for spaces>/.
# Rows with a missing name are skipped (they would crash on `.replace`).
for species in df["ten_khoa_hoc"].dropna():
    species_folder = os.path.join(root_folder, species.replace(" ", "_"))
    os.makedirs(species_folder, exist_ok=True)
    print(f"\nƒêang crawl ·∫£nh cho lo√†i: {species} ...")

    page = 1
    # Files already in the folder count toward the quota, so a rerun resumes
    # instead of starting over.
    total_images = len(os.listdir(species_folder))

    while total_images < MAX_IMAGES:
        url = (
            "https://api.inaturalist.org/v1/observations"
            f"?taxon_name={quote(species)}"
            "&per_page=200"
            f"&page={page}"
        )
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            observations = r.json().get("results", [])
            if not observations:
                break  # no more observations for this species

            # Select the photos of this page that are not on disk yet, never
            # more than what is still missing to reach MAX_IMAGES.
            remaining = MAX_IMAGES - total_images
            pending = []
            for obs in observations:
                if len(pending) >= remaining:
                    break
                for idx, p in enumerate(obs.get("photos", [])):
                    if len(pending) >= remaining:
                        break

                    photo_url = p.get("url")
                    if not photo_url:
                        continue
                    # Swap the "square" size token in the URL for "original".
                    photo_url = photo_url.replace("square", "original")

                    file_name = f"{obs['id']}_{idx}.jpg"
                    file_path = os.path.join(species_folder, file_name)

                    if not os.path.exists(file_path):
                        pending.append((photo_url, file_path))

            # Download the selected photos in parallel and wait for all of
            # them before moving on to the next page.
            with ThreadPoolExecutor(max_workers=8) as executor:
                download_jobs = [
                    executor.submit(download_image, photo_url, file_path)
                    for photo_url, file_path in pending
                ]
                # Count only downloads that report success, so failed ones do
                # not use up the quota.
                total_images += sum(1 for job in download_jobs if job.result())

            print(f"  ‚úÖ Trang {page}: t·ªïng {total_images} ·∫£nh")
            page += 1
            sleep(1)  # pause between page requests

        except Exception as e:
            # Any failure on a page abandons the current species.
            print(f"‚ùå L·ªói v·ªõi lo√†i {species}: {e}")
            break

    print(f"üéâ Ho√†n t·∫•t {species}. T·ªïng s·ªë ·∫£nh: {total_images}")

In [None]:
# Crawl photos for every species listed in the CSV, up to MAX_IMAGES per
# species, storing them under <root_folder>/<species with "_" for spaces>/.
# NOTE(review): this cell appears to repeat the crawl loop of the preceding
# cell -- consider keeping only one of them.
# Rows with a missing name are skipped (they would crash on `.replace`).
for species in df["ten_khoa_hoc"].dropna():
    species_folder = os.path.join(root_folder, species.replace(" ", "_"))
    os.makedirs(species_folder, exist_ok=True)
    print(f"\nƒêang crawl ·∫£nh cho lo√†i: {species} ...")

    page = 1
    # Files already in the folder count toward the quota, so a rerun resumes
    # instead of starting over.
    total_images = len(os.listdir(species_folder))

    while total_images < MAX_IMAGES:
        url = (
            "https://api.inaturalist.org/v1/observations"
            f"?taxon_name={quote(species)}"
            "&per_page=200"
            f"&page={page}"
        )
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            observations = r.json().get("results", [])
            if not observations:
                break  # no more observations for this species

            # Select the photos of this page that are not on disk yet, never
            # more than what is still missing to reach MAX_IMAGES.
            remaining = MAX_IMAGES - total_images
            pending = []
            for obs in observations:
                if len(pending) >= remaining:
                    break
                for idx, p in enumerate(obs.get("photos", [])):
                    if len(pending) >= remaining:
                        break

                    photo_url = p.get("url")
                    if not photo_url:
                        continue
                    # Swap the "square" size token in the URL for "original".
                    photo_url = photo_url.replace("square", "original")

                    file_name = f"{obs['id']}_{idx}.jpg"
                    file_path = os.path.join(species_folder, file_name)

                    if not os.path.exists(file_path):
                        pending.append((photo_url, file_path))

            # Download the selected photos in parallel and wait for all of
            # them before moving on to the next page.
            with ThreadPoolExecutor(max_workers=8) as executor:
                download_jobs = [
                    executor.submit(download_image, photo_url, file_path)
                    for photo_url, file_path in pending
                ]
                # Count only downloads that report success, so failed ones do
                # not use up the quota.
                total_images += sum(1 for job in download_jobs if job.result())

            print(f"  ‚úÖ Trang {page}: t·ªïng {total_images} ·∫£nh")
            page += 1
            sleep(1)  # pause between page requests

        except Exception as e:
            # Any failure on a page abandons the current species.
            print(f"‚ùå L·ªói v·ªõi lo√†i {species}: {e}")
            break

    print(f"üéâ Ho√†n t·∫•t {species}. T·ªïng s·ªë ·∫£nh: {total_images}")