This notebook is a tool that scrapes images from the lichen portal in order to generate training data.



In [None]:
# SETUP
# Safety interlocks: the next cell raises RuntimeError unless BOTH flags are
# set to True, so the scraper cannot be run by accident.
I_KNOW_WHAT_I_AM_DOING = False
I_KNOW_THIS_IS_AN_INTERNAL_TOOL = False

# CSV file listing the occurrences to scrape; it is read with ISO-8859-1
# encoding and must contain an 'id' column.
csv_path = '../../data/nov_4_24foliicolousverification.csv'
# Maximum number of not-yet-downloaded IDs selected per run. Each selected ID
# is fetched from lichenportal.org by the scraping cell.
num_rows_to_import = 10 # Do not set this very high please
# NOTE: despite the name, this shuffles the ROWS of the CSV (not the columns),
# so each run picks a random subset of IDs.
randomize_columns = True # highly recommended

In [None]:
import pandas as pd
import os

# Safety interlock: refuse to run unless BOTH acknowledgement flags from the
# setup cell were explicitly switched to True.
if not I_KNOW_THIS_IS_AN_INTERNAL_TOOL or not I_KNOW_WHAT_I_AM_DOING:
    raise RuntimeError("i stop now")

# Step 1: fetch ID

# Load the CSV into a DataFrame
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

if randomize_columns:
    # Shuffle all rows in the DataFrame (the flag name says "columns", but it
    # is the row order that is randomized).
    df = df.sample(frac=1).reset_index(drop=True)

# Get list of IDs
if 'id' not in df.columns:
    raise ValueError("'id' column not found.")
id_series = df['id'].dropna()
# A numeric column that contains blanks is parsed as float64, which would
# stringify 123 as "123.0" and break both the file name and the occid URL
# parameter. Convert whole-number floats back to integers first.
if pd.api.types.is_float_dtype(id_series) and (id_series % 1 == 0).all():
    id_series = id_series.astype('int64')
all_ids = id_series.astype(str).tolist()

# Filter out IDs that already have image files, stopping once
# `num_rows_to_import` IDs have been collected.
pending_ids = []
seen_ids = set()
for id_val in all_ids:
    # Check the limit BEFORE appending so that a limit of 0 (or a negative
    # value) selects nothing instead of one ID.
    if len(pending_ids) >= num_rows_to_import:
        break

    # Skip duplicate rows: the image does not exist yet on the first pass, so
    # a repeated ID would otherwise be queued and downloaded twice.
    if id_val in seen_ids:
        continue
    seen_ids.add(id_val)

    image_path = os.path.join('../../data/img', f"{id_val}.jpg")
    if not os.path.exists(image_path):
        pending_ids.append(id_val)


# `pending_ids` is the work list consumed by the scraping loop in the next cell
print("IDs to process:", pending_ids)


In [None]:
import requests
from bs4 import BeautifulSoup

# Imported here because this cell is the only user of urljoin.
from urllib.parse import urljoin

# Seconds to wait on the server before giving up. requests has no default
# timeout, so without this a stalled connection would hang the notebook.
request_timeout = 30

# For every pending occurrence ID: fetch its portal page, locate the thumbnail
# link, download the linked image and store it as ../../data/img/<occid>.jpg.
# Failures are reported and the loop moves on to the next ID.
for occid in pending_ids:
    try:
        url = f"https://lichenportal.org/portal/collections/individual/index.php?occid={occid}"

        # Step 3: Request the page
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()  # Raise an error if request failed

        # Step 4: Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        thumb_div = soup.find(id='thumbnail-div')
        link = thumb_div.a if thumb_div else None
        # .get() instead of ['href']: an anchor without an href is reported as
        # "not found" rather than surfacing as a KeyError.
        href = link.get('href') if link else None

        if not href:
            print("Image URL not found for id " + occid)
            continue

        # urljoin leaves absolute URLs untouched and resolves root-relative,
        # path-relative and protocol-relative hrefs against the page URL.
        absolute_url = urljoin(url, href)
        print("Image URL:", absolute_url)

        # Download the image; check the status so an HTTP error body is never
        # written to disk as if it were a .jpg.
        img_response = requests.get(absolute_url, timeout=request_timeout)
        img_response.raise_for_status()

        # save the image
        with open(f"../../data/img/{occid}.jpg", 'wb') as f:
            f.write(img_response.content)

        print(f"Saved image as {occid}.jpg")

    except Exception as e:
        # Deliberately broad: this is a best-effort batch, so one bad ID
        # (network, parsing or disk error) must not abort the remaining ones.
        print(f"[{occid}] Error fetching page: {e}")

