In [1]:
import pandas as pd
from pathlib import Path
import requests
import time
from bs4 import BeautifulSoup
import os
import random

# Plant list

In [2]:
# Load the master plant list; index_col=0 makes the first CSV column the
# DataFrame index instead of a data column.
df = pd.read_csv('master_plant_list.csv', index_col=0)
# Last expression of the cell: display (rows, columns) as a quick sanity check.
df.shape

(1135, 2)

In [3]:
# Preview the first rows of the freshly loaded plant list.
df.head()

Unnamed: 0,Family,Species
0,ACANTHACEAE,Carlowrightia arizonica
1,ACANTHACEAE,Justicia californica
2,AIZOACEAE,Mesembryanthemum nodiflorum
3,AIZOACEAE,Trianthema portulacastrum
4,AMARANTHACEAE,Amaranthus crassipes


In [4]:
# Remove every row that has a missing value in any column ...
df = df.dropna()
# ... then renumber the rows 0..n-1, discarding the old index labels.
df = df.reset_index(drop=True)
df.shape

(1133, 2)

In [5]:
# Count occurrences of each species name; any count above 1 is a duplicate entry.
df['Species'].value_counts()

Species
Salix exigua                   1
Carlowrightia arizonica        1
Justicia californica           1
Mesembryanthemum nodiflorum    1
Trianthema portulacastrum      1
                              ..
Allium macropetalum            1
Tidestromia lanuginosa         1
Suaeda nigra                   1
Salsola tragus                 1
Monolepis nuttalliana          1
Name: count, Length: 1133, dtype: int64

# Scraping

## SEINet

In [6]:
def get_seinet_tabs(genus, species, pause=1.5, retries=2):
    """
    Scrape the description tabs of a SEINet taxon page.

    Parameters
    ----------
    genus, species : str
        The two parts of the taxon name. They are joined with a space and
        sent as the ``taxon`` query parameter.
    pause : float
        Not used by this function (the delays below are fixed); kept in the
        signature so existing calls that pass it keep working.
    retries : int
        Number of additional attempts after a failed request. Negative
        values are treated as 0 so that at least one request is made.

    Returns
    -------
    dict with keys:
        - tabs: {tab_label: text}
        - status: 'ok' | 'not_found' | 'http_error' | 'timeout'
                  | 'request_error' | 'parse_error'

    Status meanings:
        - 'timeout' / 'http_error' / 'request_error': the last attempt
          raised the corresponding ``requests`` exception.
        - 'not_found': the page has no ``<div id="desctabs">``.
        - 'parse_error': that div exists but no ``div.sptab`` was found.
        - 'ok': at least one tab was extracted.
    """

    url = "https://swbiodiversity.org/seinet/taxa/index.php"

    # Let requests build the query string so the name is properly encoded
    # (a space becomes '+', matching the previously hand-built URL, and any
    # other reserved characters in the name are escaped as well).
    params = {"taxon": f"{genus} {species}"}

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Referer": "https://swbiodiversity.org/seinet/index.php"
    }

    # Guard against a negative value: the loop must run at least once,
    # otherwise `r` would be unbound when parsing starts.
    retries = max(0, retries)

    for attempt in range(retries + 1):
        try:
            r = requests.get(url, params=params, headers=headers, timeout=30)
            r.raise_for_status()
            break
        # Timeout and HTTPError are subclasses of RequestException, so the
        # specific handlers must come before the generic one.
        except requests.Timeout:
            status = "timeout"
        except requests.HTTPError:
            status = "http_error"
        except requests.RequestException:
            status = "request_error"

        # Out of attempts: report the failure kind of the last attempt.
        if attempt == retries:
            return {"tabs": {}, "status": status}

        time.sleep(2)

    soup = BeautifulSoup(r.text, "html.parser")

    desctabs = soup.find("div", id="desctabs")
    if desctabs is None:
        # Note: this path returns immediately, without the delay applied below.
        return {"tabs": {}, "status": "not_found"}

    # --- Map tab IDs to labels ---
    tab_labels = {}
    nav = desctabs.find("ul", class_="ui-tabs-nav")

    if nav:
        for li in nav.find_all("li"):
            a = li.find("a")
            if a is None:
                # An <li> without a link carries no tab id; skip it rather
                # than crash on None.get(...).
                continue
            href = a.get("href", "")
            if href.startswith("#"):
                tab_labels[href.lstrip("#")] = a.get_text(strip=True)

    tabs_data = {}

    for tab_div in desctabs.find_all("div", class_="sptab"):
        tab_id = tab_div.get("id")
        # Fall back to the raw div id when the nav list has no label for it.
        label = tab_labels.get(tab_id, tab_id)
        tabs_data[label] = tab_div.get_text(" ", strip=True)

    # Randomised delay after a parsed page, before returning to the caller.
    time.sleep(random.uniform(2, 4))

    return {
        "tabs": tabs_data,
        "status": "ok" if tabs_data else "parse_error"
    }

In [7]:
# Split each binomial on spaces into separate columns, name the first two
# Genus/Species, and discard entries identified only to genus level ('sp.').
plant_list = (
    df['Species']
    .str.split(' ', expand=True)
    .rename(columns={0: 'Genus', 1: 'Species'})
    .loc[lambda names: names['Species'] != 'sp.']
    .reset_index(drop=True)
)
plant_list.shape

(1128, 2)

In [8]:
# Load previously saved SEINet scrape results from disk.
# NOTE(review): no visible cell produces this file by calling
# get_seinet_tabs — confirm where it is generated.
seinet_results = pd.read_csv('seinet_results.csv')

In [9]:
# Tally how many rows ended in each scrape status.
seinet_results['seinet_status'].value_counts()

seinet_status
ok    1057
Name: count, dtype: int64

In [10]:
# Keep only the rows whose scrape succeeded, then renumber them from 0.
scrape_succeeded = seinet_results['seinet_status'].eq('ok')
seinet_results = seinet_results.loc[scrape_succeeded].reset_index(drop=True)

In [11]:
# Write without the positional index: this file is loaded elsewhere in the
# notebook with a plain pd.read_csv (no index_col), so a saved index would
# come back as an extra "Unnamed: 0" column and pile up on every
# load/save round trip.
seinet_results.to_csv('seinet_results.csv', index=False)