In [26]:
import pandas as pd
from pathlib import Path
import requests
import time
from bs4 import BeautifulSoup

# Plant list

In [2]:
# Load the master plant list; the CSV's first column is used as the row index.
df = pd.read_csv('master_plant_list.csv', index_col=0)
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(1135, 2)

In [3]:
# Preview the first rows of the master list (Family / Species columns).
df.head()

Unnamed: 0,Family,Species
0,ACANTHACEAE,Carlowrightia arizonica
1,ACANTHACEAE,Justicia californica
2,AIZOACEAE,Mesembryanthemum nodiflorum
3,AIZOACEAE,Trianthema portulacastrum
4,AMARANTHACEAE,Amaranthus crassipes


In [4]:
# Relative path to the plant_lists folder (resolved against the current working directory).
out_dir = Path("plant_lists")

In [5]:
# Build the path from `out_dir` instead of a user-specific absolute Windows path,
# so the notebook runs on any machine.
# NOTE(review): assumes the notebook is run from the project folder that contains
# plant_lists/ (the same assumption the relative 'master_plant_list.csv' load makes).
plantlist = out_dir / "plantlst.txt"
# Comma is read_csv's default separator, so it does not need to be passed.
plants = pd.read_csv(plantlist)
plants.shape

(93157, 5)

In [6]:
# Preview the first rows of the raw table before it is reduced to Symbol / Species.
plants.head()

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae
1,ABAB,ABAM5,Abutilon americanum (L.) Sweet,,
2,ABAB,ABJA,Abutilon jacquinii G. Don,,
3,ABAB,ABLI,Abutilon lignosum (Cav.) G. Don,,
4,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae


In [7]:
# Reduce "Scientific Name with Author" to its first two space-separated tokens
# (the binomial). n=2 caps the split at three pieces, so pandas does not build
# one column per token of the longest name when only the first two are used.
names = plants["Scientific Name with Author"].str.split(' ', n=2, expand=True)
names = names[0] + ' ' + names[1]

# Keep only Symbol + Species and collapse exact duplicate pairs.
# Chained (no inplace=True), so the frame loaded above is not mutated.
# Note: `plants` is rebound here, so re-running this cell requires re-running
# the load cell first (the dropped columns no longer exist afterwards).
plants = (
    plants
    .assign(Species=names)
    .drop(columns=['Scientific Name with Author', 'Synonym Symbol', 'Family', 'Common Name'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# Summarize columns, non-null counts and dtypes after the cleanup above.
plants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82269 entries, 0 to 82268
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Symbol   82269 non-null  object
 1   Species  82269 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [9]:
# Drop rows containing any missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
# Show the shape after the drop to see how many rows were removed.
df.shape

(1133, 2)

In [10]:
# Count occurrences of each species name; any count above 1 would indicate a duplicate entry.
df.Species.value_counts()

Species
Salix exigua                   1
Carlowrightia arizonica        1
Justicia californica           1
Mesembryanthemum nodiflorum    1
Trianthema portulacastrum      1
                              ..
Allium macropetalum            1
Tidestromia lanuginosa         1
Suaeda nigra                   1
Salsola tragus                 1
Monolepis nuttalliana          1
Name: count, Length: 1133, dtype: int64

In [11]:
# Most frequent Species strings in `plants`. Because `plants` was de-duplicated on
# (Symbol, Species), a count above 1 means one binomial maps to several Symbol values.
plants.Species.value_counts().head()

Species
Astragalus lentiginosus    37
Eriogonum umbellatum       33
Ericameria nauseosa        25
Chrysothamnus nauseosus    25
Lepidium montanum          22
Name: count, dtype: int64

In [12]:
# Spot-check the Symbol row(s) recorded for a single species.
plants[plants.Species == 'Carnegiea gigantea']

Unnamed: 0,Symbol,Species
12464,CAGI10,Carnegiea gigantea


In [13]:
# Left-join the Symbol column onto the master list by species name.
# A species with several symbols in `plants` yields one output row per symbol;
# a species with no match keeps its row with an empty Symbol.
df.merge(plants, on='Species', how='left')

Unnamed: 0,Family,Species,Symbol
0,ACANTHACEAE,Carlowrightia arizonica,CAAR7
1,ACANTHACEAE,Justicia californica,JUCA8
2,AIZOACEAE,Mesembryanthemum nodiflorum,MENO2
3,AIZOACEAE,Trianthema portulacastrum,TRPO2
4,AMARANTHACEAE,Amaranthus crassipes,AMCR
...,...,...,...
2153,PTERIDACEAE,Cheilanthes wootoni,
2154,SALICACEAE,Salix exigua,SAEX
2155,SALICACEAE,Salix exigua,SAIN3
2156,SALICACEAE,Salix exigua,SAME2


# Scraping

## SEINet

In [52]:
def get_seinet_tabs(genus, species, pause=1.5, retries=2):
    """
    Scrape the description tabs of a SEINet taxon page.

    Parameters
    ----------
    genus, species : str
        Parts of the name; joined with a space to form the ``taxon`` query value.
    pause : float, default 1.5
        Seconds to sleep after every successful HTTP response (including pages
        that turn out to have no description tabs), so consecutive calls are
        always spaced out.
    retries : int, default 2
        Number of additional attempts after a failed request. Negative values
        are treated as 0 (a single attempt).

    Returns
    -------
    dict with keys:
        - tabs: {tab_label: text}
        - status: 'ok' | 'not_found' | 'http_error' | 'timeout'
                  | 'request_error' | 'parse_error'

        'not_found' means the page has no ``div#desctabs``; 'parse_error' means
        that container exists but no ``div.sptab`` elements were found in it.
    """

    base_url = "https://swbiodiversity.org/seinet/taxa/index.php"
    # Let requests build and encode the query string; a space is encoded as
    # '+', so ordinary names produce the same URL as "taxon=Genus+species".
    params = {"taxon": f"{genus} {species}"}

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Referer": "https://swbiodiversity.org/seinet/index.php"
    }

    # Always make at least one attempt so `response` cannot be left unbound.
    attempts = max(int(retries), 0) + 1
    response = None
    status = "request_error"

    for attempt in range(attempts):
        try:
            response = requests.get(
                base_url, params=params, headers=headers, timeout=30
            )
            response.raise_for_status()
            break
        except requests.Timeout:
            status = "timeout"
        except requests.HTTPError:
            status = "http_error"
        except requests.RequestException:
            status = "request_error"

        # Out of attempts: report the last failure category.
        if attempt == attempts - 1:
            return {"tabs": {}, "status": status}

        # Fixed back-off before the next attempt.
        time.sleep(2)

    # Rate-limit before any return below, so pages without tabs are throttled too.
    time.sleep(pause)

    soup = BeautifulSoup(response.text, "html.parser")

    desctabs = soup.find("div", id="desctabs")
    if desctabs is None:
        return {"tabs": {}, "status": "not_found"}

    # --- Map tab IDs to labels ---
    tab_labels = {}
    nav = desctabs.find("ul", class_="ui-tabs-nav")

    if nav is not None:
        for li in nav.find_all("li"):
            a = li.find("a")
            if a is None:
                # A list item without a link carries no tab id/label; skip it.
                continue
            href = a.get("href", "")
            if href.startswith("#"):
                tab_labels[href.lstrip("#")] = a.get_text(strip=True)

    tabs_data = {}

    for tab_div in desctabs.find_all("div", class_="sptab"):
        tab_id = tab_div.get("id")
        # Fall back to the raw element id when the nav list has no label for it.
        label = tab_labels.get(tab_id, tab_id)
        tabs_data[label] = tab_div.get_text(" ", strip=True)

    return {
        "tabs": tabs_data,
        "status": "ok" if tabs_data else "parse_error"
    }

In [48]:
# Split each name in df.Species on spaces and label the first two pieces
# as Genus and Species (any further pieces keep their integer column names).
plant_list = (
    df.Species
    .str.split(' ', expand=True)
    .rename(columns={0: 'Genus', 1: 'Species'})
)

In [56]:
# Tab labels that become columns in the result; a tab missing from a page is stored as None.
TAB_COLUMNS = ["FNA", "VPAP", "SW Field Guide"]

rows = []

# Only the first five species are scraped here.
for genus, species in plant_list[:5][["Genus", "Species"]].itertuples(index=False, name=None):
    result = get_seinet_tabs(genus, species, pause=2)
    tabs = result["tabs"]

    # One record per species: identifiers, scrape status, then one entry per wanted tab.
    rows.append({
        "Genus": genus,
        "Species": species,
        "seinet_status": result["status"],
        **{tab: tabs.get(tab) for tab in TAB_COLUMNS},
    })

In [57]:
# Assemble the per-species dicts collected in `rows` into one DataFrame (one row per species).
seinet_df = pd.DataFrame(rows)

In [58]:
# Display the scraped table: status plus the text of each requested tab.
seinet_df

Unnamed: 0,Genus,Species,seinet_status,FNA,VPAP,SW Field Guide
0,Carlowrightia,arizonica,ok,,,"Wiggins 1964, Daniel 1984, Kearney and Peebles..."
1,Justicia,californica,ok,,,"Benson and Darrow 1981, Hickman 1993, Powell 1..."
2,Mesembryanthemum,nodiflorum,ok,"Nancy J. Vivrette, John E. Bleck & Wayne R. Fe...",,FNA 2004 Duration : Annual Nativity : Non-Nati...
3,Trianthema,portulacastrum,ok,Wayne R. Ferren Jr. in Flora of North America ...,,"Kearney and Peebles 1969, FNA 2004, Correll an..."
4,Amaranthus,crassipes,ok,Sergei L. Mosyakin & Kenneth R. Robertson in F...,,
