In [26]:
import pandas as pd
from pathlib import Path
import requests
import time
from bs4 import BeautifulSoup

# Plant list

In [2]:
# Load the master plant list; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('master_plant_list.csv', index_col=0)
# Show (rows, columns) as a quick check that the load worked.
df.shape

(1135, 2)

In [3]:
df.head()

Unnamed: 0,Family,Species
0,ACANTHACEAE,Carlowrightia arizonica
1,ACANTHACEAE,Justicia californica
2,AIZOACEAE,Mesembryanthemum nodiflorum
3,AIZOACEAE,Trianthema portulacastrum
4,AMARANTHACEAE,Amaranthus crassipes


In [4]:
# Relative path to the plant_lists folder (resolved against the current working directory).
out_dir = Path("plant_lists")

In [5]:
# Locate the plant list through `out_dir` rather than one machine's absolute
# Windows path, so the notebook runs on any checkout of the project.
# Assumes the kernel's working directory is the project folder that contains
# plant_lists/ (the same assumption the relative master_plant_list.csv load makes).
plantlist = out_dir / "plantlst.txt"
plants = pd.read_csv(plantlist, sep=',')
# Show (rows, columns) as a quick check that the load worked.
plants.shape

(93157, 5)

In [6]:
plants.head()

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae
1,ABAB,ABAM5,Abutilon americanum (L.) Sweet,,
2,ABAB,ABJA,Abutilon jacquinii G. Don,,
3,ABAB,ABLI,Abutilon lignosum (Cav.) G. Don,,
4,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae


In [7]:
# Reduce "Genus species Author ..." to the binomial "Genus species".
# n=2 stops splitting after the second space, so only three columns are built
# instead of one column per word of the longest author string.
name_parts = plants["Scientific Name with Author"].str.split(' ', n=2, expand=True)
names = name_parts[0] + ' ' + name_parts[1]

# Build the (Symbol, Species) lookup as one non-mutating chain: add the binomial,
# drop the columns that are no longer needed, then collapse repeated pairs.
plants = (
    plants
    .assign(Species=names)
    .drop(columns=['Scientific Name with Author', 'Synonym Symbol', 'Family', 'Common Name'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
plants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82269 entries, 0 to 82268
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Symbol   82269 non-null  object
 1   Species  82269 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [9]:
# Drop every row that has a missing value and renumber the index from 0.
# `df` is rebound here, so this cell relies on the load cell having run first.
df = df.dropna().reset_index(drop=True)
# Row count after the drop.
df.shape

(1133, 2)

In [10]:
# Count how often each species name occurs in the master list (a count above 1 would be a duplicate).
df.Species.value_counts()

Species
Salix exigua                   1
Carlowrightia arizonica        1
Justicia californica           1
Mesembryanthemum nodiflorum    1
Trianthema portulacastrum      1
                              ..
Allium macropetalum            1
Tidestromia lanuginosa         1
Suaeda nigra                   1
Salsola tragus                 1
Monolepis nuttalliana          1
Name: count, Length: 1133, dtype: int64

In [11]:
# Most frequent binomials in `plants`. (Symbol, Species) pairs were de-duplicated above,
# so a count above 1 means that one Species is listed under several Symbol values.
plants.Species.value_counts().head()

Species
Astragalus lentiginosus    37
Eriogonum umbellatum       33
Ericameria nauseosa        25
Chrysothamnus nauseosus    25
Lepidium montanum          22
Name: count, dtype: int64

In [12]:
# Spot-check the lookup table for a single species.
plants[plants.Species == 'Carnegiea gigantea']

Unnamed: 0,Symbol,Species
12464,CAGI10,Carnegiea gigantea


In [13]:
# Left-join Symbol onto the master list by binomial name; the result is only displayed,
# not assigned. A Species listed under several Symbol values in `plants` yields one
# output row per Symbol, so the result can have more rows than `df`.
df.merge(plants, left_on='Species', right_on='Species', how='left')

Unnamed: 0,Family,Species,Symbol
0,ACANTHACEAE,Carlowrightia arizonica,CAAR7
1,ACANTHACEAE,Justicia californica,JUCA8
2,AIZOACEAE,Mesembryanthemum nodiflorum,MENO2
3,AIZOACEAE,Trianthema portulacastrum,TRPO2
4,AMARANTHACEAE,Amaranthus crassipes,AMCR
...,...,...,...
2153,PTERIDACEAE,Cheilanthes wootoni,
2154,SALICACEAE,Salix exigua,SAEX
2155,SALICACEAE,Salix exigua,SAIN3
2156,SALICACEAE,Salix exigua,SAME2


# Scraping

## SEINet

In [27]:
# Exploratory scrape of a single SEINet taxon page (Carnegiea gigantea).
url = "https://swbiodiversity.org/seinet/taxa/index.php?taxon=Carnegiea+gigantea"
# Browser-like headers sent with every request made through the session.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://swbiodiversity.org/seinet/index.php"}
session = requests.Session()
session.headers.update(headers)

response = session.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

print(soup.title.text)
tabs = soup.find("div", id="desctabs")

# Maps each tab's element id (the "#..." link target) to its visible label.
tab_labels = {}

nav = soup.find("ul", class_="ui-tabs-nav")

# A page without a tab strip has no nav list; iterate over nothing in that case
# rather than failing on None.
for li in (nav.find_all("li") if nav is not None else []):
    a = li.find("a")
    # A list item without a link cannot name a tab.
    if a is None:
        continue
    href = a.get("href", "")

    # Skip non-tab links (like "Resources")
    if href.startswith("#"):
        tab_id = href.lstrip("#")
        label = a.get_text(strip=True)
        tab_labels[tab_id] = label

# Maps tab label -> readable text of that tab.
tabs_data = {}

for tab_div in soup.find_all("div", class_="sptab"):
    tab_id = tab_div["id"]
    # Fall back to the raw element id when no nav link named this tab.
    label = tab_labels.get(tab_id, tab_id)

    # Extract readable text
    text = tab_div.get_text(" ", strip=True)

    tabs_data[label] = text

SEINet - AZ/NM Node - Carnegiea gigantea


In [28]:
tabs_data.keys()

dict_keys(['FNA', 'VPAP', 'SW Field Guide'])

In [29]:
tabs_data["SW Field Guide"][:400]

'Benson 1969, Kearney and Peebles 1969 Common Name : saguaro Duration : Perennial Protected Status : Highly safeguarded, Salvage restricted in Arizona. General : Upright, a large simple stem with 1 to several lateral branches reaching 16 m tall with branches 30-65 cm in diameter and 12-25 ribs that are obtuse and 1-3 cm high, which varies with water availability. Spines : Aeroles 2-4 cm apart on ol'

In [30]:
tabs_data["FNA"][:400]

'Arthur C. Gibson in Flora of North America (vol. 4) Stems 25+ cm diam., widest where proximal branches arise; pith 10+ cm diam. Flowers usually ter-minal, 6.5-8.5 cm diam.; scales on flower tubes broadly triangular to rounded, green with red apices; ovary with locule to 25 mm; filaments white, short; anthers tan. Seeds: testa thin. 2 n = 22. Flowering early May-late Jun. Sonoran desert scrub; 180-'

In [31]:
tabs_data["VPAP"][:400]

'JANAS 29(1) Plant : Massive columnar trees to 15+ m tall. STEM simple or the trunk bearing 1-6(-20+) upright-curving branches usually 2-2.5 m above ground, commonly in subwhorls, sometimes rebranched, to 75+ cm in diameter (widest at lowest branches), green, glabrous but woolly at apices; ribs 12-24, 3-4 cm high, continuous, increasing in number from the narrow base. AREOLES mostly circular, 6-8 m'

In [25]:
def get_seinet_tabs(genus, species, pause=2):
    """
    Fetch a SEINet taxon page and return the text of its description tabs.

    NOTE: a later cell in this notebook defines another function with the
    same name; whichever definition is executed last is the one in effect.

    Parameters
    ----------
    genus : str
    species : str
        Sent together as "<genus> <species>" in the ``taxon`` query parameter.
    pause : float
        Seconds to sleep after a successful scrape.

    Returns
    -------
    dict
        {tab_label: tab_text}. A tab that no navigation link names is keyed
        by its element id. Empty when the page has no tab panels.

    Raises
    ------
    requests.RequestException
        If the request fails or the server answers with an error status.
    KeyError
        If a tab panel has no ``id`` attribute.
    """
    url = "https://swbiodiversity.org/seinet/taxa/index.php"
    headers = {"User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"),
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": "https://swbiodiversity.org/seinet/index.php"}

    # The context manager releases the session's pooled connections even when
    # the request raises; `params` lets requests URL-encode the taxon name.
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(url, params={"taxon": f"{genus} {species}"}, timeout=30)
        response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Map each tab's element id to its visible label.
    tab_labels = {}
    nav = soup.find("ul", class_="ui-tabs-nav")
    if nav is not None:  # pages without a tab strip have no nav list
        for li in nav.find_all("li"):
            a = li.find("a")
            if a is None:  # list item without a link cannot name a tab
                continue
            href = a.get("href", "")
            # Skip non-tab links (like "Resources")
            if href.startswith("#"):
                tab_labels[href.lstrip("#")] = a.get_text(strip=True)

    tabs_data = {}
    for tab_div in soup.find_all("div", class_="sptab"):
        tab_id = tab_div["id"]
        # Fall back to the raw element id when no nav link named this tab.
        label = tab_labels.get(tab_id, tab_id)
        # Extract readable text
        tabs_data[label] = tab_div.get_text(" ", strip=True)

    time.sleep(pause)
    return tabs_data

In [32]:


def get_seinet_tabs(genus, species, pause=1.5):
    """
    Scrape SEINet tabbed description content for a plant species.

    Parameters
    ----------
    genus : str
    species : str
        Sent together as "<genus> <species>" in the ``taxon`` query parameter.
    pause : float
        Seconds to sleep after the request (politeness / rate limiting).
        The sleep happens on every call, including calls that return {}.

    Returns
    -------
    dict
        {tab_label: tab_text}
        Example keys: 'FNA', 'VPAP', 'SW Field Guide'
        A tab that no navigation link names is keyed by its element id.
        Returns {} if the request fails or the page has no tab container.
    """

    url = "https://swbiodiversity.org/seinet/taxa/index.php"

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Referer": "https://swbiodiversity.org/seinet/index.php"
    }

    # try/finally so the rate-limiting pause also applies to the early
    # returns below; a run of failed lookups must not hit the server unthrottled.
    try:
        try:
            # `params` lets requests URL-encode the taxon name.
            r = requests.get(
                url,
                params={"taxon": f"{genus} {species}"},
                headers=headers,
                timeout=30,
            )
            r.raise_for_status()
        except requests.RequestException:
            # Best effort: a failed request is reported as "no data".
            return {}

        soup = BeautifulSoup(r.text, "html.parser")

        desctabs = soup.find("div", id="desctabs")
        if desctabs is None:
            return {}

        # --- Map tab IDs to tab labels ---
        tab_labels = {}
        nav = desctabs.find("ul", class_="ui-tabs-nav")

        if nav is not None:
            for li in nav.find_all("li"):
                a = li.find("a")
                if a is None:  # list item without a link cannot name a tab
                    continue
                href = a.get("href", "")
                # Only "#..." links point at a tab panel on this page.
                if href.startswith("#"):
                    tab_labels[href.lstrip("#")] = a.get_text(strip=True)

        # --- Extract tab content ---
        tabs_data = {}

        for tab_div in desctabs.find_all("div", class_="sptab"):
            tab_id = tab_div.get("id")
            # Fall back to the raw element id when no nav link named this tab.
            label = tab_labels.get(tab_id, tab_id)
            tabs_data[label] = tab_div.get_text(" ", strip=True)

        return tabs_data
    finally:
        time.sleep(pause)

In [39]:
# Fetch the SEINet description tabs for Ambrosia dumosa, passing pause=2 seconds.
adum = get_seinet_tabs('Ambrosia', 'dumosa', 2)

In [42]:
# Display every scraped tab; the trailing comment is a leftover subscript for viewing one tab only.
adum#['FNA']

{'FNA': 'John L. Strother in Flora of North America (vol. 21) Shrubs, 10-40(-60+) cm. Stems erect. Leaves mostly alternate; petioles 2-8(-12+) mm; blades (white) ovate to elliptic, 10-25(-45) × 8-15(-30) mm, (1-)2-3-pinnately lobed, abaxial and adaxial faces densely strigillose. Pistillate heads intermixed with staminates (sometimes wanting or staminates sometimes wanting, plants unisexual); florets (1-)2. Staminate heads: peduncles 0-1(-2) mm; involucres shallowly cup-shaped, 3-5 mm diam., ± strigillose; florets 8-15+. Burs: bodies ± globose, 3-5+ mm, pilosulous and/or gland-dotted, spines 12-25+, scattered, ± subulate (± navicular at bases), 2-4 mm, tips straight. 2 n = 36, 72, 108, 126. Flowering Mar-May(-Dec). Rocky or sandy washes, benches; (-100-)100-1200(-1500) m; Ariz., Calif., Nev., Utah; Mexico (Baja California, Sonora).',
 'SW Field Guide': 'Wiggins 1964, FNA 2008, Benson and Darrow 1981, Turner et al. 1995, Kearney and Peebles 1969 Common Name : burrobush Duration : Perenni

In [37]:
# Full VPAP tab text from the notebook-level `tabs_data` built by the exploratory
# Carnegiea gigantea scrape cell (not from `adum`).
tabs_data['VPAP']

'JANAS 29(1) Plant : Massive columnar trees to 15+ m tall. STEM simple or the trunk bearing 1-6(-20+) upright-curving branches usually 2-2.5 m above ground, commonly in subwhorls, sometimes rebranched, to 75+ cm in diameter (widest at lowest branches), green, glabrous but woolly at apices; ribs 12-24, 3-4 cm high, continuous, increasing in number from the narrow base. AREOLES mostly circular, 6-8 mm in diameter, spaced about 2.5 cm apart on rib to nearly contiguous at stem apices, bearing short tan to gray wool Leaves : LEAVES of long shoots minute or obsolete; SPINES yellow to reddish brown, aging gray to gray-black, terete to angular, mostly bulbous-based, divergent; central-most spines stout, 3-5(-10) per areole, mostly l-3 cm long, but the basal one longest, 3.5-5(-8) cm long; peripheral spines finer, 12-15(-19) per areole, 1-2 cm long Flowers : nocturnal but remaining open into morning, solitary in areoles, arranged in masses usually on south sides and below apices of the branches