In [67]:
import pandas as pd
import numpy as np
import re

In [68]:
# Read seinet_results.csv with its first column as the index, then discard
# the 'Unnamed: 0' and 'seinet_status' columns.
df = pd.read_csv('seinet_results.csv', index_col=0).drop(
    columns=['Unnamed: 0', 'seinet_status']
)
print(df.shape)
print(df.columns)

(1057, 5)
Index(['Genus', 'Species', 'FNA', 'VPAP', 'SW Field Guide'], dtype='object')


In [69]:
# Full binomial ("Genus species"); used below as the join key against plant_list.
df['Species name'] = df['Genus'] + ' ' + df['Species']

In [70]:
# Lookup table that is left-joined onto df in the next cell; the first CSV
# column becomes the index.
plant_list = pd.read_csv('master_plant_list.csv', index_col=0)

In [71]:
# Left-join plant_list on the full species name. Both frames carry a 'Species'
# column, so the merge suffixes them: plant_list's copy ('Species_y') repeats the
# join key and is dropped, and df's epithet column ('Species_x') gets its name back.
df = (
    df.merge(plant_list, left_on='Species name', right_on='Species', how='left')
      .drop(columns='Species_y')
      .rename(columns={'Species_x': 'Species'})
)
df.head()

Unnamed: 0,Genus,Species,FNA,VPAP,SW Field Guide,Species name,Family
0,Carlowrightia,arizonica,,,"Wiggins 1964, Daniel 1984, Kearney and Peebles...",Carlowrightia arizonica,ACANTHACEAE
1,Justicia,californica,,,"Benson and Darrow 1981, Hickman 1993, Powell 1...",Justicia californica,ACANTHACEAE
2,Mesembryanthemum,nodiflorum,"Nancy J. Vivrette, John E. Bleck & Wayne R. Fe...",,FNA 2004 Duration : Annual Nativity : Non-Nati...,Mesembryanthemum nodiflorum,AIZOACEAE
3,Trianthema,portulacastrum,Wayne R. Ferren Jr. in Flora of North America ...,,"Kearney and Peebles 1969, FNA 2004, Correll an...",Trianthema portulacastrum,AIZOACEAE
4,Amaranthus,crassipes,Sergei L. Mosyakin & Kenneth R. Robertson in F...,,,Amaranthus crassipes,AMARANTHACEAE


# Lifecycle duration

In [72]:
def extract_life_duration(text):
    """
    Classify a plant's life duration from a free-text description.

    The text is lower-cased and searched for the whole words "annual",
    "biennial" and "perennial" (plurals such as "annuals" do not match).
    The combination of words found is then collapsed to one label.

    Parameters
    ----------
    text : str or None
        Descriptive text. None, NaN and the empty string yield 'unknown'.

    Returns
    -------
    str
        'unknown'
            no usable text, or none of the three words present
        'annual'
            only "annual", only "biennial", or "annual" with "biennial"
        'perennial'
            only "perennial", or "biennial" with "perennial"
        'annual/perennial'
            "annual" with "perennial"
        'annual-biennial-perennial'
            all three words present
    """
    if not (text and isinstance(text, str)):
        return "unknown"

    lowered = text.lower()

    # word boundaries keep e.g. "annuals" or "perennially" from matching
    keyword_patterns = (
        ("annual", r"\bannual\b"),
        ("biennial", r"\bbiennial\b"),
        ("perennial", r"\bperennial\b"),
    )
    hits = frozenset(
        word for word, regex in keyword_patterns if re.search(regex, lowered)
    )

    # biennial is folded into whichever other class it appears with,
    # and into 'annual' when it appears alone
    collapsed = {
        frozenset(): "unknown",
        frozenset({"annual"}): "annual",
        frozenset({"biennial"}): "annual",
        frozenset({"perennial"}): "perennial",
        frozenset({"annual", "biennial"}): "annual",
        frozenset({"annual", "perennial"}): "annual/perennial",
        frozenset({"biennial", "perennial"}): "perennial",
    }
    label = collapsed.get(hits)
    if label is None:
        # only reachable when all three words were found
        label = "-".join(sorted(hits))
    return label

In [73]:
# One parsed-duration column per description source.
for source_col, duration_col in (
    ("FNA", 'duration_FNA'),
    ("VPAP", 'duration_VPAP'),
    ("SW Field Guide", 'duration_SWFG'),
):
    df[duration_col] = df[source_col].apply(extract_life_duration)

In [74]:
def consensus_life_duration(row):
    """
    Combine the three per-source duration labels of one row.

    Reads row["duration_FNA"], row["duration_VPAP"] and row["duration_SWFG"],
    ignoring any that equal "unknown".

    Returns
    -------
    str
        "unknown" when no source has a label, the shared label when all
        remaining sources agree, otherwise the distinct labels sorted and
        joined with "/" so the disagreement stays visible.
    """
    per_source = (
        row["duration_FNA"],
        row["duration_VPAP"],
        row["duration_SWFG"],
    )
    known = [label for label in per_source if label != "unknown"]
    distinct = set(known)

    if not distinct:
        return "unknown"
    if len(distinct) == 1:
        return known[0]
    return "/".join(sorted(distinct))

In [75]:
# Row-wise reduction of the three duration_* columns to a single label.
df["duration_consensus"] = df.apply(consensus_life_duration, axis="columns")

In [76]:
# Resolve composite consensus labels (exact string matches only) to a single class.
for composite, resolved in (
    ("annual/annual-biennial-perennial", "annual"),
    ("annual-biennial-perennial/perennial", "perennial"),
    ("annual-biennial-perennial", "annual"),
):
    df.loc[df["duration_consensus"] == composite, "duration_consensus"] = resolved

# Manual override for a single taxon.
is_baileya_multiradiata = (df['Genus'] == 'Baileya') & (df['Species'] == 'multiradiata')
df.loc[is_baileya_multiradiata, 'duration_consensus'] = "perennial"

for composite, resolved in (
    ("annual/perennial/perennial", "perennial"),
    ("annual/annual/perennial", "annual"),
):
    df.loc[df["duration_consensus"] == composite, "duration_consensus"] = resolved

In [77]:
# Hand-assigned durations, keyed by (genus, specific epithet) and applied in order.
manual_durations = {
    ('Atriplex', 'elegans'): "annual",
    ('Ambrosia', 'confertiflora'): "perennial",
    ('Sonchus', 'oleraceus'): "annual",
    ('Xanthisma', 'spinulosum'): "perennial",
    ('Cryptantha', 'holoptera'): "perennial",
    ('Cuscuta', 'umbellata'): "annual",
    ('Chamaesyce', 'pediculifera'): "perennial",
    ('Ditaxis', 'neomexicana'): "perennial",
}
for (genus, epithet), label in manual_durations.items():
    df.loc[(df['Genus'] == genus) & (df['Species'] == epithet), 'duration_consensus'] = label

In [78]:
# Hand-entered duration labels. The next cell pairs them BY POSITION with the rows
# whose duration_consensus is 'annual/perennial', so the order here has to match the
# frame's row order at that point. An 'unknown' entry leaves the taxon unresolved.
lifespan = ['unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
            'annual', 'perennial', 'unknown', 'annual', 'perennial', 'annual', 'unknown', 'unknown', 'perennial', 'unknown', 
            'unknown', 'unknown', 'annual', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'perennial', 
            'unknown', 'unknown', 'unknown', 'unknown', 'annual', 'unknown', 'annual', 'annual', 'unknown', 'unknown', 
            'unknown', 'annual', 'annual', 'unknown', 'unknown', 'unknown', 'unknown', 'perennial', 'unknown', 
            'unknown', 'unknown', 'unknown', 'annual', 'annual', 'unknown']

In [79]:
# Snapshot the (Genus, Species) pairs still labelled 'annual/perennial', pair them by
# position with `lifespan`, and write each label to every row of that taxon.
# zip() stops at the shorter of the two sequences without raising.
pending = df.loc[df["duration_consensus"] == 'annual/perennial', ['Genus', 'Species']].values
for (genus, epithet), label in zip(pending, lifespan):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'duration_consensus'] = label

In [80]:
# Hand-entered duration labels. The next cell pairs them BY POSITION with the rows whose
# duration_consensus is 'unknown' and whose duration_SWFG is 'annual/perennial'.
lifespan2 = ['perennial', 'annual', 'perennial', 'annual', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 'annual', 
             'perennial', 'perennial', 'annual', 'perennial', 'annual', 'perennial', 'annual', 'perennial', 'perennial', 'annual', 
             'perennial', 'annual', 'perennial', 'annual', 'perennial', 'annual', 'perennial', 'annual', 'perennial', 'perennial']

In [81]:
# Rows with no consensus whose SW Field Guide text mentioned both annual and perennial:
# pair them by position with `lifespan2` and write the label to every row of that taxon.
needs_label = (df['duration_consensus'] == 'unknown') & (df['duration_SWFG'] == 'annual/perennial')
pending = df.loc[needs_label, ['Genus', 'Species']].values
for (genus, epithet), label in zip(pending, lifespan2):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'duration_consensus'] = label

In [82]:
# Hand-entered duration labels. The next cell pairs them BY POSITION with the rows whose
# duration_consensus is still 'unknown'; an 'unknown' entry leaves the taxon for a later pass.
lifespan3 = ['perennial', 'unknown',  'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'perennial', 'perennial', 'unknown', 'unknown', 'unknown', 'perennial', 'annual', 'perennial', 'unknown', 'unknown', 
             'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'perennial', 'perennial', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'perennial', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'unknown', 'unknown', 'unknown', 'unknown', 'perennial',  'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'unknown', 'annual', 'unknown', 'annual', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'perennial', 'unknown', 'perennial', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown']

In [83]:
# Pair the rows that are still 'unknown' (in current frame order) with `lifespan3` by
# position and write each label to every row of that taxon.
# NOTE(review): the pairing presumably relies on the earlier cells having run exactly
# once, in order - confirm before re-running this cell on its own.
pending = df.loc[df['duration_consensus'] == 'unknown', ['Genus', 'Species']].values
for (genus, epithet), label in zip(pending, lifespan3):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'duration_consensus'] = label

In [84]:
# Hand-entered duration labels. The next cell pairs them BY POSITION with the rows whose
# duration_consensus is still 'unknown' after the lifespan3 pass.
lifespan4 = ['unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'annual', 'unknown', 'unknown', 
             'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 
             'perennial', 'perennial', 'perennial', 'perennial', 'unknown', 'unknown', 'unknown', 'perennial', 'perennial', 'annual', 
             'unknown', 'unknown', 'unknown', 'unknown', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 
             'unknown', 'annual', 'unknown', 'unknown', 'unknown', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 
             'perennial', 'annual', 'unknown', 'annual', 'annual', 'annual', 'unknown', 'annual', 'perennial', 'perennial', 
             'unknown', 'unknown', 'unknown', 'annual', 'annual', 'annual', 'annual', 'perennial', 'unknown', 'unknown', 
             'unknown', 'unknown', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 'perennial', 'annual', 
             'unknown', 'perennial']

In [85]:
# Second positional pass over the rows that are still 'unknown', this time using `lifespan4`.
pending = df.loc[df['duration_consensus'] == 'unknown', ['Genus', 'Species']].values
for (genus, epithet), label in zip(pending, lifespan4):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'duration_consensus'] = label

In [86]:
# Hand-entered duration labels. The next cell pairs them BY POSITION with the rows whose
# duration_consensus is still 'unknown' after the lifespan4 pass.
lifespan5 = ['perennial', 'annual', 'perennial', 'annual', 'annual', 'annual', 'perennial', 'annual', 'perennial', 'annual', 
             'unknown', 'perennial', 'annual', 'annual', 'annual', 'annual', 'perennial', 'annual', 'annual', 'perennial', 
             'perennial', 'annual', 'unknown', 'annual', 'annual', 'annual', 'perennial', 'annual', 'perennial', 'annual', 
             'perennial', 'perennial', 'perennial', 'annual', 'unknown', 'perennial', 'perennial', 'perennial', 'annual', 'annual']

In [87]:
# Third positional pass over the rows that are still 'unknown', this time using `lifespan5`.
pending = df.loc[df['duration_consensus'] == 'unknown', ['Genus', 'Species']].values
for (genus, epithet), label in zip(pending, lifespan5):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'duration_consensus'] = label

In [88]:
# Keep only taxa with a resolved duration and renumber the index from 0.
df = df.loc[df['duration_consensus'].ne('unknown')].reset_index(drop=True)

In [89]:
df['duration_consensus'].value_counts()

duration_consensus
perennial    579
annual       475
Name: count, dtype: int64

In [90]:
# The per-source duration columns are no longer needed once the consensus is final.
df = df.drop(columns=['duration_FNA', 'duration_VPAP', 'duration_SWFG'])

# Nativity

In [91]:
# Words and phrases taken as evidence that a taxon is not native.
NON_NATIVE_TERMS = [
    "non-native",
    "introduced",
    "invasive",
    "exotic",
    "naturalized",
    "non native",
    "not native"
]

def parse_nativity(sw_text):
    """
    Classify nativity from one description text.

    An explicit "Nativity : <value>" field is honoured first. If it is absent,
    or its value is neither "native" nor one of NON_NATIVE_TERMS, the whole
    text is scanned for any NON_NATIVE_TERMS entry as a whole word or phrase.
    Matching is case-insensitive because the text is lower-cased first.

    Parameters
    ----------
    sw_text : str or None
        Description text; missing values (None / NaN) yield 'unknown'.

    Returns
    -------
    str
        'native', 'non-native', or 'unknown'
    """
    if pd.isna(sw_text):
        return "unknown"

    lowered = sw_text.lower()

    # explicit field: only the first run of letters/hyphens after the colon is read
    field = re.search(r"nativity\s*:\s*([a-z\-]+)", lowered)
    declared = field.group(1) if field else None
    if declared == "native":
        return "native"
    if declared in NON_NATIVE_TERMS:
        return "non-native"

    # fallback: any non-native indicator anywhere in the text
    mentions_non_native = any(
        re.search(rf"\b{term}\b", lowered) for term in NON_NATIVE_TERMS
    )
    return "non-native" if mentions_non_native else "unknown"

In [92]:
NATIVITY_COLUMNS = ["FNA", "VPAP", "SW Field Guide"]

def nativity_consensus(row):
    """
    Combine the nativity verdicts of every column in NATIVITY_COLUMNS for one row.

    A single 'non-native' verdict wins over any 'native' one; 'unknown' is
    returned only when no source gives a verdict.
    """
    verdicts = {parse_nativity(row[col]) for col in NATIVITY_COLUMNS}

    for outcome in ("non-native", "native"):
        if outcome in verdicts:
            return outcome
    return "unknown"

In [93]:
# Row-wise nativity verdict across the three description columns.
df["nativity_sw"] = df.apply(nativity_consensus, axis="columns")

In [94]:
# Cactus-family rows with no verdict are set to native.
df.loc[(df['Family'] == 'CACTACEAE') & (df['nativity_sw'] == 'unknown'), 'nativity_sw'] = 'native'

In [95]:
# Remove the Festuca octoflora row(s) from the table.
is_festuca_octoflora = (df['Genus'] == 'Festuca') & (df['Species'] == 'octoflora')
df.drop(index=df.index[is_festuca_octoflora], inplace=True)

In [96]:
# Asparagaceae rows with no verdict are set to native.
df.loc[(df['Family'] == 'ASPARAGACEAE') & (df['nativity_sw'] == 'unknown'), 'nativity_sw'] = 'native'

In [97]:
# Pteridaceae rows with no verdict are set to native.
df.loc[(df['Family'] == 'PTERIDACEAE') & (df['nativity_sw'] == 'unknown'), 'nativity_sw'] = 'native'

In [98]:
# Remove every row whose specific epithet is 'wootoni'. The match is on the
# epithet alone, so it applies regardless of genus.
has_wootoni_epithet = df['Species'] == 'wootoni'
df.drop(index=df.index[has_wootoni_epithet], inplace=True)

In [99]:
# Remove the Cheilanthes yavapensis row(s) from the table.
is_cheilanthes_yavapensis = df['Species name'] == 'Cheilanthes yavapensis'
df.drop(index=df.index[is_cheilanthes_yavapensis], inplace=True)

In [100]:
# Hand-assigned nativity, keyed by full species name and applied in order.
manual_nativity = {
    'Hordeum arizonicum': 'native',
    'Hordeum murinum': 'non-native',
    'Rhus kearneyi': 'native',
    'Diaperia verna': 'native',
    'Isocoma coronopifolia': 'native',
    'Stephanomeria schottii': 'native',
    'Cryptantha ganderi': 'native',
}
for species_name, status in manual_nativity.items():
    df.loc[df['Species name'] == species_name, 'nativity_sw'] = status

In [101]:
# Hand-entered nativity labels. The next cell pairs them BY POSITION with the rows whose
# nativity_sw is 'unknown', so the order here has to match the frame's row order at that
# point. An 'unknown' entry leaves the taxon for a later pass.
nativity = ['unknown', 'unknown', 'native', 'unknown', 'native', 'native', 'unknown', 'unknown', 'unknown', 'unknown', 
            'unknown', 'native', 'native', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'native', 
            'native', 'native', 'unknown', 'unknown', 'unknown', 'native', 'native', 'native', 'native', 'native', 
            'native', 'unknown', 'unknown', 'native', 'non-native', 'unknown', 'unknown', 'native', 'unknown', 'native', 
            'native', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'unknown', 'unknown', 'native', 'unknown', 
            'native', 'native', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'native', 'unknown', 'unknown', 
            'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'native', 'unknown']

In [102]:
# Pair the rows whose nativity is still 'unknown' (in current frame order) with
# `nativity` by position and write each label to every row of that taxon.
# zip() stops at the shorter of the two sequences without raising.
pending = df.loc[df['nativity_sw'] == 'unknown', ['Genus', 'Species']].values
for (genus, epithet), status in zip(pending, nativity):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'nativity_sw'] = status

In [103]:
# Hand-entered nativity labels. The next cell pairs them BY POSITION with the rows whose
# nativity_sw is still 'unknown' after the first pass.
nativity2 = ['unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'native', 'native', 'unknown', 'unknown', 
             'unknown', 'native', 'native', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 'native', 
             'native', 'unknown', 'unknown', 'unknown', 'native', 'unknown', 'unknown', 'unknown', 'unknown', 'native', 
             'unknown', 'non-native', 'unknown', 'native', 'native', 'unknown', 'unknown', 'unknown', 'native', 'unknown', 
             'unknown']

In [104]:
# Second positional pass over the rows that are still 'unknown', this time using `nativity2`.
pending = df.loc[df['nativity_sw'] == 'unknown', ['Genus', 'Species']].values
for (genus, epithet), status in zip(pending, nativity2):
    same_taxon = (df['Genus'] == genus) & (df['Species'] == epithet)
    df.loc[same_taxon, 'nativity_sw'] = status

In [105]:
# Whatever is still unresolved after the manual passes is set to native.
df.loc[df['nativity_sw'] == 'unknown', 'nativity_sw'] = 'native'

In [106]:
# Manual correction applied after the blanket 'native' default.
df.loc[df['Species name'].eq('Ricinus communis'), 'nativity_sw'] = 'non-native'

In [107]:
# Manual correction applied after the blanket 'native' default.
df.loc[df['Species name'].eq('Urochloa reptans'), 'nativity_sw'] = 'non-native'

In [108]:
# Manual correction applied after the blanket 'native' default.
df.loc[df['Species name'].eq('Melia azedarach'), 'nativity_sw'] = 'non-native'

In [109]:
df.loc[:, "nativity_sw"].value_counts()

nativity_sw
native        887
non-native    164
Name: count, dtype: int64

# Aquatic

In [153]:
# Regex alternatives that mark a description as aquatic. Each is wrapped in \b word
# boundaries; they are OR-ed together into AQUATIC_REGEX in the next cell.
# The stream / river / wetland entries are commented out and therefore inactive.
AQUATIC_PATTERNS = [
    # strong indicators
    r"\baquatic\b",
    r"\bsubmerged\b",
    r"\bfloating\b",
    r"\bemergent\b",
    r"\bwater plant\b",
    r"\bhydrophyte\b",

    # habitat indicators
    r"\bpond(s)?\b",
    r"\blake(s)?\b",
    #r"\bstream(s)?\b",
    #r"\briver(s)?\b",
    r"\bmarsh(es)?\b",
    r"\bswamp(s)?\b",
    #r"\bwetland(s)?\b",
    r"\bshallow water\b",
    r"\bstanding water\b"
]

In [154]:
# Single case-insensitive alternation of every entry in AQUATIC_PATTERNS.
aquatic_alternation = "|".join(AQUATIC_PATTERNS)
AQUATIC_REGEX = re.compile(aquatic_alternation, re.IGNORECASE)

In [155]:
def is_aquatic_text(text):
    """
    Report whether a description contains any aquatic indicator.

    Returns False for anything that is not a string and for empty or
    whitespace-only strings; otherwise True when AQUATIC_REGEX matches
    anywhere in the text.
    """
    if isinstance(text, str) and text.strip():
        return AQUATIC_REGEX.search(text) is not None
    return False

In [156]:
def aquatic_parser(row, source_columns):
    """
    Evaluate aquatic status for one row across several text columns.

    Parameters
    ----------
    row : mapping-like
        Anything with a ``.get(column)`` method, e.g. a DataFrame row.
    source_columns : iterable of str
        Names of the text columns to inspect.

    Returns
    -------
    dict
        One ``aquatic_<column>`` boolean per source column, followed by
        ``aquatic_consensus``, which is True when at least one source is True.
    """
    flags = {
        f"aquatic_{col}": is_aquatic_text(row.get(col))
        for col in source_columns
    }
    # consensus is computed before its own key is added, so it only sees the sources
    flags["aquatic_consensus"] = any(flags.values())
    return flags

In [157]:
SOURCE_COLUMNS = ["FNA", "VPAP", "SW Field Guide"]

# One row of booleans per taxon: aquatic_<source> for each column plus aquatic_consensus.
aquatic_results = df.apply(
    lambda row: aquatic_parser(row, SOURCE_COLUMNS),
    axis=1,
    result_type="expand"
)

# Re-running this cell used to append a second copy of every aquatic_* column.
# Drop any result columns left over from a previous run first; on a fresh frame
# nothing matches and errors="ignore" makes the drop a no-op.
df = pd.concat(
    [df.drop(columns=aquatic_results.columns, errors="ignore"), aquatic_results],
    axis=1
)

In [158]:
df['aquatic_consensus'].value_counts()

aquatic_consensus
False    999
True      52
Name: count, dtype: int64

In [159]:
# Names of every taxon flagged aquatic by at least one source, for manual review.
df.loc[df['aquatic_consensus'] == True, 'Species name'].to_list()

['Trianthema portulacastrum',
 'Erigeron lobatus',
 'Logfia californica',
 'Logfia filaginoides',
 'Echinocactus polycephalus',
 'Acmispon maritimus',
 'Marsilea vestita',
 'Glinus lotoides',
 'Leptochloa panicea',
 'Zannichellia palustris',
 'Physalis lobata',
 'Typha domingensis',
 'Verbena bracteata',
 'Gnaphalium palustre',
 'Quincula lobata',
 'Filago californica',
 'Pluchea sericea',
 'Xanthium strumarium',
 'Hordeum pusillum',
 'Stuckenia pectinata',
 'Populus fremontii',
 'Bassia hyssopifolia',
 'Rhus aromatica',
 'Tessaria sericea',
 'Nasturtium officinale',
 'Cyperus laevigatus',
 'Eleocharis geniculata',
 'Eleocharis rostellata',
 'Schoenoplectus americanus',
 'Najas marina',
 'Sisyrinchium demissum',
 'Juncus balticus',
 'Juncus cooperi',
 'Phragmites australis',
 'Myosurus minimus',
 'Ruppia maritima',
 'Anemopsis californica',
 'Chenopodium ambrosioides',
 'Dysphania ambrosioides',
 'Lemna gibba',
 'Cyperus strigosus',
 'Ludwigia peploides',
 'Veronica anagallis-aquatica'

In [160]:
# Spot-check the SW Field Guide text behind one flagged taxon.
df.loc[df['Species name'] == 'Populus fremontii', 'SW Field Guide'].values

array(['Eckenwalder 1992, Heil et al. 2013, Allred and Ivey 2012 Common Name : Fremont cottonwood Duration : Perennial Nativity : Native Lifeform : Tree General : Trees up to 30 m tall with open crown; bark whitish, smooth, deeply furrowed at maturity; year-old twigs mostly pubescent. Leaves : Alternate, clustered near tips of branchlets, on flattened petioles that are nearly as long as the leaf blades; blades deltoid, 4-7 mm long and about as wide or wider, slightly cordate or cuneate at base and sharply pointed at the tip, with margins coarsely and irregularly toothed, and surfaces bright green and glabrous. Flowers : Catkins 4-13 cm long, with male and female catkins on the same tree; each flower subtended by a deeply cup-shaped disc, 3-9 mm wide. Fruits : Capsules globose or ellipsoid, 6-10 mm long; splitting open into 4 segments (valves) to release many tiny seeds attached to abundant white cottony hairs. Ecology : Found along streams banks and near lakes and ponds, below 6,500 ft

In [152]:
# NOTE(review): this cell's execution count (152) is lower than the cells above it
# (153-160), so it was run out of order. It removes only 'aquatic_consensus'; the
# per-source aquatic_* columns added by the same concat stay in df. Run top to
# bottom, it discards the consensus column computed above - confirm that is intended.
df.drop(columns = 'aquatic_consensus', inplace = True)

In [120]:
# NOTE(review): the output recorded for this cell still lists the river and wetland
# alternatives that are commented out of AQUATIC_PATTERNS, so it presumably predates
# the current pattern list - re-run to refresh.
AQUATIC_REGEX

re.compile(r'\baquatic\b|\bsubmerged\b|\bfloating\b|\bemergent\b|\bwater plant\b|\bhydrophyte\b|\bpond(s)?\b|\blake(s)?\b|\briver(s)?\b|\bmarsh(es)?\b|\bswamp(s)?\b|\bwetland(s)?\b|\bshallow water\b|\bstanding water\b',
           re.IGNORECASE|re.UNICODE)

# Elevation

# Lifeform