In [1]:
import pandas as pd
from pathlib import Path

# Loading data

In [2]:
# Directory containing the plant-list CSV files.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
folder = Path(r"C:\Users\matta\Desktop\Documents\Python\clonal_plants\plant_lists")

# Path.glob() yields nothing for a missing directory instead of raising,
# so check explicitly and fail with a clear message.
if not folder.is_dir():
    raise FileNotFoundError(f"Plant list folder not found: {folder}")

# Sorted so the listing order does not depend on the filesystem.
for csv_file in sorted(folder.glob("*.csv")):
    # Only the row count is needed here; the frame itself is not kept.
    n_rows = len(pd.read_csv(csv_file))
    print(f"{csv_file.name}: {n_rows} rows")

Cabeza Prieta National Wildlife Refuge_1767930774.csv: 585 rows
Castle Dome Mountains_1767930845.csv: 247 rows
McDowell Mountain Regional Park_1767930168.csv: 351 rows
national_wetland_plant_list.csv: 2966 rows
Organ Pipe Cactus National Monument_1767930222.csv: 805 rows
Sierra Estrella Mountains Regional Park_1767930372.csv: 388 rows
Tucson Mountains_1767930256.csv: 844 rows
Tumamoc Hill_1767930500.csv: 456 rows
White Tank Mountains Regional Park 1968_1767930094.csv: 422 rows


In [3]:
# File names left out of the combined frame.
EXCLUDED_FILES = {"national_wetland_plant_list.csv"}

# `folder` is defined in the previous cell. The files are sorted so that the
# row order of the combined frame (and therefore which duplicate a later
# keep-first de-duplication retains) does not depend on filesystem order.
site_files = sorted(
    f for f in folder.glob("*.csv")
    if f.name not in EXCLUDED_FILES
)

# pd.concat raises an opaque "No objects to concatenate" on an empty list.
if not site_files:
    raise FileNotFoundError(f"No plant list CSVs found in {folder}")

# Stack all remaining lists into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in site_files], ignore_index=True)
df.shape

(4098, 5)

In [4]:
# Show the column labels of the combined frame.
df.columns

Index(['Family', 'ScientificName', 'ScientificNameAuthorship', 'Notes',
       'TaxonId'],
      dtype='object')

# Cleaning

In [5]:
# Keep only the two columns used downstream. Selecting them (rather than
# dropping the others in place) means re-running this cell does not raise
# a KeyError for columns that are already gone.
df = (
    df[['Family', 'ScientificName']]
    # One row per distinct scientific name; the first occurrence is kept.
    .drop_duplicates('ScientificName')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Family,ScientificName
0,ACANTHACEAE,Carlowrightia arizonica
1,ACANTHACEAE,Justicia californica
2,AIZOACEAE,Mesembryanthemum nodiflorum
3,AIZOACEAE,Trianthema portulacastrum
4,AMARANTHACEAE,Amaranthus crassipes


In [6]:
# Number of rows per family, most frequent first.
df['Family'].value_counts()

Family
ASTERACEAE          205
POACEAE             191
FABACEAE            110
BORAGINACEAE         74
CACTACEAE            72
                   ... 
ASPHODELACEAE         1
MELIACEAE             1
PASSIFLORACEAE        1
SCROPHULARIACEAE      1
VIOLACEAE             1
Name: count, Length: 105, dtype: int64

In [7]:
# Print the index range, per-column non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1475 entries, 0 to 1474
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Family          1475 non-null   object
 1   ScientificName  1475 non-null   object
dtypes: object(2)
memory usage: 23.2+ KB


In [8]:
# Break every scientific name on single spaces, one token per column
# (columns are labelled 0, 1, 2, ... by position).
sciname = df['ScientificName'].str.split(pat=' ', expand=True)

In [9]:
# For each token position, print how often every token occurs there,
# followed by a blank line as a separator.
for position, tokens in sciname.items():
    print(tokens.value_counts(dropna=True), end='\n\n')

0
Cylindropuntia    24
Eriogonum         21
Euphorbia         20
Lycium            19
Cryptantha        18
                  ..
Passiflora         1
Cortaderia         1
Disakisperma       1
Lamarckia          1
Johnstonella       1
Name: count, Length: 529, dtype: int64

1
californica        32
arizonica          16
wrightii           16
coulteri           13
emoryi              9
                   ..
kearneyi            1
nasturtiifolium     1
echinata            1
pulchellum          1
subulatum           1
Name: count, Length: 807, dtype: int64

2
var.           219
subsp.         112
x                3
X                2
tetracantha      1
ajoensis         1
tucsonensis      1
kelvinensis      1
vivipara         1
Name: count, dtype: int64

3
californica    10
arizonica       4
parishii        4
schottii        4
villosa         3
               ..
asper           1
belenidium      1
pectinacea      1
gooddingii      1
parviflorum     1
Name: count, Length: 274, dtype: int64

4
s

In [10]:
# Tokens whose presence removes a row (presumably hybrid markers, as in
# "Genus x species"; TODO confirm against the source lists).
HYBRID_MARKERS = ['x', 'X']

# Attach the family to each split name; rows align on the index shared with df.
sciname['Family'] = df['Family']

# Flag every row in which any cell is exactly one of the marker tokens.
has_marker = sciname.astype(str).isin(HYBRID_MARKERS).any(axis=1)

# .copy() makes the filtered result an independent frame, so adding columns
# to it later does not trigger SettingWithCopyWarning.
sciname = sciname[~has_marker].copy()
sciname.shape

(1466, 6)

In [11]:
# Build the final list without mutating `sciname`: nothing is dropped in place,
# so token columns 0 and 1 are still available if this cell is re-run.
plant_list = (
    sciname
    # Species = first token + ' ' + second token of the split name.
    .assign(Species=sciname[0] + ' ' + sciname[1])
    # Select the two output columns by name instead of dropping a hard-coded
    # list of token columns, so the number of token columns does not matter.
    .loc[:, ['Family', 'Species']]
    # Names that differed only beyond the second token are now identical rows.
    .drop_duplicates()
    .reset_index(drop=True)
)
plant_list.shape

(1135, 2)

In [12]:
# Preview the first rows of the de-duplicated Family/Species list.
plant_list.head()

Unnamed: 0,Family,Species
0,ACANTHACEAE,Carlowrightia arizonica
1,ACANTHACEAE,Justicia californica
2,AIZOACEAE,Mesembryanthemum nodiflorum
3,AIZOACEAE,Trianthema portulacastrum
4,AMARANTHACEAE,Amaranthus crassipes


In [13]:
# (rows, columns) of the entries whose family is CACTACEAE.
plant_list.loc[plant_list['Family'] == 'CACTACEAE'].shape

(40, 2)

In [14]:
# (rows, columns) of the entries whose family is FAGACEAE.
plant_list.loc[plant_list['Family'] == 'FAGACEAE'].shape

(2, 2)

# Saving

In [15]:
# index=False: the 0..n-1 index carries no information and would otherwise be
# written as an unnamed first column (read back by pandas as "Unnamed: 0").
plant_list.to_csv('master_plant_list.csv', index=False)