Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Plants Data

In [2]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

## About the Data

* Download the data files from: https://archive.ics.uci.edu/ml/datasets/Plants
  * There are 34781 records
  * There are also 34781 unique species; each row is a distinct species
    * Therefore, when using this dataset, it is better to start with clustering
* The raw data file is `plants.data`, a text file
  * Each row contains a species name followed by a comma-separated list of the abbreviations of the states (locations)
  * The full name of each state can be found in file `stateabbr.txt`
* After converting to one-hot format as shown below, there are 70 attributes (states)

In [19]:
# Parse `plants.data` into one record per species.
# Each line has the form "<species>,<state>,<state>,...": the first
# comma-separated field is the species name and the rest are state abbreviations.
raw_df_dct = {}  # row number -> {'prefix', 'species', 'states'}; converted to a DataFrame below

with open('plants.data') as plants_raw:  # the `with` block closes the file on exit
    for line in plants_raw:  # stream the file instead of loading every line with readlines()
        line = line.strip()
        if not line:
            # A blank (or trailing empty) line has no species name and would
            # raise IndexError on `species.split()[0]`, so skip it.
            continue
        species, *states = line.split(',')
        # 'prefix' is the first whitespace-separated word of the species name.
        # Keys are consecutive row numbers, so skipped lines leave no gaps.
        raw_df_dct[len(raw_df_dct)] = {
            'prefix': species.split()[0],
            'species': species,
            'states': states,
        }

# orient='index' turns each dictionary key into a row label, one row per species.
raw_df = pd.DataFrame.from_dict(raw_df_dct, orient='index')
print(raw_df.shape)
raw_df.head()

(34781, 3)


Unnamed: 0,prefix,species,states
0,abelia,abelia,"[fl, nc]"
1,abelia,abelia x grandiflora,"[fl, nc]"
2,abelmoschus,abelmoschus,"[ct, dc, fl, hi, il, ky, la, md, mi, ms, nc, s..."
3,abelmoschus,abelmoschus esculentus,"[ct, dc, fl, il, ky, la, md, mi, ms, nc, sc, v..."
4,abelmoschus,abelmoschus moschatus,"[hi, pr]"


In [20]:
# Convert the per-species list of states into one-hot columns (one column per state).
mlb = MultiLabelBinarizer(sparse_output=True)  # keep sparse format for the one-hot output

# Read the 'states' column without removing it from `raw_df`, so this cell can be
# re-run safely and `raw_df` still matches what the earlier cell displayed.
state_matrix = mlb.fit_transform(raw_df['states'])
state_df = pd.DataFrame.sparse.from_spmatrix(
    state_matrix,
    index=raw_df.index,      # align the one-hot rows with the species rows
    columns=mlb.classes_)    # column labels are the state abbreviations found while fitting

# Replace the list-valued 'states' column with the one-hot columns, then drop
# any fully duplicated rows.
df = raw_df.drop(columns='states').join(state_df).drop_duplicates()

print(df.shape)
df.head()

(34781, 72)


Unnamed: 0,prefix,species,ab,ak,al,ar,az,bc,ca,co,...,tx,ut,va,vi,vt,wa,wi,wv,wy,yt
0,abelia,abelia,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,abelia,abelia x grandiflora,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,abelmoschus,abelmoschus,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,abelmoschus,abelmoschus esculentus,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,abelmoschus,abelmoschus moschatus,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# count how many different species names appear in the one-hot table
species_names = df['species']
species_names.nunique()

34781

In [22]:
# Save the one-hot table as CSV, without the row index.
# pandas does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails here.
out_path = Path('structured_data/plants.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

In [23]:
# Optional round-trip check, left disabled: reload the CSV written by the
# previous cell and inspect it. Uncommenting these lines rebinds `df` to the
# copy read from disk instead of the frame built in memory.
# df = pd.read_csv('structured_data/plants.csv')
# print(df.shape)
# df.head()

(34781, 72)


Unnamed: 0,prefix,species,ab,ak,al,ar,az,bc,ca,co,...,tx,ut,va,vi,vt,wa,wi,wv,wy,yt
0,abelia,abelia,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,abelia,abelia x grandiflora,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,abelmoschus,abelmoschus,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,abelmoschus,abelmoschus esculentus,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,abelmoschus,abelmoschus moschatus,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
