Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Leaf Data

In [1]:
from pathlib import Path

import pandas as pd

## About the Data

* Download the data files from: https://archive.ics.uci.edu/ml/datasets/leaf
  * Check its `ReadMe.pdf` to get attributes' names and detailed definitions
  * `leaf.csv` contains 340 records
    * 14 attributes (the file has 16 columns: the `species` label, the `specimen_number`, and the 14 attributes)
    * 30 distinct species
  * Save the downloaded folder <b>leaf/</b> into the folder <b>raw_data/</b>
    
### Output All 30-class Data

In [2]:
# The raw file is read with header=None, so the columns get integer labels.
leaf_df = pd.read_csv(
    'raw_data/leaf/leaf.csv',
    header=None,
)

# Sanity check: print the dimensions, then preview the first rows.
print(leaf_df.shape)
leaf_df.head(n=5)

(340, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,1,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.004657,0.003947,0.04779,0.12795,0.016108,0.005232,0.000275,1.1756
1,1,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.005242,0.005002,0.02416,0.090476,0.008119,0.002708,7.5e-05,0.69659
2,1,3,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.007457,0.010121,0.011897,0.057445,0.003289,0.000921,3.8e-05,0.44348
3,1,4,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.006877,0.008607,0.01595,0.065491,0.004271,0.001154,6.6e-05,0.58785
4,1,5,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.007938,0.045339,0.002051,0.00056,2.4e-05,0.34214


In [3]:
# Descriptive labels for the 16 positional columns, in file order.
# NOTE(review): names are expected to follow the dataset's ReadMe.pdf — confirm against it.
cols = [
    'species',
    'specimen_number',
    'eccentricity',
    'aspect_ratio',
    'elongation',
    'solidity',
    'stochastic_convexity',
    'isoperimetric_factor',
    'maximal_indentation_depth',
    'lobedness',
    'average_intensity',
    'average_contrast',
    'smoothness',
    'third_moment',
    'uniformity',
    'entropy',
]

# Swap the default integer column labels for the descriptive names.
leaf_df.columns = cols

leaf_df.head(n=5)

Unnamed: 0,species,specimen_number,eccentricity,aspect_ratio,elongation,solidity,stochastic_convexity,isoperimetric_factor,maximal_indentation_depth,lobedness,average_intensity,average_contrast,smoothness,third_moment,uniformity,entropy
0,1,1,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.004657,0.003947,0.04779,0.12795,0.016108,0.005232,0.000275,1.1756
1,1,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.005242,0.005002,0.02416,0.090476,0.008119,0.002708,7.5e-05,0.69659
2,1,3,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.007457,0.010121,0.011897,0.057445,0.003289,0.000921,3.8e-05,0.44348
3,1,4,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.006877,0.008607,0.01595,0.065491,0.004271,0.001154,6.6e-05,0.58785
4,1,5,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.007938,0.045339,0.002051,0.00056,2.4e-05,0.34214


In [4]:
# Number of distinct species labels present in the table.
print(leaf_df['species'].nunique())

# For every species, count how many distinct specimen numbers were recorded.
specimen_ct = (
    leaf_df[['species', 'specimen_number']]
    .drop_duplicates()
    .groupby('species', as_index=False)['specimen_number']
    .agg('count')
    .sort_values(['species'])
)
specimen_ct.head()

30


Unnamed: 0,species,specimen_number
0,1,12
1,2,10
2,3,10
3,4,8
4,5,12


In [5]:
# Persist the full 30-class table (with named columns).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; this is a no-op when it is already there.
Path('structured_data').mkdir(parents=True, exist_ok=True)

print(leaf_df.shape)
leaf_df.to_csv('structured_data/leaf.csv', index=False)

(340, 16)


### Output Binary Class Data

In [6]:
# Start the binary-class variant from an independent copy of the 30-class
# frame, so later edits to it do not touch `leaf_df`.
binary_leaf_df = leaf_df.copy(deep=True)

print(binary_leaf_df.shape)
binary_leaf_df.head(n=5)

(340, 16)


Unnamed: 0,species,specimen_number,eccentricity,aspect_ratio,elongation,solidity,stochastic_convexity,isoperimetric_factor,maximal_indentation_depth,lobedness,average_intensity,average_contrast,smoothness,third_moment,uniformity,entropy
0,1,1,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.004657,0.003947,0.04779,0.12795,0.016108,0.005232,0.000275,1.1756
1,1,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.005242,0.005002,0.02416,0.090476,0.008119,0.002708,7.5e-05,0.69659
2,1,3,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.007457,0.010121,0.011897,0.057445,0.003289,0.000921,3.8e-05,0.44348
3,1,4,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.006877,0.008607,0.01595,0.065491,0.004271,0.001154,6.6e-05,0.58785
4,1,5,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.007938,0.045339,0.002051,0.00056,2.4e-05,0.34214


In [7]:
# Build the binary table from `leaf_df` with a non-mutating chain so that
# re-running this cell always yields the same result. An in-place drop fails
# once the column is gone, and relabelling already-binarized values would
# turn every label into 0.
#
# class 11 has more records than other classes, so it becomes the positive
# class (1); every other species becomes the negative class (0).
binary_leaf_df = (
    leaf_df
    .drop(columns='specimen_number')
    .assign(species=lambda frame: frame['species'].eq(11).astype('int64'))
)

In [8]:
# Class balance of the binary label: number of rows per `species` value.
binary_leaf_df.loc[:, 'species'].value_counts()

species
0    324
1     16
Name: count, dtype: int64

In [9]:
# Persist the binary-class table.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; this is a no-op when it is already there.
Path('structured_data').mkdir(parents=True, exist_ok=True)

print(binary_leaf_df.shape)
binary_leaf_df.to_csv('structured_data/binary_leaf.csv', index=False)

(340, 15)
