Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# 100 Leaves Data

In [1]:
import pandas as pd

## About the Data

* The data files are downloaded from: https://archive.ics.uci.edu/ml/datasets/One-hundred+plant+species+leaves+data+set
  * 100 classes, each class has 16 records
    * The Tex file has only 1599 records (one fewer than the expected 100 x 16 = 1600), which is why the inner join of the 3 files yields 1599 records
  * 3 feature vectors, each feature vector has 64 attributes

In [2]:
# Load the three feature files (one per feature-vector prefix) into a list of
# DataFrames that all share the same layout:
#   first column      -> 'species' (class label)
#   remaining columns -> '<prefix>_1' .. '<prefix>_64'
fv_lst = ['Mar', 'Sha', 'Tex']
df_lst = []

for fv in fv_lst:
    # The raw files are read without a header row, so the columns are named here.
    df = pd.read_csv(f'100 leaves plant species/data_{fv}_64.txt', header=None)
    cols = ['species'] + [f'{fv}_{i+1}' for i in range(64)]
    df.columns = cols
    # Sort with a stable algorithm so rows of the same species keep their
    # original file order. The default quicksort is not stable and could order
    # the samples of one species differently in each file, which matters
    # because the frames are later combined row by row.
    df = df.sort_values(['species'], kind='mergesort').reset_index(drop=True)
    print(df.shape)
    df_lst.append(df)

(1600, 65)
(1600, 65)
(1599, 65)


In [3]:
# Preview the first five rows of the second feature table ('Sha' columns).
df_lst[1].head(n=5)

Unnamed: 0,species,Sha_1,Sha_2,Sha_3,Sha_4,Sha_5,Sha_6,Sha_7,Sha_8,Sha_9,...,Sha_55,Sha_56,Sha_57,Sha_58,Sha_59,Sha_60,Sha_61,Sha_62,Sha_63,Sha_64
0,Acer Campestre,0.00057,0.000526,0.000476,0.000438,0.000383,0.000341,0.000354,0.00042,0.00048,...,0.000332,0.000274,0.000219,0.000178,0.000193,0.000262,0.000335,0.000403,0.000466,0.000528
1,Acer Campestre,0.000754,0.000705,0.000702,0.000677,0.000606,0.000532,0.000573,0.000656,0.00066,...,0.000406,0.000338,0.000286,0.000274,0.000364,0.000461,0.000551,0.000638,0.000725,0.000795
2,Acer Campestre,0.000684,0.000647,0.000609,0.000603,0.000588,0.000544,0.000501,0.000477,0.000545,...,0.000407,0.000346,0.000286,0.000233,0.00025,0.000346,0.00044,0.000535,0.000621,0.000708
3,Acer Campestre,0.000772,0.000705,0.000657,0.000646,0.000613,0.000578,0.000568,0.000654,0.00073,...,0.000479,0.000407,0.000345,0.000294,0.000309,0.000413,0.000516,0.000606,0.000692,0.000789
4,Acer Campestre,0.000427,0.000502,0.000582,0.000597,0.00056,0.000522,0.000486,0.000456,0.000425,...,0.000636,0.000716,0.000686,0.000645,0.000605,0.000571,0.00057,0.000537,0.000482,0.000427


In [4]:
# Combine the per-feature tables into one wide table: 'species' followed by
# the feature columns of every table in df_lst.
#
# The tables are joined on (species, n-th row of that species) rather than on
# the positional index. They do not all have the same number of rows (see the
# shapes printed when they were loaded), so a purely positional join pairs
# every row after the missing record with features that belong to a different
# sample and, at species boundaries, to a different species.
# NOTE(review): which sample is absent from the shorter table cannot be
# determined here; within that species the pairing follows the row order the
# loading cell produced - confirm against the raw files if it matters.
keyed_lst = [
    fv_df.assign(sample_no=fv_df.groupby('species').cumcount())
    for fv_df in df_lst
]

all_df = keyed_lst[0]
for fv_df in keyed_lst[1:]:
    # (species, sample_no) is unique per table by construction; validate
    # makes pandas raise instead of silently duplicating rows if it is not.
    all_df = pd.merge(all_df, fv_df, on=['species', 'sample_no'],
                      how='inner', validate='one_to_one')

# The helper key was only needed for alignment.
all_df = all_df.drop(columns='sample_no')

print(all_df.shape)
all_df.head()

(1599, 193)


Unnamed: 0,species,Mar_1,Mar_2,Mar_3,Mar_4,Mar_5,Mar_6,Mar_7,Mar_8,Mar_9,...,Tex_55,Tex_56,Tex_57,Tex_58,Tex_59,Tex_60,Tex_61,Tex_62,Tex_63,Tex_64
0,Acer Campestre,0.003906,0.003906,0.027344,0.033203,0.007812,0.017578,0.023438,0.005859,0.0,...,0.10352,0.0,0.001953,0.000977,0.022461,0.0,0.0,0.001953,0.0,0.027344
1,Acer Campestre,0.017578,0.011719,0.023438,0.019531,0.003906,0.011719,0.015625,0.0,0.0,...,0.15332,0.0,0.010742,0.0,0.007812,0.0,0.0,0.0,0.0,0.021484
2,Acer Campestre,0.009766,0.021484,0.019531,0.027344,0.003906,0.025391,0.023438,0.0,0.001953,...,0.12109,0.0,0.019531,0.0,0.003906,0.0,0.0,0.0,0.0,0.012695
3,Acer Campestre,0.015625,0.009766,0.025391,0.027344,0.001953,0.001953,0.011719,0.0,0.001953,...,0.12402,0.0,0.000977,0.0,0.021484,0.0,0.0,0.0,0.0,0.014648
4,Acer Campestre,0.017578,0.041016,0.017578,0.005859,0.003906,0.027344,0.017578,0.003906,0.0,...,0.044922,0.0,0.003906,0.0,0.012695,0.0,0.0,0.0,0.0,0.004883


In [6]:
# Encode the species names as integer class ids, then keep only the ids.
from sklearn import preprocessing

species_names = all_df['species']
le = preprocessing.LabelEncoder().fit(species_names)
all_df['species_int'] = le.transform(species_names)

# Both counts should agree: one integer id per distinct species name.
print(species_names.nunique(), all_df['species_int'].nunique())

# The text label is replaced by 'species_int' from here on.
all_df.drop(columns=['species'], inplace=True)
all_df.head()

100 100


Unnamed: 0,Mar_1,Mar_2,Mar_3,Mar_4,Mar_5,Mar_6,Mar_7,Mar_8,Mar_9,Mar_10,...,Tex_56,Tex_57,Tex_58,Tex_59,Tex_60,Tex_61,Tex_62,Tex_63,Tex_64,species_int
0,0.003906,0.003906,0.027344,0.033203,0.007812,0.017578,0.023438,0.005859,0.0,0.015625,...,0.0,0.001953,0.000977,0.022461,0.0,0.0,0.001953,0.0,0.027344,0
1,0.017578,0.011719,0.023438,0.019531,0.003906,0.011719,0.015625,0.0,0.0,0.03125,...,0.0,0.010742,0.0,0.007812,0.0,0.0,0.0,0.0,0.021484,0
2,0.009766,0.021484,0.019531,0.027344,0.003906,0.025391,0.023438,0.0,0.001953,0.023438,...,0.0,0.019531,0.0,0.003906,0.0,0.0,0.0,0.0,0.012695,0
3,0.015625,0.009766,0.025391,0.027344,0.001953,0.001953,0.011719,0.0,0.001953,0.013672,...,0.0,0.000977,0.0,0.021484,0.0,0.0,0.0,0.0,0.014648,0
4,0.017578,0.041016,0.017578,0.005859,0.003906,0.027344,0.017578,0.003906,0.0,0.017578,...,0.0,0.003906,0.0,0.012695,0.0,0.0,0.0,0.0,0.004883,0


In [7]:
# Write the combined table to disk without the row index.
output_path = 'structured_data/100leaves.csv'
all_df.to_csv(output_path, index=False)