In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import silhouette_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [2]:
# Relative path to the plant catalogue; it is resolved against the kernel's
# current working directory.
PLANT_DATA_CSV = "all-plant-data.csv"

# Parse the CSV into a DataFrame using pandas' default settings.
plant_data = pd.read_csv(PLANT_DATA_CSV)

# Bare last expression: let the notebook render the frame.
plant_data

Unnamed: 0,scientific_name,common_name,family,height (feet),height_min,height_max,spread (feet),spread_min,spread_max,type,...,maintenance,sun,sun_max,sun_min,water,zone,zone_min,zone_max,bloom_desc,native_range
0,Abelia chinensis,Chinese Abelia,Caprifoliaceae,5.00 to 8.00,5.0,8.0,3.00 to 5.00,3.00,5.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,7 to 9,7.0,9.0,White,China
1,Abelia 'Edward Goucher',Abelia,Caprifoliaceae,3.00 to 5.00,3.0,5.0,3.00 to 5.00,3.00,5.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,6 to 9,6.0,9.0,Lavender - pink,
2,Abelia × grandiflora,Glossy Abelia,Caprifoliaceae,3.00 to 6.00,3.0,6.0,3.00 to 6.00,3.00,6.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,5 to 9,5.0,9.0,White/flushed pink,origin unknown
3,Abelia × grandiflora 'Hopleys' MISS LEMON,Glossy Abelia,Caprifoliaceae,3.00 to 4.00,3.0,4.0,3.00 to 4.00,3.00,4.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,6 to 9,6.0,9.0,Light pink to white or lilac,
4,Abelia × grandiflora 'Kaleidoscope',Glossy Abelia,Caprifoliaceae,2.00 to 2.50,2.0,2.5,3.00 to 4.00,3.00,4.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,5 to 9,5.0,9.0,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8443,Diervilla rivularis 'Morton' SUMMER STARS,Mountain Bush Honeysuckle,Caprifoliaceae,2.00 to 3.00,2.0,3.0,3.00 to 4.00,3.00,4.0,Deciduous shrub,...,Low,Full sun to part shade,full sun,part shade,Medium,5 to 7,5.0,7.0,Yellow,
8444,Diervilla sessilifolia,Bush Honeysuckle,Caprifoliaceae,3.00 to 5.00,3.0,5.0,3.00 to 5.00,3.00,5.0,Deciduous shrub,...,Medium,Full sun to part shade,full sun,part shade,Medium,5 to 8,5.0,8.0,Sulfur yellow,Southeastern United States
8445,Diervilla sessilifolia 'LPDC Podaras' COOL SPLASH,Southern Bush Honeysuckle,Caprifoliaceae,2.00 to 3.00,2.0,3.0,2.00 to 3.00,2.00,3.0,Deciduous shrub,...,Medium,Full sun to part shade,full sun,part shade,Medium,4 to 8,4.0,8.0,Yellow,
8446,Diervilla splendens 'El Madrigal' FIREFLY NIGH...,Bush Honeysuckle,Caprifoliaceae,2.00 to 3.00,2.0,3.0,2.00 to 3.00,2.00,3.0,Deciduous shrub,...,Low,Full sun,full sun,,Medium,4 to 8,4.0,8.0,Yellow,


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the differing counts reveal which of them contain missing values.
plant_data.describe()

Unnamed: 0,height_min,height_max,spread_min,spread_max,zone_min,zone_max
count,8448.0,8448.0,8446.0,8446.0,8437.0,8433.0
mean,5.73668,9.124192,4.358235,7.019595,4.812493,8.670343
std,10.601969,16.698992,7.261354,11.410883,1.957568,1.203625
min,0.0,0.25,0.0,0.0,1.0,5.0
25%,1.0,1.5,1.0,1.5,3.0,8.0
50%,2.0,3.0,2.0,3.0,4.0,8.0
75%,5.0,8.0,4.0,6.0,5.0,9.0
max,180.0,275.0,75.0,125.0,12.0,12.0


In [19]:
# Per-column dtypes: distinguishes the float64 columns from the object
# (string) columns, which is what the dtype-based feature split relies on.
plant_data.dtypes

scientific_name     object
common_name         object
family              object
height (feet)       object
height_min         float64
height_max         float64
spread (feet)       object
spread_min         float64
spread_max         float64
type                object
bloom_time          object
bloom_start         object
bloom_end           object
maintenance         object
sun                 object
sun_max             object
sun_min             object
water               object
zone                object
zone_min           float64
zone_max           float64
bloom_desc          object
native_range        object
dtype: object

In [12]:
# The scientific name is the target; every other column is a feature.
target_column = 'scientific_name'
y = plant_data[target_column]
X = plant_data.drop(columns=target_column)

# Shuffle and hold out a test partition (train_test_split's default size).
# random_state=0 makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [13]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, everything else as numeric.
categorical_features = X.select_dtypes(include='O').columns
numeric_features = X.select_dtypes(exclude='O').columns

# Numeric branch: fill missing values with the column median, then
# standardise each column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encode. handle_unknown="ignore" means a category
# not seen during fit does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group through its own branch and concatenate the results.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [18]:
# Chain the preprocessing step with a K-Means clusterer (4 clusters) so raw
# rows can be clustered in a single fit/predict call.
#
# K-Means is an unsupervised clustering model, not a classifier. The step key
# "classifier" is kept unchanged only so that any existing lookups by that
# name keep working.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases; pin it explicitly if results must match across versions.
kmeans = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("classifier", KMeans(n_clusters=4, random_state=0))]
)

# Clustering uses no labels: KMeans ignores `y`, so only the features are
# passed. The fitted pipeline is the cell's displayed result.
kmeans.fit(X_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['height_min', 'height_max', 'spread_min', 'spread_max', 'zone_min',
       'zone_max'],
      dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['common_name', 'family', 'height (feet)', 'spread (feet)', 'type',
       'bloom_time', 'bloom_start', 'bloom_end', 'maintenance', 'sun',
       'sun_max', 'sun_min', 'water', 'zone', 'bloom_d