In [1]:
import pandas as pd

In [2]:
# Load the full plant table; the relative path is resolved against the
# current working directory.
PLANTS_CSV = "all-plant-data.csv"
df_plants = pd.read_csv(PLANTS_CSV)

In [3]:
# List the column labels of the freshly loaded frame.
df_plants.columns

Index(['scientific_name', 'common_name', 'family', 'height (feet)',
       'height_min', 'height_max', 'spread (feet)', 'spread_min', 'spread_max',
       'type', 'bloom_time', 'bloom_start', 'bloom_end', 'maintenance', 'sun',
       'sun_max', 'sun_min', 'water', 'zone', 'zone_min', 'zone_max',
       'bloom_desc', 'native_range'],
      dtype='object')

In [4]:
# (rows, columns) of the raw frame, before any cleaning.
df_plants.shape

(8448, 23)

In [5]:
# Keep only the rows that have a value in every column.
# NOTE(review): this is a strict filter - a row is discarded as soon as any
# single column is missing. If only some columns matter for the analysis,
# consider restricting the check with `subset=[...]`.
df_plants = df_plants.dropna(axis=0, how="any")
df_plants.shape

(1194, 23)

In [6]:
# Make sure bloom_time holds text, so the string operations applied to it
# in the following cells work on every row.
df_plants = df_plants.assign(bloom_time=df_plants["bloom_time"].astype(str))

In [7]:
# split on "to" and create 2 new columns
df_plants[["bloom_start", "blomm_end"]] = df_plants["bloom_time"].str.split(" to ", expand=True)

In [8]:
# remove leading and trailing whitespaces
df_plants["bloom_start"] = df_plants["bloom_start"].apply(lambda x: x.strip())
df_plants["blomm_end"] = df_plants["blomm_end"].apply(lambda x: x.strip())

In [9]:
# Convert the month names to datetimes. "%B" is the strftime code for a full
# month name (format reference: https://strftime.org/); errors="coerce" turns
# any value that does not match into NaT instead of raising.
bloom_cols = ["bloom_start", "bloom_end"]
df_plants[bloom_cols] = df_plants[bloom_cols].apply(
    lambda col: pd.to_datetime(col, format="%B", errors="coerce")
)

You can either use the months (12 features for each column) or convert them to seasons, in order to have fewer features derived from the date alone.

In [10]:
# OPTION 1
# extracting month

# The .dt accessor reads the month number (1-12) for the whole column at once
# rather than calling a Python lambda per row; rows whose date is NaT come
# out as NaN.
df_plants["bloom_start_month"] = df_plants["bloom_start"].dt.month
df_plants["bloom_end_month"] = df_plants["bloom_end"].dt.month

In [11]:
# option 2 - find seasons
# Collapse the month number into a code 1-4 with (month % 12) // 3 + 1:
#   12, 1, 2 -> 1    3, 4, 5 -> 2    6, 7, 8 -> 3    9, 10, 11 -> 4
# NOTE(review): presumably 1=winter ... 4=autumn (northern hemisphere) - confirm.
month_cols = ["bloom_start_month", "bloom_end_month"]
season_cols = ["bloom_start_season", "bloom_end_season"]
df_plants[season_cols] = (df_plants[month_cols] % 12) // 3 + 1

In [12]:
# Sanity check: the season column must have exactly as many non-missing
# entries as the month column it was derived from.
n_start_seasons = df_plants["bloom_start_season"].count()
n_start_months = df_plants["bloom_start_month"].count()
assert n_start_seasons == n_start_months

In [13]:
# Show the transformed frame (the notebook display elides the middle rows and columns).
df_plants

Unnamed: 0,scientific_name,common_name,family,height (feet),height_min,height_max,spread (feet),spread_min,spread_max,type,...,zone,zone_min,zone_max,bloom_desc,native_range,blomm_end,bloom_start_month,bloom_end_month,bloom_start_season,bloom_end_season
0,Abelia chinensis,Chinese Abelia,Caprifoliaceae,5.00 to 8.00,5.0,8.0,3.00 to 5.00,3.00,5.0,Deciduous shrub,...,7 to 9,7.0,9.0,White,China,September,7,9.0,3,4.0
2,Abelia × grandiflora,Glossy Abelia,Caprifoliaceae,3.00 to 6.00,3.0,6.0,3.00 to 6.00,3.00,6.0,Deciduous shrub,...,5 to 9,5.0,9.0,White/flushed pink,origin unknown,September,5,9.0,2,4.0
11,Abeliophyllum distichum,White Forsythia,Oleaceae,3.00 to 5.00,3.0,5.0,3.00 to 4.00,3.00,4.0,Deciduous shrub,...,5 to 8,5.0,8.0,"White, sometimes with a pink tinge",Korea,April,3,4.0,2,2.0
24,Aconitum napellus,Monkshood,Ranunculaceae,2.00 to 4.00,2.0,4.0,1.00 to 1.50,1.00,1.5,Herbaceous perennial,...,3 to 7,3.0,7.0,Deep purplish blue,Europe,August,7,8.0,3,3.0
26,Aconitum volubile,Monkshood,Ranunculaceae,8.00 to 12.00,8.0,12.0,1.50 to 3.00,1.50,3.0,Vine,...,4 to 8,4.0,8.0,Purple,Eastern Asia,September,8,9.0,3,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8416,Diarrhena obovata,American Beakgrain,Poaceae,2.00 to 3.00,2.0,3.0,2.00 to 3.00,2.00,3.0,Ornamental grass,...,4 to 8,4.0,8.0,Green with yellow anthers,"United States, Canada",October,6,10.0,3,4.0
8441,Diervilla lonicera,Bush Honeysuckle,Caprifoliaceae,2.00 to 3.00,2.0,3.0,2.00 to 4.00,2.00,4.0,Deciduous shrub,...,3 to 7,3.0,7.0,Yellow,Eastern North America,July,6,7.0,3,3.0
8442,Diervilla rivularis,Mountain Bush Honeysuckle,Caprifoliaceae,3.00 to 6.00,3.0,6.0,4.00 to 8.00,4.00,8.0,Deciduous shrub,...,5 to 7,5.0,7.0,Yellow,Southeastern United States,August,7,8.0,3,3.0
8444,Diervilla sessilifolia,Bush Honeysuckle,Caprifoliaceae,3.00 to 5.00,3.0,5.0,3.00 to 5.00,3.00,5.0,Deciduous shrub,...,5 to 8,5.0,8.0,Sulfur yellow,Southeastern United States,July,6,7.0,3,3.0
