In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("plants.csv", encoding = "latin1")
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen


Dropping duplicates, so onlly unique parameters remain. Saving edited dataset into new file for further exploatation.

In [6]:
df = df.drop_duplicates(subset=df.columns[1:], keep='first')
df = df.reset_index(drop=True)
df.to_csv("dupa.csv", index=False)

In [7]:
#fixing spelling error
df['Watering'][df["Watering"]=="Regular watering"]="Regular Watering"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Watering'][df["Watering"]=="Regular watering"]="Regular Watering"


In [8]:
df['Watering'].unique()

array(['Water weekly', 'Keep soil evenly moist', 'Water when soil is dry',
       'Let soil dry between watering', 'Keep soil consistently moist',
       'Water when soil feels dry', 'Keep soil slightly moist',
       'Water when topsoil is dry', 'Keep soil moist', 'Regular Watering',
       'Regular, well-drained soil', 'Regular, moist soil'], dtype=object)

In [9]:
moisture_levels = [
    "Keep soil consistently moist",
    "Keep soil evenly moist",
    "Keep soil moist",
    "Keep soil slightly moist",
    "Regular, moist soil",
    "Regular Watering",
    "Regular, well-drained soil",
    "Water weekly",
    "Water when soil feels dry",
    "Water when topsoil is dry",
    "Water when soil is dry",
    "Let soil dry between watering"
]
level = np.linspace(1,-1,len(moisture_levels))
mapping = dict(zip(moisture_levels,level))
mapping

df['Watering Frequency']=df["Watering"].map(mapping)
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type,Watering Frequency
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced,-0.272727
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic,0.818182
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No,-0.818182
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No,-1.0
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen,-1.0


In [10]:
df['Fertilization Type'].unique()

array(['Balanced', 'Organic', 'No', 'Low-nitrogen', 'Acidic'],
      dtype=object)

In [11]:
health_level =[
    "Balanced",
    "Organic",
    "No",
    "Low-nitrogen",
    "Acidic"
]

level = np.linspace(1,-1, len(health_level))
mapping = dict(zip(health_level, level))

df["Health Indicator"] = df['Fertilization Type'].map(mapping)

In [12]:
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type,Watering Frequency,Health Indicator
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced,-0.272727,1.0
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic,0.818182,0.5
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No,-0.818182,0.0
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No,-1.0,0.0
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen,-1.0,-0.5


In [13]:
df['Growth'].unique()

array(['slow', 'fast', 'moderate'], dtype=object)

In [14]:
efficiency_level=[
    "fast",
    "moderate",
    "slow"
]

level = np.linspace(1,-1,len(efficiency_level))

mapping = dict(zip(efficiency_level, level))
df["Efficiency"] = df['Growth'].map(mapping)

In [15]:
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type,Watering Frequency,Health Indicator,Efficiency
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced,-0.272727,1.0,-1.0
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic,0.818182,0.5,1.0
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No,-0.818182,0.0,-1.0
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No,-1.0,0.0,0.0
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen,-1.0,-0.5,-1.0


In [16]:
df['Soil'].unique()

array(['sandy', 'well-drained', 'loamy', 'acidic', 'moist'], dtype=object)

In [17]:
soil = [
    "loamy",
    "moist",
    "well-drained",
    "sandy",
    "acidic"
]
level = np.linspace(1,-1,len(soil))

mapping = dict(zip(soil,level))

df['Soil goodness'] = df['Soil'].map(mapping)

df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type,Watering Frequency,Health Indicator,Efficiency,Soil goodness
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced,-0.272727,1.0,-1.0,-0.5
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic,0.818182,0.5,1.0,0.0
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No,-0.818182,0.0,-1.0,0.0
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No,-1.0,0.0,0.0,-0.5
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen,-1.0,-0.5,-1.0,-0.5


In [18]:
df['Sunlight'].unique()

array(['indirect sunlight', 'full sunlight', 'partial sunlight'],
      dtype=object)

In [19]:
sun =['full sunlight',
    'partial sunlight',
     'indirect sunlight'
]

level =np.linspace(1,-1,len(sun))
mapping = dict(zip(sun,level))

df['sun liking'] = df['Sunlight'].map(mapping)

In [20]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification


ModuleNotFoundError: No module named 'transformers'

In [21]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [22]:
from scipy.special import softmax
encoded_text = tokenizer("i like swimming", return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': np.float32(0.0044351006), 'roberta_neu': np.float32(0.14534467), 'roberta_pos': np.float32(0.85022026)}


In [23]:
df.head()

Unnamed: 0,Plant Name,Growth,Soil,Sunlight,Watering,Fertilization Type,Watering Frequency,Health Indicator,Efficiency,Soil goodness,sun liking
0,Aloe Vera,slow,sandy,indirect sunlight,Water weekly,Balanced,-0.272727,1.0,-1.0,-0.5,-1.0
1,Basil,fast,well-drained,full sunlight,Keep soil evenly moist,Organic,0.818182,0.5,1.0,0.0,1.0
2,Snake Plant,slow,well-drained,indirect sunlight,Water when soil is dry,No,-0.818182,0.0,-1.0,0.0,-1.0
3,Lavender,moderate,sandy,full sunlight,Let soil dry between watering,No,-1.0,0.0,0.0,-0.5,1.0
4,Cactus,slow,sandy,full sunlight,Let soil dry between watering,Low-nitrogen,-1.0,-0.5,-1.0,-0.5,1.0


In [24]:
X = df.iloc[:,6:].values
y = df.iloc[:,0].values
y

array(['Aloe Vera', 'Basil', 'Snake Plant', 'Lavender', 'Cactus',
       'Rosemary', 'Mint', 'Thyme', 'Peace Lily', 'Spider Plant',
       'Parsley', 'Fern (Boston)', 'Bamboo Palm', 'Money Plant',
       'Jade Plant', 'Orchids', 'Cilantro', 'Dill', 'Oregano', 'Pothos',
       'Bougainvillea', 'Fiddle Leaf Fig', 'Sage', 'Cosmos', 'Geranium',
       'Marigold', 'Zinnia', 'Hibiscus', 'Begonia', 'Chrysanthemum',
       'Daffodil', 'Tulip', 'Jasmine', 'Ivy', 'Nasturtium', 'Carnation',
       'Pansy', 'Sweet Pea', 'Verbena', 'Gerbera Daisy', 'Hyacinth',
       'Foxglove', 'Bluebell', 'Camellia', 'Magnolia', 'Bougainvillea',
       'Gardenia', 'Azalea', 'Chrysanthemum', 'Eucalyptus', 'Lotus',
       'Orchid Cactus', 'Lotus Bamboo', 'Coneflower', 'Alyssum', 'Pansy',
       'Begonia', 'Lobelia', 'Geranium', 'Zinnia', 'Morning Glory',
       'Aster', 'Sunflower', 'Coral Bells', 'Bleeding Heart', 'Astilbe',
       'Catnip', 'Lavender', 'Sage', 'Cilantro', 'Dill', 'Bay Laurel',
       'Lemon Balm'

In [25]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X,y)


In [26]:
import pickle

In [None]:
filename = 'knn_classifier.svn'
pickle.dump(classifier, open(filename, 'wb'))

TypeError: 'str' object cannot be interpreted as an integer

In [None]:
classifier.predict([[1,0.67,1,0,1]])

array(['Basil'], dtype=object)

In [None]:
print(df[df['Plant Name']=='Basil'][:6])

  Plant Name Growth          Soil       Sunlight                Watering  \
1      Basil   fast  well-drained  full sunlight  Keep soil evenly moist   

  Fertilization Type  Watering Frequency  Health Indicator  Efficiency  \
1            Organic            0.818182               0.5         1.0   

   Soil goodness  sun liking  
1            0.0         1.0  
