# Naive Bayes Classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("utkarshsaxenadn/fruits-classification")

print("Path to dataset files:", path)

Path to dataset files: /Users/kimsan/.cache/kagglehub/datasets/utkarshsaxenadn/fruits-classification/versions/1


In [4]:
df = pd.read_csv('../dataset/fruit_classification_dataset.csv')
print(df.head)
print(df.columns)
print(df.describe())

<bound method NDFrame.head of       size (cm)  shape  weight (g)  avg_price (₹)   color  taste  \
0          25.4  round      3089.2          137.1   green  sweet   
1          24.6  round      3283.9          163.8   green  sweet   
2           7.8  round       319.0           91.3   green  sweet   
3          20.0   oval      1607.0           85.7  orange  sweet   
4          10.2   long       131.5           37.8  yellow  sweet   
...         ...    ...         ...            ...     ...    ...   
9995        3.7  round        49.7           72.7  purple  tangy   
9996        2.8   oval        18.1           65.5     red  sweet   
9997        8.7   oval       177.7           72.2   green  sweet   
9998       18.0  round      1165.0          107.0   brown  sweet   
9999        7.2  round       252.1          111.3     red  sweet   

         fruit_name  
0        watermelon  
1        watermelon  
2     custard apple  
3            papaya  
4            banana  
...             ...  

In [None]:
# check unique values in each String attribute
print(pd.unique(df['shape']))
print(pd.unique(df['color']))
print(pd.unique(df['taste']))

In [9]:
# Initialize LabelEncoder
le = LabelEncoder()
# Encode each column separately
df['shape_encoded'] = le.fit_transform(df['shape'])
df['color_encoded'] = le.fit_transform(df['color'])
df['taste_encoded'] = le.fit_transform(df['taste'])

df['fruit_name_encoded'] = le.fit_transform(df['fruit_name'])
print(df[['shape', 'shape_encoded', 'color', 'color_encoded', 'taste', 'taste_encoded', 'fruit_name', 'fruit_name_encoded']].head())
print("whole dataset:\n",df.head(3))

   shape  shape_encoded   color  color_encoded  taste  taste_encoded  \
0  round              2   green              2  sweet              1   
1  round              2   green              2  sweet              1   
2  round              2   green              2  sweet              1   
3   oval              1  orange              3  sweet              1   
4   long              0  yellow              7  sweet              1   

      fruit_name  fruit_name_encoded  
0     watermelon                  19  
1     watermelon                  19  
2  custard apple                   5  
3         papaya                  13  
4         banana                   1  
whole dataset:
    size (cm)  shape  weight (g)  avg_price (₹)  color  taste     fruit_name  \
0       25.4  round      3089.2          137.1  green  sweet     watermelon   
1       24.6  round      3283.9          163.8  green  sweet     watermelon   
2        7.8  round       319.0           91.3  green  sweet  custard apple   



In [None]:
# mapping = {
#     'shape': {'round': 0, 'oval': 1, 'long': 2},
#     'color': {'green': 0, 'orange': 1, 'yellow': 2, 'red': 3, 'purple': 4, 'blue': 5, 'brown': 6, 'pink': 7},
#     'taste': {'sweet': 0, 'tangy': 1, 'sour': 2}
# }

# df['shape_transformed'] = df['shape'].map(mapping['shape'])
# df['color_transformed'] = df['color'].map(mapping['color'])
# df['taste_transformed'] = df['taste'].map(mapping['taste'])
# print(df.head())

In [15]:
# Independent Variable
# ( we drop the target column(fruit_name, fruit_name_encoded
#  and all string columns(shape, color, taste) )
x = df.drop(['fruit_name', 'fruit_name_encoded', 'shape', 'color', 'taste'] , axis=1)
# Dependent Variable
y = df['fruit_name_encoded']



In [17]:
# Split into train and test sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=1/3)

In [None]:
# Naive Bayes Classifier
nb = GaussianNB()
nb.fit(Xtrain, Ytrain)
y_predict_nb = nb.predict(Xtest)
y_predict_nb_decoded = le.inverse_transform(y_predict_nb)
y_test_nb_decoded = le.inverse_transform(Ytest)
print("=== Naive Bayes Metrics ===")
print('Accuracy: ', accuracy_score(y_test_nb_decoded, y_predict_nb_decoded))
print('Precision: ', precision_score(y_test_nb_decoded, y_predict_nb_decoded, average='macro'))
print('Recall: ',recall_score(y_test_nb_decoded, y_predict_nb_decoded, average='macro'))
print('F1 Score: ', f1_score(y_test_nb_decoded, y_predict_nb_decoded, average='macro'))
print('Classification report: ', classification_report(y_test_nb_decoded, y_predict_nb_decoded))

=== Naive Bayes Metrics ===
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1 Score:  1.0
classification report:                 precision    recall  f1-score   support

        apple       1.00      1.00      1.00       170
       banana       1.00      1.00      1.00       184
    blueberry       1.00      1.00      1.00       165
       cherry       1.00      1.00      1.00       169
      coconut       1.00      1.00      1.00       161
custard apple       1.00      1.00      1.00       172
 dragon fruit       1.00      1.00      1.00       183
        grape       1.00      1.00      1.00       166
        guava       1.00      1.00      1.00       161
         kiwi       1.00      1.00      1.00       161
       lychee       1.00      1.00      1.00       164
        mango       1.00      1.00      1.00       161
       orange       1.00      1.00      1.00       164
       papaya       1.00      1.00      1.00       171
         pear       1.00      1.00      1.00       172
    pine

In [23]:
# K-Nearest Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, Ytrain)
y_predict_knn = knn.predict(Xtest)
y_predict_knn_decoded = le.inverse_transform(y_predict_knn)
y_test_knn_decoded = le.inverse_transform(Ytest)

print("=== KNN Metrics ===")
print('Accuracy: ', accuracy_score(y_test_knn_decoded, y_predict_knn_decoded))
print('Precision: ', precision_score(y_test_knn_decoded, y_predict_knn_decoded, average='macro'))
print('Recall: ',recall_score(y_test_knn_decoded, y_predict_knn_decoded, average='macro'))
print('F1 Score: ', f1_score(y_test_knn_decoded, y_predict_knn_decoded, average='macro'))
print('Classification report: ' ,classification_report(y_test_knn_decoded, y_predict_knn_decoded))

=== KNN Metrics ===
Accuracy:  0.9832033593281344
Precision:  0.9836242491548818
Recall:  0.983115363942078
F1 Score:  0.9832443289315739
Classification report:                 precision    recall  f1-score   support

        apple       1.00      1.00      1.00       170
       banana       1.00      1.00      1.00       184
    blueberry       1.00      1.00      1.00       165
       cherry       1.00      1.00      1.00       169
      coconut       0.91      0.84      0.87       161
custard apple       1.00      1.00      1.00       172
 dragon fruit       1.00      1.00      1.00       183
        grape       1.00      1.00      1.00       166
        guava       1.00      0.99      1.00       161
         kiwi       1.00      1.00      1.00       161
       lychee       1.00      1.00      1.00       164
        mango       0.96      0.94      0.95       161
       orange       1.00      1.00      1.00       164
       papaya       1.00      1.00      1.00       171
         pea