## Prac4: NB, K-NN, Decision Tree

In [None]:
''' Use Naive bayes, K-nearest, and Decision tree classification algorithms to build classifiers on any two datasets. Pre-process the datasets using techniques specified in Q2. Compare the Accuracy, Precision, Recall and F1 measure reported for each dataset using the abovementioned classifiers under the following situations:
    i. Using Holdout method (Random sampling):
        a) Training set = 80% Test set = 20%
        b) Training set = 66.6% (2/3rd of total), Test set = 33.3%
    ii. Using Cross-Validation:
        a) 10-fold
        b) 5-fold
'''

In [2]:
import pandas as pd
import numpy as np
# preprocessing
from sklearn.preprocessing import LabelEncoder
# split model for train and test sets
from sklearn.model_selection import train_test_split, cross_val_score
# naive bayes
from sklearn.naive_bayes import CategoricalNB
# k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

fruits = pd.read_csv('fruit_classification_dataset.csv')
print(fruits.head())

   size (cm)  shape  weight (g)  avg_price (₹)   color  taste     fruit_name
0       25.4  round      3089.2          137.1   green  sweet     watermelon
1       24.6  round      3283.9          163.8   green  sweet     watermelon
2        7.8  round       319.0           91.3   green  sweet  custard apple
3       20.0   oval      1607.0           85.7  orange  sweet         papaya
4       10.2   long       131.5           37.8  yellow  sweet         banana


## Preprocessing

In [3]:
le = LabelEncoder()
fruits['shape'] = le.fit_transform(fruits['shape'])
fruits['color'] = le.fit_transform(fruits['color'])
fruits['taste'] = le.fit_transform(fruits['taste'])
fruits['fruit_name'] = le.fit_transform(fruits['fruit_name'])
print(fruits.head())

   size (cm)  shape  weight (g)  avg_price (₹)  color  taste  fruit_name
0       25.4      2      3089.2          137.1      2      1          19
1       24.6      2      3283.9          163.8      2      1          19
2        7.8      2       319.0           91.3      2      1           5
3       20.0      1      1607.0           85.7      3      1          13
4       10.2      0       131.5           37.8      7      1           1


## k-nn with test size 20% or 1/5 or 0.2

In [25]:
# take all cols except fruit_name as features for independent variables
x = fruits.drop('fruit_name', axis=1)
# take fruit_name as target /dependent variable / predictor for classification
y = fruits['fruit_name']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)

k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
y_pred_decode = le.inverse_transform(y_pred)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_decode)
print(classification_report(y_test_decode, y_pred_decode))

['papaya' 'custard apple' 'blueberry' ... 'dragon fruit' 'papaya'
 'strawberry']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00        84
       banana       1.00      1.00      1.00       107
    blueberry       1.00      1.00      1.00        91
       cherry       1.00      1.00      1.00        97
      coconut       0.89      0.88      0.89       106
custard apple       1.00      1.00      1.00       106
 dragon fruit       1.00      1.00      1.00        98
        grape       1.00      1.00      1.00       110
        guava       1.00      1.00      1.00       100
         kiwi       1.00      1.00      1.00       102
       lychee       1.00      1.00      1.00       103
        mango       0.95      0.99      0.97        99
       orange       1.00      1.00      1.00       119
       papaya       1.00      1.00      1.00        96
         pear       1.00      1.00      1.00        81
    pineapple       0.88      0.90    

## nb with test size 20% or 1/5 or 0.2

In [27]:
nb = CategoricalNB()
categorical_features = ['shape', 'color', 'taste']

X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

nb.fit(X_train_cat, y_train)
y_pred_nb = nb.predict(X_test_cat)
y_pred_nb_decode = le.inverse_transform(y_pred_nb)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_nb_decode)
print(classification_report(y_test_decode, y_pred_nb_decode))

['papaya' 'custard apple' 'blueberry' ... 'dragon fruit' 'papaya' 'cherry']
               precision    recall  f1-score   support

        apple       0.00      0.00      0.00        84
       banana       1.00      1.00      1.00       107
    blueberry       1.00      1.00      1.00        91
       cherry       0.54      1.00      0.70        97
      coconut       1.00      1.00      1.00       106
custard apple       0.48      1.00      0.65       106
 dragon fruit       1.00      1.00      1.00        98
        grape       1.00      1.00      1.00       110
        guava       1.00      1.00      1.00       100
         kiwi       0.00      0.00      0.00       102
       lychee       1.00      1.00      1.00       103
        mango       1.00      1.00      1.00        99
       orange       1.00      1.00      1.00       119
       papaya       1.00      1.00      1.00        96
         pear       1.00      1.00      1.00        81
    pineapple       0.51      1.00      0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## decision tree with test size 20% or 1/5 or 0.2

In [47]:
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None
)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_pred_dt_decode = le.inverse_transform(y_pred_dt)
print(y_pred_dt_decode)
print(classification_report(y_test_decode, y_pred_dt_decode))

['mango' 'papaya' 'plum' ... 'banana' 'papaya' 'papaya']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00       156
       banana       1.00      1.00      1.00       148
    blueberry       1.00      1.00      1.00       164
       cherry       1.00      1.00      1.00       141
      coconut       1.00      1.00      1.00       179
custard apple       1.00      1.00      1.00       176
 dragon fruit       1.00      1.00      1.00       165
        grape       1.00      1.00      1.00       167
        guava       1.00      1.00      1.00       162
         kiwi       1.00      1.00      1.00       164
       lychee       1.00      1.00      1.00       177
        mango       1.00      1.00      1.00       165
       orange       1.00      1.00      1.00       165
       papaya       1.00      1.00      1.00       173
         pear       1.00      1.00      1.00       185
    pineapple       1.00      1.00      1.00       186
       

## Cross-Validation

In [43]:
print("KNN 5-Fold Accuracy:")
print(cross_val_score(knn, x, y, cv=5))
print("KNN 10-Fold Accuracy:")
print(cross_val_score(knn, x, y, cv=10))
print("===============================")
print("Naive Bayes 5-Fold Accuracy:")
print(cross_val_score(nb, x, y, cv=5))
print("Naive Bayes 10-Fold Accuracy:")
print(cross_val_score(nb, x, y, cv=10))
print("===============================")
print("Decision Tree 5-Fold Accuracy:")
print(cross_val_score(dt, x, y, cv=5))
print("Decision Tree 10-Fold Accuracy:")
print(cross_val_score(dt, x, y, cv=10))


KNN 5-Fold Accuracy:
[0.985  0.9835 0.9855 0.983  0.9865]
KNN 10-Fold Accuracy:
[0.986 0.987 0.99  0.982 0.986 0.989 0.988 0.989 0.99  0.985]
Naive Bayes 5-Fold Accuracy:
[ 1.  1.  1. nan  1.]
Naive Bayes 10-Fold Accuracy:
[ 1.  1.  1.  1.  1.  1.  1. nan  1.  1.]
Decision Tree 5-Fold Accuracy:
[1. 1. 1. 1. 1.]
Decision Tree 10-Fold Accuracy:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Traceback (most recent call last):
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 156, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/base.py", line 548, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/naive_bayes.py", line 106, in predict
    jll = self._joint_log_likelihood(X)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sk

## k-nn with test size 33.33% or 1/3 or 0.33

In [44]:
x = fruits.drop('fruit_name', axis=1)
y = fruits['fruit_name']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/3)

k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)
y_pred_decode = le.inverse_transform(y_pred)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_decode)
print(classification_report(y_test_decode, y_pred_decode))

['mango' 'papaya' 'plum' ... 'banana' 'papaya' 'papaya']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00       156
       banana       1.00      1.00      1.00       148
    blueberry       1.00      1.00      1.00       164
       cherry       1.00      1.00      1.00       141
      coconut       0.87      0.83      0.85       179
custard apple       1.00      1.00      1.00       176
 dragon fruit       1.00      1.00      1.00       165
        grape       1.00      1.00      1.00       167
        guava       0.99      0.99      0.99       162
         kiwi       1.00      1.00      1.00       164
       lychee       1.00      1.00      1.00       177
        mango       0.94      0.92      0.93       165
       orange       1.00      1.00      1.00       165
       papaya       1.00      1.00      1.00       173
         pear       0.99      0.99      0.99       185
    pineapple       0.84      0.88      0.86       186
       

## nb with test size 33.33% or 1/3 or 0.33

In [45]:
nb = CategoricalNB()
categorical_features = ['shape', 'color', 'taste']
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

nb.fit(X_train_cat, y_train)
y_pred_nb = nb.predict(X_test_cat)
y_pred_nb_decode = le.inverse_transform(y_pred_nb)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_nb_decode)
print(classification_report(y_test_decode, y_pred_nb_decode))

['mango' 'papaya' 'plum' ... 'banana' 'papaya' 'papaya']
               precision    recall  f1-score   support

        apple       0.00      0.00      0.00       156
       banana       1.00      1.00      1.00       148
    blueberry       1.00      1.00      1.00       164
       cherry       0.49      1.00      0.66       141
      coconut       1.00      1.00      1.00       179
custard apple       0.52      1.00      0.69       176
 dragon fruit       1.00      1.00      1.00       165
        grape       1.00      1.00      1.00       167
        guava       1.00      1.00      1.00       162
         kiwi       0.00      0.00      0.00       164
       lychee       1.00      1.00      1.00       177
        mango       1.00      1.00      1.00       165
       orange       1.00      1.00      1.00       165
       papaya       1.00      1.00      1.00       173
         pear       1.00      1.00      1.00       185
    pineapple       0.53      1.00      0.69       186
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## decision tree with test size 33.33% or 1/3 or 0.33

In [49]:
dt = DecisionTreeClassifier(
    criterion='entropy', max_depth=5, 
)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
y_pred_dt_decode = le.inverse_transform(y_pred_dt)
print(y_pred_dt_decode)
print(classification_report(y_test_decode, y_pred_dt_decode))

['mango' 'papaya' 'plum' ... 'banana' 'papaya' 'papaya']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00       156
       banana       1.00      1.00      1.00       148
    blueberry       1.00      1.00      1.00       164
       cherry       1.00      1.00      1.00       141
      coconut       1.00      1.00      1.00       179
custard apple       1.00      1.00      1.00       176
 dragon fruit       1.00      1.00      1.00       165
        grape       1.00      1.00      1.00       167
        guava       1.00      1.00      1.00       162
         kiwi       1.00      1.00      1.00       164
       lychee       1.00      1.00      1.00       177
        mango       1.00      1.00      1.00       165
       orange       1.00      1.00      1.00       165
       papaya       1.00      1.00      1.00       173
         pear       1.00      1.00      1.00       185
    pineapple       1.00      1.00      1.00       186
       

## Cross-Validation

In [50]:
print("KNN 5-Fold Accuracy:")
print(cross_val_score(knn, x, y, cv=5))
print("KNN 10-Fold Accuracy:")
print(cross_val_score(knn, x, y, cv=10))
print("===============================")
print("Naive Bayes 5-Fold Accuracy:")
print(cross_val_score(nb, x, y, cv=5))
print("Naive Bayes 10-Fold Accuracy:")
print(cross_val_score(nb, x, y, cv=10))
print("===============================")
print("Decision Tree 5-Fold Accuracy:")
print(cross_val_score(dt, x, y, cv=5))
print("Decision Tree 10-Fold Accuracy:")
print(cross_val_score(dt, x, y, cv=10))


KNN 5-Fold Accuracy:
[0.985  0.9835 0.9855 0.983  0.9865]
KNN 10-Fold Accuracy:
[0.986 0.987 0.99  0.982 0.986 0.989 0.988 0.989 0.99  0.985]
Naive Bayes 5-Fold Accuracy:
[ 1.  1.  1. nan  1.]
Naive Bayes 10-Fold Accuracy:
[ 1.  1.  1.  1.  1.  1.  1. nan  1.  1.]
Decision Tree 5-Fold Accuracy:
[1. 1. 1. 1. 1.]
Decision Tree 10-Fold Accuracy:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Traceback (most recent call last):
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 156, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/base.py", line 548, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sklearn/naive_bayes.py", line 106, in predict
    jll = self._joint_log_likelihood(X)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kimsan/miniconda3/envs/ml/lib/python3.12/site-packages/sk