## Prac4: NB, K-NN, Decision Tree

In [None]:
''' Use Naive bayes, K-nearest, and Decision tree classification algorithms to build classifiers on any two datasets. Pre-process the datasets using techniques specified in Q2. Compare the Accuracy, Precision, Recall and F1 measure reported for each dataset using the abovementioned classifiers under the following situations:
    i. Using Holdout method (Random sampling):
        a) Training set = 80% Test set = 20%
        b) Training set = 66.6% (2/3rd of total), Test set = 33.3%
    ii. Using Cross-Validation:
        a) 10-fold
        b) 5-fold
'''

In [34]:
import pandas as pd
import numpy as np
# preprocessing
from sklearn.preprocessing import LabelEncoder
# split model for train and test sets
from sklearn.model_selection import train_test_split
# naive bayes
from sklearn.naive_bayes import CategoricalNB
# k-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
# decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

fruits = pd.read_csv('fruit_classification_dataset.csv')
print(fruits.head())

   size (cm)  shape  weight (g)  avg_price (₹)   color  taste     fruit_name
0       25.4  round      3089.2          137.1   green  sweet     watermelon
1       24.6  round      3283.9          163.8   green  sweet     watermelon
2        7.8  round       319.0           91.3   green  sweet  custard apple
3       20.0   oval      1607.0           85.7  orange  sweet         papaya
4       10.2   long       131.5           37.8  yellow  sweet         banana


## Preprocessing

In [35]:
le = LabelEncoder()
fruits['shape'] = le.fit_transform(fruits['shape'])
fruits['color'] = le.fit_transform(fruits['color'])
fruits['taste'] = le.fit_transform(fruits['taste'])
fruits['fruit_name'] = le.fit_transform(fruits['fruit_name'])
print(fruits.head())

   size (cm)  shape  weight (g)  avg_price (₹)  color  taste  fruit_name
0       25.4      2      3089.2          137.1      2      1          19
1       24.6      2      3283.9          163.8      2      1          19
2        7.8      2       319.0           91.3      2      1           5
3       20.0      1      1607.0           85.7      3      1          13
4       10.2      0       131.5           37.8      7      1           1


## k-nn with test size 20% or 1/5 or 0.2

In [43]:
# take all cols except fruit_name as features for independent variables
x = fruits.drop('fruit_name', axis=1)
# take fruit_name as target /dependent variable / predictor for classification
y = fruits['fruit_name']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/5)
# print(X_train)
# print(X_test)
# print(y_train)
# print(y_test)

k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)
y_pred_decode = le.inverse_transform(y_pred)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_decode)
print(classification_report(y_test_decode, y_pred_decode))

['kiwi' 'pomegranate' 'kiwi' ... 'guava' 'blueberry' 'banana']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00        99
       banana       1.00      1.00      1.00       106
    blueberry       1.00      1.00      1.00        96
       cherry       1.00      1.00      1.00        85
      coconut       0.88      0.85      0.87       107
custard apple       1.00      1.00      1.00       104
 dragon fruit       1.00      1.00      1.00       103
        grape       1.00      1.00      1.00       112
        guava       0.97      1.00      0.98        86
         kiwi       1.00      1.00      1.00        93
       lychee       1.00      1.00      1.00        91
        mango       0.99      0.96      0.98       102
       orange       1.00      1.00      1.00        96
       papaya       1.00      1.00      1.00       100
         pear       1.00      0.98      0.99       123
    pineapple       0.85      0.88      0.87       104
 

## nb with test size 20% or 1/5 or 0.2

In [44]:
nb = CategoricalNB()
categorical_features = ['shape', 'color', 'taste']

X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

nb.fit(X_train_cat, y_train)
y_pred_nb = nb.predict(X_test_cat)
y_pred_nb_decode = le.inverse_transform(y_pred_nb)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_nb_decode)
print(classification_report(y_test_decode, y_pred_nb_decode))

['pineapple' 'pomegranate' 'pineapple' ... 'guava' 'blueberry' 'banana']
               precision    recall  f1-score   support

        apple       0.00      0.00      0.00        99
       banana       1.00      1.00      1.00       106
    blueberry       1.00      1.00      1.00        96
       cherry       0.47      1.00      0.64        85
      coconut       1.00      1.00      1.00       107
custard apple       0.53      1.00      0.69       104
 dragon fruit       1.00      1.00      1.00       103
        grape       1.00      1.00      1.00       112
        guava       1.00      1.00      1.00        86
         kiwi       0.00      0.00      0.00        93
       lychee       1.00      1.00      1.00        91
        mango       1.00      1.00      1.00       102
       orange       1.00      1.00      1.00        96
       papaya       1.00      1.00      1.00       100
         pear       1.00      1.00      1.00       123
    pineapple       0.53      1.00      0.69  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## k-nn with test size 33.33% or 1/3 or 0.33

In [45]:
x = fruits.drop('fruit_name', axis=1)
y = fruits['fruit_name']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=1/3)

k = 5
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)
y_pred_decode = le.inverse_transform(y_pred)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_decode)
print(classification_report(y_test_decode, y_pred_decode))

['papaya' 'papaya' 'custard apple' ... 'orange' 'guava' 'watermelon']
               precision    recall  f1-score   support

        apple       1.00      1.00      1.00       161
       banana       1.00      1.00      1.00       145
    blueberry       1.00      1.00      1.00       163
       cherry       1.00      1.00      1.00       150
      coconut       0.90      0.85      0.87       174
custard apple       1.00      1.00      1.00       171
 dragon fruit       1.00      1.00      1.00       171
        grape       1.00      1.00      1.00       162
        guava       0.99      0.99      0.99       154
         kiwi       1.00      1.00      1.00       177
       lychee       1.00      1.00      1.00       162
        mango       0.93      0.96      0.94       158
       orange       1.00      1.00      1.00       171
       papaya       1.00      1.00      1.00       183
         pear       0.99      0.99      0.99       173
    pineapple       0.87      0.91      0.89     

## nb with test size 33.33% or 1/3 or 0.33

In [46]:
nb = CategoricalNB()
categorical_features = ['shape', 'color', 'taste']
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

nb.fit(X_train_cat, y_train)
y_pred_nb = nb.predict(X_test_cat)
y_pred_nb_decode = le.inverse_transform(y_pred_nb)
y_test_decode = le.inverse_transform(y_test)
print(y_pred_nb_decode)
print(classification_report(y_test_decode, y_pred_nb_decode))

['papaya' 'papaya' 'custard apple' ... 'orange' 'guava' 'custard apple']
               precision    recall  f1-score   support

        apple       0.00      0.00      0.00       161
       banana       1.00      1.00      1.00       145
    blueberry       1.00      1.00      1.00       163
       cherry       0.48      1.00      0.65       150
      coconut       1.00      1.00      1.00       174
custard apple       0.51      1.00      0.67       171
 dragon fruit       1.00      1.00      1.00       171
        grape       1.00      1.00      1.00       162
        guava       1.00      1.00      1.00       154
         kiwi       0.00      0.00      0.00       177
       lychee       1.00      1.00      1.00       162
        mango       1.00      1.00      1.00       158
       orange       1.00      1.00      1.00       171
       papaya       1.00      1.00      1.00       183
         pear       1.00      1.00      1.00       173
    pineapple       0.51      1.00      0.68  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
