# **Importing Libraries**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
from sklearn.metrics import classification_report
import imutils
import cv2
import os,zipfile
from google.colab import files

# **Data Preparation**

[SoyaBean Dataset from Kaggle](https://www.kaggle.com/datasets/kaustubhachaturvedi/soyabean-disease-dataset)

In [None]:
#upload kaggle api key
! pip install kaggle
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json


In [None]:
# Helper function to unzip downloaded archive file(s)
def unzip(zipfileNameList):
  """Extract ``<name>.zip`` into ``./<name>/`` for every name in the list.

  Archives that cannot be opened (missing file, unreadable file, or not a
  valid zip) are reported and skipped, so the remaining archives are still
  extracted. Errors raised while extracting an opened archive propagate.

  Args:
    zipfileNameList: iterable of archive base names, without the ``.zip``
      extension. Each item is converted with ``str()``.
  """
  for file in zipfileNameList:
    name = str(file)
    fileZip = name + '.zip'
    try:
      zipRef = zipfile.ZipFile(fileZip, 'r')
    except (OSError, zipfile.BadZipFile) as err:
      print(f"Something went wrong when read to the unzip file: {fileZip} ({err})")
      # Nothing was opened for this name; move on instead of touching zipRef.
      continue
    # The context manager closes the archive even if extraction fails.
    with zipRef:
      zipRef.extractall('./' + name + '/')

def downloadAndUnzipDataset(*datasetLink):
  zipfileName = [] 
  for link in datasetLink:
    slashSplit = str(link).split('/')
    if slashSplit[-3] != 'datasets':
      raise Exception('Link(s) format are not as expected')
    if slashSplit[-3] == 'datasets':
      datasetName = slashSplit[-2] + '/' + slashSplit[-1]
      ! kaggle datasets download $datasetName
    zipfileName.append(slashSplit[-1])
  unzip(zipfileName)
  
downloadAndUnzipDataset('https://www.kaggle.com/datasets/kaustubhachaturvedi/soyabean-disease-dataset')

Downloading soyabean-disease-dataset.zip to /content
 98% 249M/255M [00:04<00:00, 91.6MB/s]
100% 255M/255M [00:04<00:00, 65.0MB/s]


# **Model V1**

## **Image Feature Extraction and Resizing**

In [None]:
def image_to_feature_vector(image, size=(150, 150)):
    """Resize ``image`` to ``size`` and return its raw pixels as a flat vector.

    Args:
        image: image array as returned by ``cv2.imread``.
        size: target size handed to ``cv2.resize``.

    Returns:
        The resized image flattened to one dimension.
    """
    resized = cv2.resize(image, size, interpolation=cv2.INTER_AREA)
    return resized.flatten()
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Describe the colour distribution of ``image`` as a flat HSV histogram.

    Args:
        image: BGR image array (it is converted with ``cv2.COLOR_BGR2HSV``).
        bins: number of histogram bins for the H, S and V channels.

    Returns:
        The normalised 3-D histogram flattened to one dimension.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # Value ranges for the three channels: H in [0, 180), S and V in [0, 256).
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], [0, 1, 2], None, bins, channel_ranges)
    # The normalize call differs between OpenCV 2 and later versions:
    # later versions write into the destination array passed as second argument.
    if not imutils.is_cv2():
        cv2.normalize(histogram, histogram)
    else:
        histogram = cv2.normalize(histogram)
    return histogram.flatten()

In [None]:
# Classes of diseases in soyabean plants from soyabean-disease-dataset:
# folder name on disk -> human-readable label.
classList = {'angular_leaf_spot':'Angular Leaf Spot',
             'bacterial_blight':'Bacterial Blight',
             'bean_rust':'Bean Rust',
             'downy_mildew':'Downy Mildew',
             'healthy':'Healthy'}

rawImages = []  # flattened resized pixels, one entry per image
features = []   # flattened colour histograms, one entry per image
labels = []     # class label, one entry per image

# The dataset folders train/validation/test are pooled into one collection;
# the notebook creates its own train/test split afterwards.
for split in ('train', 'validation', 'test'):
  for classDir, className in classList.items():
    path = f'./soyabean-disease-dataset/{split}/{split}/{classDir}/'
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the later seeded split depend on the file system.
    for fileName in sorted(os.listdir(path)):
      imagePath = os.path.join(path, fileName)
      image = cv2.imread(imagePath)
      if image is None:
        # cv2.imread returns None for files it cannot decode; skip them
        # rather than failing inside cv2.resize.
        print(f"Skipping unreadable image: {imagePath}")
        continue
      rawImages.append(image_to_feature_vector(image))
      features.append(extract_color_histogram(image))
      labels.append(className)
print(f"Total Images : {len(rawImages)}")
print(f"Total Features : {len(features)}")

Total Images : 1654
Total Features : 1654


In [None]:
# Convert the collected Python lists to NumPy arrays for scikit-learn.
rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)

# Report memory use with a consistent binary divisor (1024 * 1024 bytes);
# the previous divisor 1024 * 1000 mixed binary and decimal units.
BYTES_PER_MB = 1024 * 1024
print("pixels matrix: {:.2f}MB".format(rawImages.nbytes / BYTES_PER_MB))
print("features matrix: {:.2f}MB".format(features.nbytes / BYTES_PER_MB))

pixels matrix: 109.03MB
features matrix: 3.31MB


## **Train/Test Data Split for KNN**

In [None]:
# Hold out 25% of the samples for testing. Both representations are split with
# the same options and the same seed.
split_options = dict(test_size=0.25, random_state=42)

# Raw-pixel vectors and their labels.
trainR, testR, trainRL, testRL = train_test_split(
    rawImages, labels, **split_options)

# Colour-histogram features and their labels.
trainH, testH, trainHL, testHL = train_test_split(
    features, labels, **split_options)

## **Fitting and Training Model with Raw Images**

In [None]:
print("evaluating raw pixel accuracy...")
# 1-nearest-neighbour classifier on the raw pixel vectors.
model = KNeighborsClassifier(n_neighbors=1,
	n_jobs=-1)
model.fit(trainR, trainRL)
# Predict once and reuse the result for both the accuracy and the report.
# Calling model.score() and model.predict() separately would repeat the
# neighbour search over the whole test set.
predictedRL = model.predict(testR)
acc = float(np.mean(predictedRL == testRL))
print(classification_report(testRL, predictedRL))
print("raw pixel accuracy: {:.2f}%".format(acc * 100))

evaluating raw pixel accuracy...
                   precision    recall  f1-score   support

Angular Leaf Spot       0.48      0.73      0.58       102
 Bacterial Blight       0.94      0.95      0.95        64
        Bean Rust       0.59      0.37      0.46       103
     Downy Mildew       0.34      0.93      0.50        28
          Healthy       0.91      0.42      0.57       117

         accuracy                           0.60       414
        macro avg       0.65      0.68      0.61       414
     weighted avg       0.69      0.60      0.60       414

raw pixel accuracy: 59.90%


## **Fitting and Training Model with Image Color Histogram Features**

In [None]:
print("evaluating histogram accuracy...")
# 1-nearest-neighbour classifier on the colour-histogram features.
model = KNeighborsClassifier(n_neighbors=1,
	n_jobs=-1)
model.fit(trainH, trainHL)
# Predict once and reuse the result for both the accuracy and the report.
# Calling model.score() and model.predict() separately would repeat the
# neighbour search over the whole test set.
predictedHL = model.predict(testH)
acc = float(np.mean(predictedHL == testHL))
print(classification_report(testHL, predictedHL))
print("histogram accuracy: {:.2f}%".format(acc * 100))

evaluating histogram accuracy...
                   precision    recall  f1-score   support

Angular Leaf Spot       0.72      0.60      0.65       102
 Bacterial Blight       0.97      0.95      0.96        64
        Bean Rust       0.62      0.74      0.68       103
     Downy Mildew       0.79      0.82      0.81        28
          Healthy       0.77      0.76      0.77       117

         accuracy                           0.75       414
        macro avg       0.78      0.77      0.77       414
     weighted avg       0.75      0.75      0.75       414

histogram accuracy: 74.88%
