# Discussion 8

### Image Classification with KMeans

Thursday, February 27th, 2025

### Introduction
In this week’s discussion section, we will use a dataset containing images of different plant diseases, and classify these images into different clusters. We will create a widget to see how our model classified a few of the images, as well as see how our classification changes when we change the value of K.

### Data
The dataset this week is a zipped file containing many different folders of plant images. Each folder represents a different plant disease, and every image in that folder shows the corresponding disease. The dataset can be found (link to be added); put the unzipped folder in the `data` folder.



In [24]:
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from matplotlib.patches import Patch
from ipywidgets import IntSlider, interact, Layout
from IPython.display import display
import zipfile

In [25]:
# Function to unzip the plant data 
#def unzip(zip_path, extract_to):
    # Ensure the extraction directory exists
#    if not os.path.exists(extract_to):
#        os.makedirs(extract_to)

    # Open the zip file
#    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#        # Extract all the contents into the directory
#        zip_ref.extractall(extract_to)
#        print(f"Files extracted to {extract_to}")

#unzip("/Users/jcsibley/MEDS/eds-232/EDS232-discussion/data/plant_disease.zip", "/Users/jcsibley/MEDS/eds-232/EDS232-discussion/data/plant_disease")

Files extracted to /Users/jcsibley/MEDS/eds-232/EDS232-discussion/data/plant_disease


In [29]:
# Function to open and standardize images used in model

def load_images(base_path, max_per_folder=20, image_size=(100, 100)):
    """Load a capped sample of images from each class sub-folder of ``base_path``.

    Every sub-directory of ``base_path`` is treated as one class. Up to
    ``max_per_folder`` image files (``.png``, ``.jpg``, ``.jpeg``) are read
    from each one, converted to RGB and resized to ``image_size``.

    Args:
        base_path: Directory whose sub-directories each hold one class.
        max_per_folder: Maximum number of images to load per sub-directory.
        image_size: ``(width, height)`` every image is resized to.

    Returns:
        A tuple ``(images, labels, class_names)`` where ``images`` is an
        array of the loaded pixel data, ``labels`` is an integer array with
        one entry per image, and ``class_names`` is the list of sub-directory
        names in sorted order. ``labels[k]`` is the index into
        ``class_names`` of the folder image ``k`` came from.
    """
    images = []  # Pixel arrays of every successfully loaded image
    labels = []  # Index into class_names for each loaded image
    class_names = []  # Sub-directory names, in sorted order

    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue

        # Use the position in class_names as the label, not the position in
        # the directory listing: stray non-directory entries would otherwise
        # shift the labels so they no longer index class_names.
        label = len(class_names)
        class_names.append(folder)
        print(f"Loading from {folder}...")

        count = 0
        # os.listdir order is arbitrary; sort so the same files are picked
        # on every run.
        for img_file in sorted(os.listdir(folder_path)):
            if count >= max_per_folder:  # Per-folder cap reached
                break

            # Skip anything that is not a supported image format
            if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            img_path = os.path.join(folder_path, img_file)
            try:
                with Image.open(img_path) as img:
                    rgb = img.convert('RGB')  # Standardize color channels
                    resized = rgb.resize(image_size, Image.Resampling.LANCZOS)
                pixels = np.array(resized)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error with {img_file}: {e}")
                continue

            images.append(pixels)
            labels.append(label)
            count += 1

    # dtype=int keeps the label array integer-typed even when nothing loaded
    return np.array(images), np.array(labels, dtype=int), class_names

# Example path: data/plant_disease/Plant_leave_diseases_dataset_with_augmentation
# Strip stray whitespace from the typed/pasted path and expand a leading "~"
# so an otherwise valid path is not rejected by the folder lookup.
data_path = os.path.expanduser(
    input("Enter path to the folder containing disease folders: ").strip()
)
images, labels, class_names = load_images(data_path)
print(f"Loaded {len(images)} images from {len(class_names)} disease classes")

Enter path to the folder containing disease folders:  data/plant_disease/Plant_leave_diseases_dataset_with_augmentation


Loading from Apple___Apple_scab...
Loading from Apple___Black_rot...
Loading from Apple___Cedar_apple_rust...
Loading from Apple___healthy...
Loading from Background_without_leaves...
Loading from Blueberry___healthy...
Loading from Cherry___Powdery_mildew...
Loading from Cherry___healthy...
Loading from Corn___Cercospora_leaf_spot Gray_leaf_spot...
Loading from Corn___Common_rust...
Loading from Corn___Northern_Leaf_Blight...
Loading from Corn___healthy...
Loading from Grape___Black_rot...
Loading from Grape___Esca_(Black_Measles)...
Loading from Grape___Leaf_blight_(Isariopsis_Leaf_Spot)...
Loading from Grape___healthy...
Loading from Orange___Haunglongbing_(Citrus_greening)...
Loading from Peach___Bacterial_spot...
Loading from Peach___healthy...
Loading from Pepper,_bell___Bacterial_spot...
Loading from Pepper,_bell___healthy...
Loading from Potato___Early_blight...
Loading from Potato___Late_blight...
Loading from Potato___healthy...
Loading from Raspberry___healthy...
Loading fro

### More preprocessing ... 

Extract features from data and perform PCA 

In [30]:
# Function to extract features and perform PCA 

def extract_features_and_reduce(images, n_components=2, random_state=42):
    """Turn images into normalized color histograms and reduce them with PCA.

    Args:
        images: Sequence of image pixel arrays accepted by
            ``Image.fromarray``.
        n_components: Number of PCA components to keep.
        random_state: Seed passed to PCA so repeated runs give the same
            projection (matches the seed used for KMeans in this notebook).

    Returns:
        The PCA-transformed feature array, one row per image.

    Raises:
        ValueError: If ``images`` is empty.
    """
    if len(images) == 0:
        # Fail early with a clear message instead of an opaque PCA error
        raise ValueError("No images to extract features from")

    features = []
    for img_array in images:
        img = Image.fromarray(img_array)  # Convert back into a PIL image
        # Per-channel pixel counts, as returned by PIL's histogram()
        histogram = np.array(img.histogram()).astype('float32')
        histogram = histogram / histogram.sum()  # Normalize to sum to 1
        features.append(histogram)
    features = np.array(features)  # 2-D array: one histogram per row

    # Perform PCA
    pca = PCA(n_components=n_components, random_state=random_state)
    reduced_features = pca.fit_transform(features)
    print(f"Reduced feature shape: {reduced_features.shape}")
    return reduced_features
# Build the PCA-reduced feature matrix for the loaded images; it is the input
# to the clustering widget created later in the notebook.
features = extract_features_and_reduce(images)

Reduced feature shape: (780, 2)


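780 = number of images (one row per image).
768 = number of features per image before PCA: the histogram values for the red, green, and blue color channels. PCA reduces these to 2 components, which is why the reduced shape is (780, 2).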

In [None]:
# Function to create interactive widget for clustering with visualization 

def create_interactive_widgets(images, features, labels, class_names):
    """Render a slider-driven KMeans view of the PCA features.

    Moving the slider refits KMeans with the chosen number of clusters,
    redraws the scatter plot of the two PCA components, prints the size of
    each cluster, and shows up to five randomly chosen images per cluster.

    Args:
        images: Image arrays, indexed in the same order as ``features``.
        features: 2-D array whose first two columns are plotted and which is
            clustered by KMeans.
        labels: Not used by this function; kept for the existing call
            signature.
        class_names: Not used by this function; kept for the existing call
            signature.

    Returns:
        None. ``interact`` renders the slider and output itself.
    """
    slider = IntSlider(
        value=3,
        min=2,
        max=15,
        description="Clusters:",
        layout=Layout(width="80%"),
    )

    # Called by interact every time the slider value changes
    def update_clusters(cluster_num):

        plt.figure(figsize=(15, 7))

        kmeans = KMeans(n_clusters=cluster_num, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(features)

        # Visualize clusters. vmin/vmax pin the color scale to the full label
        # range so the legend colors below always match the points.
        plt.scatter(
            features[:, 0],
            features[:, 1],
            c=cluster_labels,
            cmap='viridis',
            vmin=0,
            vmax=cluster_num - 1,
        )
        plt.title('PCA Reduced Features by Cluster')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.grid(True)

        # Legend: sample one evenly spaced viridis color per cluster
        # (a colormap must be called with values in [0, 1]).
        colors = plt.cm.viridis(np.linspace(0, 1, cluster_num))
        legends = [
            Patch(facecolor=clr, label=f'Cluster {i + 1}')
            for i, clr in enumerate(colors)
        ]
        plt.legend(handles=legends)

        # Show plot
        plt.tight_layout()
        plt.show()

        # Print results for each cluster
        print(f"Results for {cluster_num} clusters:")
        for i in range(cluster_num):
            print(f"Cluster{i+1}: {np.sum(cluster_labels == i)} samples")

        # Display a few images from each cluster
        for i in range(cluster_num):
            plt.figure(figsize=(15, 5))

            # Indices of all images assigned to the current cluster. Kept in
            # its own variable so cluster_labels stays intact for later
            # iterations.
            cluster_indices = np.where(cluster_labels == i)[0]

            # Randomly select up to 5 of them, without repeats
            selected_indices = np.random.choice(
                cluster_indices, min(len(cluster_indices), 5), replace=False
            )

            for j, idx in enumerate(selected_indices):
                # subplot (singular) selects slot j+1 in a 1x5 grid
                plt.subplot(1, 5, j + 1)
                plt.imshow(images[idx])
                plt.title(f"Cluster{i+1}")
                plt.axis('off')
            plt.show()

    interact(update_clusters, cluster_num=slider)
    
# create_interactive_widgets renders the slider and plots itself through
# interact() and returns None, so only display a result if one is returned;
# calling display(None) would print a stray "None" under the widget.
knn_widget = create_interactive_widgets(images, features, labels, class_names)
if knn_widget is not None:
    display(knn_widget)
            