In [1]:
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from matplotlib.patches import Patch
from ipywidgets import IntSlider, interact, Layout
from IPython.display import display
import zipfile

In [3]:
def unzip(zip_path, extract_to):
    """Extract every member of a zip archive into a directory.

    Parameters
    ----------
    zip_path : str or path-like
        Path of the ``.zip`` archive to read.
    extract_to : str or path-like
        Destination directory. It is created (including missing parents)
        when it does not exist yet; an existing directory is reused.

    Returns
    -------
    None
        The function only writes files and prints a confirmation line.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between checking for the directory and creating it.
    os.makedirs(extract_to, exist_ok=True)

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    print(f"Files extracted to {extract_to}")
# Unpack the dataset archive into data/plant_disease so the image-loading cell below can read it.
unzip("data/plant_disease.zip", "data/plant_disease")

Files extracted to data/plant_disease


In [6]:
# Function to open and standardize images used in model

def load_images(base_path, max_per_folder=20, image_size=(100, 100)):
    """Load a capped number of images from every class sub-folder.

    Each sub-directory of ``base_path`` is treated as one class. Files ending in
    ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are opened, converted to
    RGB and resized to ``image_size`` so all images share one shape.

    Parameters
    ----------
    base_path : str
        Directory whose sub-directories each hold the images of one class.
    max_per_folder : int, default 20
        Maximum number of images loaded from each class folder.
    image_size : tuple of int, default (100, 100)
        ``(width, height)`` passed to ``Image.resize``.

    Returns
    -------
    images : numpy.ndarray
        Stacked image arrays, one entry per loaded image.
    labels : numpy.ndarray
        Integer class label per image; ``class_names[labels[k]]`` is the folder
        image ``k`` was loaded from.
    class_names : list of str
        Class folder names in sorted order.

    Notes
    -----
    Files that fail to open or convert are reported with ``print`` and skipped
    (best-effort loading); they do not count towards ``max_per_folder``.
    """
    images = []       # image arrays, in load order
    labels = []       # class index of each loaded image
    class_names = []  # folder name for each class index

    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)
        if not os.path.isdir(folder_path):
            continue

        # Take the label from class_names rather than from the position in the
        # raw directory listing: stray non-directory entries (e.g. .DS_Store)
        # are skipped above and must not shift the label of later classes.
        label = len(class_names)
        class_names.append(folder)
        print(f"Loading from {folder}...")

        count = 0
        # Sort the file names so the same images are picked on every run and
        # every platform (os.listdir order is arbitrary).
        for img_file in sorted(os.listdir(folder_path)):
            if count >= max_per_folder:
                break

            if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue

            try:
                img_path = os.path.join(folder_path, img_file)
                with Image.open(img_path) as img:
                    # RGB gives every image three channels; LANCZOS is the resampling filter.
                    resized = img.convert('RGB').resize(image_size, Image.Resampling.LANCZOS)

                images.append(np.array(resized))
                labels.append(label)
                count += 1
            except Exception as e:  # deliberate best-effort: report and move on
                print(f"Error with {img_file}: {e}")

    return np.array(images), np.array(labels), class_names

# Root folder of the extracted dataset; load_images treats each sub-folder as one class.
data_path = "data/plant_disease/Plant_leave_diseases_dataset_with_augmentation/"
# max_per_folder is left at its default, so at most 20 images are read per class folder.
images, labels, class_names = load_images(data_path)
print(f"Loaded {len(images)} images from {len(class_names)} disease classes")

Loading from Apple___Apple_scab...
Loading from Apple___Black_rot...
Loading from Apple___Cedar_apple_rust...
Loading from Apple___healthy...
Loading from Background_without_leaves...
Loading from Blueberry___healthy...
Loading from Cherry___Powdery_mildew...
Loading from Cherry___healthy...
Loading from Corn___Cercospora_leaf_spot Gray_leaf_spot...
Loading from Corn___Common_rust...
Loading from Corn___Northern_Leaf_Blight...
Loading from Corn___healthy...
Loading from Grape___Black_rot...
Loading from Grape___Esca_(Black_Measles)...
Loading from Grape___Leaf_blight_(Isariopsis_Leaf_Spot)...
Loading from Grape___healthy...
Loading from Orange___Haunglongbing_(Citrus_greening)...
Loading from Peach___Bacterial_spot...
Loading from Peach___healthy...
Loading from Pepper,_bell___Bacterial_spot...
Loading from Pepper,_bell___healthy...
Loading from Potato___Early_blight...
Loading from Potato___Late_blight...
Loading from Potato___healthy...
Loading from Raspberry___healthy...
Loading fro

### Extract features and perform PCA

In [10]:
# Function to extract features and perform PCA

def extract_features_and_reduce(images, n_components=2, random_state=42):
    """Turn images into normalised colour histograms and project them with PCA.

    Parameters
    ----------
    images : iterable of numpy.ndarray
        Image arrays accepted by ``Image.fromarray``.
    n_components : int, default 2
        Number of principal components to keep.
    random_state : int or None, default 42
        Seed forwarded to ``PCA``. scikit-learn may choose a randomized SVD
        solver depending on the data shape; seeding keeps the projection (and
        therefore the clustering built on it) identical across runs.

    Returns
    -------
    numpy.ndarray
        The result of ``PCA.fit_transform`` on the histogram features, one row
        per image with ``n_components`` columns.
    """
    features = []
    for img_array in images:
        # The images are stored as arrays; wrap each one in a PIL Image again
        # because .histogram() is an Image method.
        img = Image.fromarray(img_array)
        histogram = np.array(img.histogram()).astype('float')
        # Divide by the total count so every image's feature vector sums to 1.
        histogram = histogram / histogram.sum()
        features.append(histogram)
    features = np.array(features)  # PCA expects a 2-D array, not a list
    print(f"Feature shape: {features.shape}")

    # Seeded so repeated runs give the same projection.
    pca = PCA(n_components=n_components, random_state=random_state)
    reduced_features = pca.fit_transform(features)
    print(f"Reduced feature shape: {reduced_features.shape}")
    return reduced_features

# PCA-reduced colour-histogram features, one row per loaded image; used for clustering below.
features = extract_features_and_reduce(images)

Feature shape: (780, 768)
Reduced feature shape: (780, 2)


780 = number of images.
768 = features per image: each image is converted to RGB, and each of the 3 color channels contributes a 256-bin histogram (256 × 3 = 768).
PCA then reduces these 768 histogram features to 2 principal components per image.


### Create an interactive widget and visualize clustering

**Function to run a KMeans model and create a visualization**

In [13]:
# Function to create widget for clustering with viz

def create_widget(images, features, labels, class_names):
    """Show an interactive K-Means explorer for the reduced image features.

    A slider selects the number of clusters (2 to 15, initially 3). On every
    change K-Means is re-fit on ``features``, the first two feature columns are
    plotted coloured by cluster, the cluster sizes are printed, and up to five
    randomly chosen images from each cluster are displayed.

    Parameters
    ----------
    images : sequence of array-like
        Images in the same order as the rows of ``features``; shown with ``plt.imshow``.
    features : numpy.ndarray
        2-D feature matrix, one row per image. All columns are clustered;
        columns 0 and 1 are used as plot coordinates.
    labels, class_names
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        ``interact`` displays the widget as a side effect, so there is no
        widget object to return or to pass to ``display``.
    """
    slider = IntSlider(value=3, min=2, max=15, description="Clusters:", layout=Layout(width='80%'))

    def update_clusters(cluster_num):
        """Re-cluster with ``cluster_num`` clusters and redraw all outputs."""
        plt.figure(figsize=(15, 7))

        # n_init=10: K-Means is started from 10 different centroid seeds and the best run is kept.
        kmeans = KMeans(n_clusters=cluster_num, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(features)

        # Pin the colour scale to [0, cluster_num - 1] so cluster i is always drawn
        # with the i-th legend colour below, instead of relying on matplotlib
        # normalising to whichever labels happen to be present.
        plt.scatter(
            features[:, 0],  # first principal component
            features[:, 1],  # second principal component
            c=cluster_labels,
            cmap='viridis',
            vmin=0,
            vmax=cluster_num - 1,
        )
        plt.title('PCA Reduced Features by Cluster')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.grid(True)

        # Discrete legend (one patch per cluster) instead of a colour gradient.
        colors = plt.cm.viridis(np.linspace(0, 1, cluster_num))
        # Cluster ids are 0-based; they are shown 1-based to the reader.
        legends = [Patch(facecolor=clr, label=f'Cluster {i + 1}') for i, clr in enumerate(colors)]
        plt.legend(handles=legends, title="Clusters")

        plt.tight_layout()
        plt.show()

        # Print the size of each cluster
        print(f"Results for {cluster_num} clusters:")
        for i in range(cluster_num):
            print(f"Cluster {i+1}: {np.sum(cluster_labels == i)} samples")

        # Display a few example images from each cluster
        for i in range(cluster_num):
            plt.figure(figsize=(15, 5))

            # np.where returns a tuple; element 0 holds the indices of this cluster's images.
            cluster_indices = np.where(cluster_labels == i)[0]

            # Sample without replacement; a cluster may hold fewer than 5 images.
            selected_indices = np.random.choice(cluster_indices, min(len(cluster_indices), 5), replace=False)

            for j, idx in enumerate(selected_indices):
                plt.subplot(1, 5, j + 1)  # 1 row x 5 columns; j + 1 is the 1-based grid slot
                plt.imshow(images[idx])
                plt.title(f"Cluster {i+1}")
                plt.axis('off')

            plt.show()

    # interact wires the slider to update_clusters and displays the widget immediately.
    interact(update_clusters, cluster_num=slider)

# create_widget() shows the slider and plots itself (through `interact`) and returns None,
# so its result is neither stored nor passed to display() -- doing so only printed "None".
create_widget(images, features, labels, class_names)

interactive(children=(IntSlider(value=3, description='Clusters:', layout=Layout(width='80%'), max=15, min=2), …

None