# Data Visualization for Mildew Detection in Cherry Leaves

## Objectives
* Understand the visual differences between healthy and powdery mildew-infected cherry leaves.
* Analyze the distribution of image sizes and ensure the dataset is balanced.
* Prepare visual content that can be used for the dashboard or further analysis.

## Inputs
* Preprocessed dataset with images categorized into healthy and powdery mildew classes.

## Outputs
* Visualizations including sample images, average images, and variability images for each class.
* Any intermediate files or plots saved for future reference or use in presentations.

## Additional Comments
* Visualization is key to understanding the data and guiding the model development process.

---

# Set Data Directory

## Import libraries

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns
import numpy as np
import os
from PIL import Image

## Set working directory

In [None]:
# Capture the directory the notebook starts in; the following cell resolves
# the project root relative to this value.
current_dir = os.getcwd()
print("Original working directory:", current_dir)

In [None]:
# Move up one level from the directory captured in `current_dir`, which is
# expected to be the project root.
relative_path_to_root = '..'
project_root = os.path.abspath(os.path.join(current_dir, relative_path_to_root))
os.chdir(project_root)

# Confirm where we ended up
print("New current working directory:", os.getcwd())

## Set input directories

In [None]:
# Root folder of the dataset; each split is a sub-folder of it
base_path = "inputs/cherry_leaves_dataset/cherry-leaves"
train_path, val_path, test_path = (
    os.path.join(base_path, split_name)
    for split_name in ('train', 'validation', 'test')
)

# Class names used when sampling images per category
categories = ['healthy', 'powdery_mildew']

## Set output directory

In [None]:
# Outputs are written to <output_base_path>/<version>
output_base_path = "outputs/data_visualization"
version = "v1"
output_dir = os.path.join(output_base_path, version)

# Report whether the folder was already present, creating it when it was not
if os.path.exists(output_dir):
    print(f"Output directory already exists: {output_dir}")
else:
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")


## Set Label Names

In [None]:
# Derive the class labels from the sub-directories of train_path.
# os.listdir returns entries in arbitrary order and also reports plain files
# (for example ".DS_Store"), which would later fail when treated as a class
# folder. Keep only directories and sort them so the label list is stable.
labels = sorted(
    entry
    for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
)
print('Labels for the images:', labels)

---

# Data Visualization of Image Data

## Image shape 

Analyze the image size distribution

In [None]:
# Collect the height (dim1) and width (dim2) of every training image
dim1, dim2 = [], []

for label in labels:
    label_path = os.path.join(train_path, label)
    for image_filename in os.listdir(label_path):
        img_path = os.path.join(label_path, image_filename)
        # Image.open is lazy: it reads the file header without decoding the
        # pixel data, which is all that is needed to learn the dimensions.
        with Image.open(img_path) as img:
            width, height = img.size  # PIL reports (width, height)
        dim1.append(height)
        dim2.append(width)

# Fail with a clear message instead of an opaque "cannot convert float NaN
# to integer" error when there is nothing to average.
if not dim1:
    raise ValueError(f"No images found under {train_path}")

# Visualize the distribution of image dimensions
sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
sns.scatterplot(x=dim2, y=dim1, alpha=0.5)
plt.xlabel("Width (pixels)")
plt.ylabel("Height (pixels)")
plt.title("Distribution of Image Dimensions")

# Mark the average dimensions (truncated to whole pixels) on the plot
dim1_mean, dim2_mean = int(np.mean(dim1)), int(np.mean(dim2))
plt.axvline(x=dim2_mean, color='r', linestyle='--', label=f'Average Width: {dim2_mean}')
plt.axhline(y=dim1_mean, color='b', linestyle='--', label=f'Average Height: {dim1_mean}')
plt.legend()

plt.show()

print(f"Average Width: {dim2_mean} pixels \nAverage Height: {dim1_mean} pixels")

Define the average image shape as (height, width, channels)

In [None]:

# (height, width, channels) built from the truncated average dimensions;
# the 3 is presumably the number of colour channels (RGB) — confirm against the dataset.
image_shape = (dim1_mean, dim2_mean, 3)

Save the image shape for future use

In [None]:
import joblib

# Write the shape tuple as a pickle inside the versioned output folder
shape_file = os.path.join(output_dir, "average_image_shape.pkl")
joblib.dump(image_shape, shape_file)

## Visualize the different sample images

In [None]:
def display_samples_from_each_category(train_path, categories, samples_per_category=5):
    """
    Display a grid of sample images, one row per category.

    Parameters:
    - train_path (str): The path to the training dataset; each category is
      expected to be a sub-folder of it.
    - categories (list of str): A list of category names.
    - samples_per_category (int): The number of sample images to display per category.

    Returns:
    - None. The figure is rendered with plt.show().
    """
    # squeeze=False keeps `axes` two-dimensional for every grid size, so
    # axes[i, j] is valid even with a single category or a single column.
    fig, axes = plt.subplots(
        nrows=len(categories),
        ncols=samples_per_category,
        figsize=(samples_per_category * 2, len(categories) * 2),
        squeeze=False,
    )

    # Hide every axis up front so grid cells that stay empty (a category with
    # fewer images than requested) do not show bare frames and ticks.
    for ax in axes.flat:
        ax.axis('off')

    for i, category in enumerate(categories):
        category_path = os.path.join(train_path, category)
        # os.listdir order is arbitrary; sorting makes the selection reproducible
        sample_images = sorted(os.listdir(category_path))[:samples_per_category]

        for j, image_name in enumerate(sample_images):
            ax = axes[i, j]
            # Close the file handle as soon as the image has been drawn
            with Image.open(os.path.join(category_path, image_name)) as img:
                ax.imshow(img)
            ax.set_title(f"{category}\n{image_name}")

    plt.tight_layout()
    plt.show()

# Show samples from the training split for each category, using the
# function's default of five images per category
display_samples_from_each_category(train_path, categories)

---