In [1]:
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis

In [2]:
def compute_statistics(hist):
    """Summarize an intensity histogram with five descriptive statistics.

    Parameters
    ----------
    hist : array-like
        Bin counts. Bin ``i`` is treated as covering ``[i, i + 1)``, so its
        centre is ``i + 0.5`` (for 256 bins this matches the 0..256 edges).

    Returns
    -------
    list
        ``[mean, median, std_dev, skewness, kurt]``. ``mean``, ``median`` and
        ``std_dev`` describe the intensity distribution encoded by the
        histogram (weighted by the normalized counts). If the histogram has
        no mass (all zeros or empty), five NaNs are returned.
    """
    hist = np.asarray(hist, dtype=np.float64).ravel()
    total = hist.sum()
    if not total > 0:
        # Nothing to normalize by; avoid a division by zero.
        return [float("nan")] * 5

    hist_norm = hist / total  # Normalize histogram to get probability distribution
    bin_centers = np.arange(hist.size) + 0.5

    mean = np.sum(bin_centers * hist_norm)

    # Weighted median: the first bin at which the cumulative probability
    # reaches 0.5. (Taking a percentile of bin_centers alone would ignore the
    # counts and always give the same value.)
    cdf = np.cumsum(hist_norm)
    median_idx = min(int(np.searchsorted(cdf, 0.5)), hist.size - 1)
    median = bin_centers[median_idx]

    std_dev = np.sqrt(np.sum((bin_centers - mean) ** 2 * hist_norm))

    # NOTE(review): these two are computed over the bin *frequencies*
    # themselves (the values in hist_norm), not over the intensity
    # distribution like mean/median/std_dev. Kept as-is so previously saved
    # features stay comparable; confirm this is the intended definition.
    skewness = skew(hist_norm, bias=False)
    kurt = kurtosis(hist_norm, bias=False)

    return [mean, median, std_dev, skewness, kurt]

In [3]:
def make_histogram(class_, files, dest_dir, dest_stats_dir):
    """Compute and save per-image histograms and their statistics for one folder.

    For every readable image listed in ``files`` two ``.npy`` files are
    written, both named after the image (extension stripped):

    * ``<dest_dir>/<class>/<name>.npy`` - a 4 x 256 array of histograms with
      rows ordered ``[luminance, red, blue, green]``.
    * ``<dest_stats_dir>/<class>/<name>.npy`` - a 4 x 5 array holding the
      output of ``compute_statistics`` for each of those rows, same order.

    Parameters
    ----------
    class_ : str
        Path of the directory containing the images; its last path component
        is used as the class sub-directory name in both destinations.
    files : iterable of str
        File names (not full paths) inside ``class_``.
    dest_dir : str
        Root directory for the saved histograms.
    dest_stats_dir : str
        Root directory for the saved statistics.

    Files that OpenCV cannot decode are skipped with a warning.
    """
    import warnings  # local import so this cell does not depend on another edit

    # basename/normpath instead of split("/") so trailing separators and
    # non-POSIX separators still yield the directory's own name.
    dest_class = os.path.basename(os.path.normpath(class_))
    class_dest_dir = os.path.join(dest_dir, dest_class)
    class_stats_dest_dir = os.path.join(dest_stats_dir, dest_class)
    os.makedirs(class_dest_dir, exist_ok=True)
    os.makedirs(class_stats_dest_dir, exist_ok=True)

    for file in files:
        filepath = os.path.join(class_, file)
        img = cv2.imread(filepath)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning None.
            warnings.warn(f"Skipping unreadable or non-image file: {filepath}")
            continue

        # Convert BGR to RGB (OpenCV uses BGR by default)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        luminance_hist = cv2.calcHist([img_gray], [0], None, [256], [0, 256]).flatten()

        # After the RGB conversion the channel order is 0 = red, 1 = green,
        # 2 = blue, so each histogram is taken from the matching channel.
        hist_r = cv2.calcHist([img_rgb], [0], None, [256], [0, 256]).flatten()
        hist_g = cv2.calcHist([img_rgb], [1], None, [256], [0, 256]).flatten()
        hist_b = cv2.calcHist([img_rgb], [2], None, [256], [0, 256]).flatten()

        # Strip whatever extension the file has (.JPG, .jpg, .png, ...);
        # np.save appends ".npy" itself.
        file_base = os.path.splitext(file)[0]
        destination_path = os.path.join(class_dest_dir, file_base)
        np.save(destination_path, np.array([luminance_hist, hist_r, hist_b, hist_g]))

        luminance_stats = compute_statistics(luminance_hist)
        red_stats = compute_statistics(hist_r)
        blue_stats = compute_statistics(hist_b)
        green_stats = compute_statistics(hist_g)

        # Save the statistics
        stats_destination_path = os.path.join(class_stats_dest_dir, file_base)
        np.save(stats_destination_path, np.array([luminance_stats, red_stats, blue_stats, green_stats]))

In [4]:
def process_directory_structure(src_dir, dest_dir, stats_dest_dir):
    """Walk ``src_dir`` and run ``make_histogram`` on every directory that has files.

    Both output roots are created if missing. The directory walk is
    materialized into a list first so the progress bar knows its total.

    Parameters
    ----------
    src_dir : str
        Root of the directory tree to scan.
    dest_dir : str
        Root directory passed to ``make_histogram`` for histogram output.
    stats_dest_dir : str
        Root directory passed to ``make_histogram`` for statistics output.
    """
    for output_root in (dest_dir, stats_dest_dir):
        os.makedirs(output_root, exist_ok=True)

    walked = list(os.walk(src_dir))

    for current_dir, _subdirs, filenames in tqdm(walked, desc="Processing directories"):
        if not filenames:
            # Directories without files have nothing to process.
            continue
        make_histogram(current_dir, filenames, dest_dir, stats_dest_dir)

In [5]:
# Run the extraction: arguments are, in order, the source image tree, the
# histogram output root and the statistics output root.
# NOTE: these are absolute paths under /home/heitor; adjust them before
# running on another machine.
process_directory_structure("/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Plant_leave_diseases_dataset_without_augmentation",
                            "/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms",
                            "/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms_Stats")

Processing directories:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# The third argument must be the statistics output directory (stats_dest_dir).
# Passing the make_histogram function there made os.makedirs fail with a
# TypeError, since the function is not a path.
process_directory_structure("/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Plant_leave_diseases_dataset_without_augmentation",
                            "/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms",
                            "/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms_Stats")