In [15]:
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
import pickle

In [16]:
def compute_statistics(hist):
    """Compute summary statistics of the distribution described by a histogram.

    The histogram is treated as counts over unit-width bins starting at 0
    (bin ``i`` covers ``[i, i + 1)``), and every statistic is weighted by
    those counts. Each bin is represented by its centre ``i + 0.5``.

    Parameters
    ----------
    hist : array-like
        One count per bin (e.g. the flattened 256-bin output of
        ``cv2.calcHist``).

    Returns
    -------
    dict
        Keys ``'mean'``, ``'median'``, ``'std_dev'``, ``'skewness'`` and
        ``'kurtosis'`` (excess kurtosis, i.e. 0 for a normal distribution).
        All values are NaN when the histogram holds no counts; skewness and
        kurtosis are NaN when the distribution has zero variance.
    """
    # Work in float64 so normalisation and the cumulative sum stay accurate
    # even when the input is float32.
    counts = np.asarray(hist, dtype=np.float64).ravel()
    bin_edges = np.arange(counts.size + 1)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    nan = float('nan')
    total = counts.sum()
    if not total > 0:
        # Nothing to normalise by: avoid a division by zero.
        return {
            'mean': nan,
            'median': nan,
            'std_dev': nan,
            'skewness': nan,
            'kurtosis': nan
        }

    hist_norm = counts / total  # Probability mass per bin
    mean = np.sum(bin_centers * hist_norm)
    deviations = bin_centers - mean
    variance = np.sum(deviations ** 2 * hist_norm)
    std_dev = np.sqrt(variance)

    # Weighted median: first bin at which the cumulative count reaches half
    # of the total. (The median of the bin centres alone would ignore the data.)
    cumulative = np.cumsum(counts)
    median_idx = int(np.searchsorted(cumulative, total / 2.0, side='left'))
    median_idx = min(median_idx, counts.size - 1)  # guard against rounding at the top end
    median = bin_centers[median_idx]

    if variance > 0:
        # Standardised third and fourth central moments of the distribution,
        # not of the array of bin heights.
        skewness = np.sum(deviations ** 3 * hist_norm) / variance ** 1.5
        kurt = np.sum(deviations ** 4 * hist_norm) / variance ** 2 - 3.0
    else:
        # All mass in a single bin: shape statistics are undefined.
        skewness = nan
        kurt = nan

    return {
        'mean': float(mean),
        'median': float(median),
        'std_dev': float(std_dev),
        'skewness': float(skewness),
        'kurtosis': float(kurt)
    }

In [17]:
def make_histogram(class_, files, dest_dir, dest_stats_dir):
    """Compute and pickle per-image histograms and their statistics.

    For every file name in ``files`` (located inside directory ``class_``),
    the image is read with OpenCV, 256-bin histograms are computed for the
    grayscale image and for each colour channel, and two pickle files are
    written: the histograms under ``dest_dir/<class>/`` and the output of
    ``compute_statistics`` for each histogram under ``dest_stats_dir/<class>/``,
    where ``<class>`` is the last path component of ``class_``.

    Files that OpenCV cannot decode are reported and skipped.

    Parameters
    ----------
    class_ : str
        Path of the directory containing the images.
    files : iterable of str
        File names inside ``class_``.
    dest_dir : str
        Root directory for the histogram pickles.
    dest_stats_dir : str
        Root directory for the statistics pickles.
    """
    # Last path component; basename also copes with non-"/" separators.
    dest_class = os.path.basename(class_)
    class_dest_dir = os.path.join(dest_dir, dest_class)
    class_stats_dest_dir = os.path.join(dest_stats_dir, dest_class)
    os.makedirs(class_dest_dir, exist_ok=True)
    os.makedirs(class_stats_dest_dir, exist_ok=True)

    for file in files:
        filepath = os.path.join(class_, file)
        img = cv2.imread(filepath)
        if img is None:
            # imread signals failure by returning None instead of raising.
            print(f"Skipping unreadable image: {filepath}")
            continue

        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        luminance_hist = cv2.calcHist([img_gray], [0], None, [256], [0, 256]).flatten()
        # OpenCV loads images as BGR, so on the unconverted image channel 0 is
        # blue, 1 is green and 2 is red. Indexing the raw image keeps each
        # histogram attached to the right colour name.
        hist_b = cv2.calcHist([img], [0], None, [256], [0, 256]).flatten()
        hist_g = cv2.calcHist([img], [1], None, [256], [0, 256]).flatten()
        hist_r = cv2.calcHist([img], [2], None, [256], [0, 256]).flatten()

        histograms = {
            'luminance': luminance_hist,
            'red': hist_r,
            'green': hist_g,
            'blue': hist_b
        }

        statistics = {
            name: compute_statistics(channel_hist)
            for name, channel_hist in histograms.items()
        }

        # NOTE(review): only an upper-case '.JPG' is stripped; other extensions
        # stay in the output name. Kept as-is so existing output names do not change.
        file_base = file.replace('.JPG', '')
        destination_path = os.path.join(class_dest_dir, file_base + '_histograms.pkl')
        stats_destination_path = os.path.join(class_stats_dest_dir, file_base + '_stats.pkl')

        with open(destination_path, 'wb') as f:
            pickle.dump(histograms, f)  # Save histograms using pickle

        with open(stats_destination_path, 'wb') as f:
            pickle.dump(statistics, f)  # Save statistics using pickle

In [18]:
def process_directory_structure(src_dir, dest_dir, stats_dest_dir):
    """Walk ``src_dir`` and run ``make_histogram`` on every directory that has files.

    Both output roots are created if missing. Each directory found by
    ``os.walk`` that contains at least one file is passed, together with its
    file names and the two output roots, to ``make_histogram``.

    Parameters
    ----------
    src_dir : str
        Root of the directory tree to scan.
    dest_dir : str
        Root directory for histogram output.
    stats_dest_dir : str
        Root directory for statistics output.
    """
    for output_root in (dest_dir, stats_dest_dir):
        os.makedirs(output_root, exist_ok=True)

    # Materialise the walk up front so the progress bar knows the total.
    walked = [entry for entry in os.walk(src_dir)]

    for entry in tqdm(walked, desc="Processing directories"):
        folder, filenames = entry[0], entry[2]
        if not filenames:
            continue  # nothing to process in this directory
        make_histogram(folder, filenames, dest_dir, stats_dest_dir)

In [19]:
# Run the full pipeline: scan the dataset tree and write histogram and
# statistics pickles into the two output trees.
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running elsewhere.
process_directory_structure(
    src_dir="/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Plant_leave_diseases_dataset_without_augmentation",
    dest_dir="/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms",
    stats_dest_dir="/home/heitor/USP/IC/FAPESP/code_dataset/dataset/Regular_Histograms_Stats",
)

Processing directories:   0%|          | 0/40 [00:00<?, ?it/s]