In [4]:
import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go

In [5]:
def calculate_mean_size(sizes):
    """Return the mean width and mean height of a collection of image sizes.

    Args:
        sizes: Iterable of ``(width, height)`` pairs.

    Returns:
        Tuple ``(mean_width, mean_height)``.

    Raises:
        ValueError: If ``sizes`` is empty (a mean is undefined), or if an
            entry does not unpack into exactly two values.
    """
    sizes = list(sizes)  # Materialise so the emptiness check also works for iterators
    if not sizes:
        raise ValueError("cannot compute the mean size of an empty collection of sizes")

    widths, heights = zip(*sizes)  # Transpose [(w, h), ...] into (w, ...) and (h, ...)
    mean_width = sum(widths) / len(widths)
    mean_height = sum(heights) / len(heights)
    return mean_width, mean_height

In [6]:
import plotly.graph_objects as go

def plot_size_distribution_plotly(image_sizes_dict):
    """Overlay width and height histograms for every dataset split in one figure.

    For each non-empty entry the mean width and height are printed and two
    histogram traces (one for widths, one for heights) are added to a shared
    Plotly figure, which is displayed once all entries have been processed.

    Args:
        image_sizes_dict: Mapping of label -> list of ``(width, height)``
            tuples. Entries whose list is empty are skipped.
    """
    fig = go.Figure()

    for label, sizes in image_sizes_dict.items():
        if not sizes:  # Nothing to transpose or average for an empty split
            continue

        widths, heights = zip(*sizes)  # Transpose [(w, h), ...] into (w, ...) and (h, ...)

        mean_width = sum(widths) / len(widths)
        mean_height = sum(heights) / len(heights)
        print(f"{label} - Mean Width: {mean_width:.2f}, Mean Height: {mean_height:.2f}")

        # No explicit marker colour: with several labels overlaid, a fixed
        # blue/red pair made the splits indistinguishable. Leaving the colour
        # unset lets Plotly give each trace its own colour from the active
        # colorway, and the legend names identify label and dimension.
        # Opacity is set once here so overlapping bars stay visible.
        fig.add_trace(go.Histogram(x=widths, name=f'{label} Widths', opacity=0.75))
        fig.add_trace(go.Histogram(x=heights, name=f'{label} Heights', opacity=0.75))

    # 'overlay' draws all histograms on top of each other instead of stacking them.
    fig.update_layout(title='Image Size Distribution', xaxis_title='Size', yaxis_title='Frequency', barmode='overlay')
    fig.show()


In [7]:
import plotly.graph_objects as go

def plot_separate_size_distributions(image_sizes_dict):
    """Show a separate width histogram and height histogram for each entry.

    For every non-empty entry this displays the width figure, then the height
    figure, and finally prints the mean width and height for that entry.

    Args:
        image_sizes_dict: Mapping of label -> list of ``(width, height)``
            tuples. Entries whose list is empty are skipped.
    """
    for split_name, dims in image_sizes_dict.items():
        if not dims:
            # An empty entry has nothing to plot or average.
            continue

        all_widths, all_heights = zip(*dims)

        # (values, bar colour, figure title, x-axis title) per histogram,
        # listed in display order: widths first, then heights.
        panels = (
            (all_widths, 'blue', f'{split_name} Width Distribution', 'Width'),
            (all_heights, 'red', f'{split_name} Height Distribution', 'Height'),
        )
        for values, colour, title, x_title in panels:
            figure = go.Figure()
            figure.add_trace(go.Histogram(x=values, marker_color=colour, opacity=0.75))
            figure.update_layout(title=title, xaxis_title=x_title, yaxis_title='Frequency')
            figure.show()

        # Summary statistics are printed after both figures have been shown.
        avg_width = sum(all_widths) / len(all_widths)
        avg_height = sum(all_heights) / len(all_heights)
        print(f"{split_name} - Mean Width: {avg_width:.2f}, Mean Height: {avg_height:.2f}")


In [8]:
def _read_image_sizes(directory, progress_desc=None):
    """Read the size of every image file located directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive).
        progress_desc: When given, the scan is wrapped in a tqdm progress bar
            labelled with this description.

    Returns:
        Tuple ``(sizes, skipped)`` where ``sizes`` is a list of
        ``(width, height)`` tuples in sorted filename order and ``skipped`` is
        the number of files that could not be opened as images.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    entries = sorted(os.listdir(directory))
    if progress_desc is not None:
        entries = tqdm(entries, desc=progress_desc)

    sizes = []
    skipped = 0
    for entry in entries:
        entry_path = os.path.join(directory, entry)
        if not os.path.isfile(entry_path):
            continue  # Sub-directories and other non-files are not images
        try:
            with Image.open(entry_path) as img:
                sizes.append(img.size)  # (width, height)
        except Exception:
            # Best effort: a corrupt or non-image file must not abort the scan,
            # but it is counted so the caller can report the loss.
            skipped += 1
    return sizes, skipped


def collect_image_sizes(root_dir):
    """Collect image sizes for each class sub-directory of ``root_dir``.

    Args:
        root_dir: Directory whose immediate sub-directories each hold the
            images of one class. Non-directory entries are ignored.

    Returns:
        Dict mapping each sub-directory name to a list of ``(width, height)``
        tuples for the images that could be opened. A summary line is printed
        if any files had to be skipped.
    """
    image_sizes = {}
    skipped_total = 0
    for class_dir in tqdm(sorted(os.listdir(root_dir)), desc="Classes Processed"):
        class_path = os.path.join(root_dir, class_dir)
        if not os.path.isdir(class_path):
            continue
        image_sizes[class_dir], skipped = _read_image_sizes(class_path)
        skipped_total += skipped

    if skipped_total:
        print(f"Skipped {skipped_total} unreadable file(s) under {root_dir}")
    return image_sizes


def collect_test_image_sizes(test_dir):
    """Collect image sizes for a flat directory of images.

    Args:
        test_dir: Directory that directly contains the image files.

    Returns:
        List of ``(width, height)`` tuples for the images that could be
        opened. A summary line is printed if any files had to be skipped.
    """
    sizes, skipped = _read_image_sizes(test_dir, progress_desc="Processing Test Images")
    if skipped:
        print(f"Skipped {skipped} unreadable file(s) under {test_dir}")
    return sizes


In [9]:
# Dataset locations. The relative paths are resolved against the notebook's
# working directory and are built with os.path.join so the separator matches
# the operating system (the hard-coded backslashes only worked on Windows).
train_dir = os.path.join("iith-dl-contest-2024", "train", "train")
test_dir = os.path.join("iith-dl-contest-2024", "test", "test")

# NOTE(review): the default below is an absolute path on one specific Windows
# machine. Set the IITH_VAL_DIR environment variable to point at your own
# validation folder instead of editing the notebook.
val_dir = os.environ.get("IITH_VAL_DIR", "C:\\Users\\gupta\\Downloads\\Kaggle-Data-Set\\test")

In [10]:
# Scan every split once up front. The directory walks are slow, so the cells
# below work from these in-memory results instead of touching the disk again.
train_sizes = collect_image_sizes(root_dir=train_dir)
val_sizes = collect_image_sizes(root_dir=val_dir)
test_sizes = collect_test_image_sizes(test_dir=test_dir)




Classes Processed:   0%|          | 0/50 [00:00<?, ?it/s]

Classes Processed: 100%|██████████| 50/50 [04:19<00:00,  5.18s/it]
Classes Processed: 100%|██████████| 50/50 [00:00<00:00, 50.49it/s]
Processing Test Images: 100%|██████████| 38366/38366 [02:17<00:00, 278.15it/s]


In [11]:
# Collapse the per-class lists into a single flat list of (width, height)
# tuples for each labelled split.
train_all_sizes = []
for class_sizes in train_sizes.values():
    train_all_sizes.extend(class_sizes)

val_all_sizes = []
for class_sizes in val_sizes.values():
    val_all_sizes.extend(class_sizes)

# The test split comes from a flat directory, so test_sizes is already a flat
# list and is used as-is.
sizes_dict = {}
sizes_dict["Training"] = train_all_sizes
sizes_dict["Validation"] = val_all_sizes
sizes_dict["Test"] = test_sizes

plot_size_distribution_plotly(image_sizes_dict=sizes_dict)

Training - Mean Width: 494.41, Mean Height: 423.60
Validation - Mean Width: 486.97, Mean Height: 424.94
Test - Mean Width: 182.22, Mean Height: 173.51


In [12]:
# Same data as the overlay above, but one width figure and one height figure
# per split so each distribution can be read on its own.
plot_separate_size_distributions(image_sizes_dict=sizes_dict)


Training - Mean Width: 494.41, Mean Height: 423.60


Validation - Mean Width: 486.97, Mean Height: 424.94


Test - Mean Width: 182.22, Mean Height: 173.51


In [13]:
# Spot-check the first recorded test image size, a (width, height) tuple.
sizes_dict['Test'][0]

(64, 64)