In [11]:
from PIL import Image, ImageDraw
import numpy as np
import os
import xml.etree.ElementTree as ET

In [None]:

source_path = 'Stanford40/JPEGImages/'

# enumerate all images
# collect the following statistics:
# max width, max height, min width, min height, average width, average height
# median width, median height
# for every channel in RGB, collect the mean and std for the normalization later

# Initialize variables to collect statistics
widths = []
heights = []

# for every channel in RGB, collect the mean and std for the normalization later
mean = np.zeros(3)
std = np.zeros(3)

# Iterate through all images in the source path
for filename in os.listdir(source_path):
    if filename.endswith('.jpg') or filename.endswith('.png'):
        with Image.open(os.path.join(source_path, filename)) as img:
            width, height = img.size
            widths.append(width)
            heights.append(height)
            img = np.array(img)
            mean += np.mean(img, axis=(0, 1))
            std += np.std(img, axis=(0, 1))

mean /= len(widths)
std /= len(widths)

# Calculate statistics
max_width = np.max(widths)
max_height = np.max(heights)
min_width = np.min(widths)
min_height = np.min(heights)
avg_width = np.mean(widths)
avg_height = np.mean(heights)
median_width = np.median(widths)
median_height = np.median(heights)

print(f"Max Width: {max_width}, Max Height: {max_height}")
print(f"Min Width: {min_width}, Min Height: {min_height}")
print(f"Average Width: {avg_width}, Average Height: {avg_height}")
print(f"Median Width: {median_width}, Median Height: {median_height}")
print(f"Mean: {mean}, Std: {std}")



In [5]:
# Mean: [119.40268332 112.331025   102.14789407], Std: [63.13785612 61.05932064 61.90802032]
# scale it to 0...1 range

mean /= 255
std /= 255

print(f"Mean: {mean}, Std: {std}")

Mean: [0.46824582 0.44051382 0.40057998], Std: [0.24759944 0.23944832 0.24277655]


In [7]:
dataset_path = 'C:/Users/lootman/Загрузки/data/'
images_path = os.path.join(dataset_path, 'JPEGImages')
annotations_path = os.path.join(dataset_path, 'XMLAnnotations')
bitmask_path = os.path.join(dataset_path, 'Bitmasks')

In [10]:
def get_bounding_box(image_id):
    xml_path = os.path.join(annotations_path, image_id + '.xml')
    if os.path.exists(xml_path):
        # Parse and print the contents of the XML file
        tree = ET.parse(xml_path)
        root = tree.getroot()
        bbox = root.find('object/bndbox')

        if bbox is None:
            raise ValueError(f"No bounding box found in {xml_path}")

        box = [
            float(bbox.find('xmin').text),
            float(bbox.find('ymin').text),
            float(bbox.find('xmax').text),
            float(bbox.find('ymax').text)
        ]
        size = root.find('size')
        if size is None:
            raise ValueError(f"No size found in {xml_path}")
        if size.depth is None or size.depth.text != '3':
            raise ValueError(f"Image {image_id} is not a 3-channel image")
        
        size = [
            float(size.width.text),
            float(size.height.text)
        ]

        return size, box
    else:
        print(f"XML file for {image_id} not found at {xml_path}")

In [13]:
def create_bitmask(image_size, bbox):
    """
    Create a bitmask for a given bounding box.
    :param image_size: (width, height) of the image
    :param bbox: (x_min, y_min, x_max, y_max) bounding box coordinates
    :return: Bitmask as a tensor
    """
    mask = Image.new('L', image_size, 0)  # Create a blank mask
    draw = ImageDraw.Draw(mask)
    draw.rectangle(bbox, fill=1)  # Fill the bounding box
    return mask