In this notebook we use the bounding boxes given with the [Imagenet data set](https://www.kaggle.com/c/imagenet-object-localization-challenge) to generate a new sub-set. From the original data set `Imagenet_full` by only keeping the smallest square comprising the bounding boxe we generate `Imagenet_bbox` sub-set




In [2]:
import retinoto_py as fovea
args = fovea.Params(batch_size=1, shuffle=False)
args


Params(image_size=224, do_mask=False, do_fovea=False, rs_min=0.0, rs_max=-5.0, padding_mode='zeros', seed=2018, batch_size=1, num_workers=4, in_memory=True, model_name='resnet101', do_scratch=False, num_epochs=20, n_train_stop=40960, n_val_stop=5120, do_full_training=False, lr=1e-05, delta1=0.05, delta2=0.0, weight_decay=0.0, label_smoothing=0.0, shuffle=False, verbose=True)

## reading localisation metadata

First for the 'train' dataset:

In [None]:
folder = 'train'
annotation_file = args.DATAROOT / f'LOC_{folder}_solution.csv'

with open(annotation_file, 'r') as csv_file:
    df_data = fovea.pd.read_csv(csv_file)
df_data.head()

In [None]:
def get_boxes(df, value):
    idx = list(df['ImageId'][df['ImageId'] == value].index)
    bboxes = []
    if idx:
        for i in range(len(df["PredictionString"][idx[0]].split(' '))//5):
            pos =(5*i)
            bboxes.append({'xmin' : int(df["PredictionString"][idx[0]].split(' ')[1 + pos]),
                           'ymin' : int(df["PredictionString"][idx[0]].split(' ')[2 + pos]),
                           'xmax' : int(df["PredictionString"][idx[0]].split(' ')[3 + (5 *i)]),
                           'ymax' : int(df["PredictionString"][idx[0]].split(' ')[4 + (5 *i)])
                                        })
    return bboxes

In [None]:
get_boxes(df_data, 'n02099849_2300')

In [None]:
get_boxes(df_data, 'n01440764_32420')

Now for the 'val' dataset:

In [None]:
folder = 'val'
annotation_file = args.DATAROOT / f'LOC_{folder}_solution.csv'

with open(annotation_file, 'r') as csv_file:
    df_data = fovea.pd.read_csv(csv_file)
df_data.head()

In [None]:
get_boxes(df_data, 'ILSVRC2012_val_00026171')

In [None]:
from pathlib import Path

# Répertoire des images de validation (adapter si nécessaire)
val_dir = args.DATAROOT / 'Imagenet_full' / 'val'
N_show = 12
# Choisir quelques images (ici les 8 premières ImageId présentes dans df_data)
image_ids = df_data['ImageId'].unique()[:N_show].tolist()
image_ids

In [None]:
from matplotlib.patches import Rectangle
from PIL import Image

n = len(image_ids)
cols = 4
rows = (n + cols - 1) // cols

fig, axes = fovea.plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
axes = axes.flatten()

for ax_idx, imgid in enumerate(image_ids):
    ax = axes[ax_idx]
    # trouver le fichier d'image correspondant (quelque soit l'extension)
    files = list(val_dir.rglob(f'{imgid}.*'))
    if not files:
        ax.set_title(f'{imgid} not found')
        ax.axis('off')
        continue
    img_path = files[0]
    img = Image.open(img_path).convert('RGB')
    ax.imshow(img)

    # récupérer les boîtes depuis df_data via la fonction get_boxes définie plus haut
    boxes = get_boxes(df_data, imgid)
    for b in boxes:
        xmin, ymin, xmax, ymax = b['xmin'], b['ymin'], b['xmax'], b['ymax']
        w = xmax - xmin
        h = ymax - ymin
        rect = Rectangle((xmin, ymin), w, h, linewidth=2, edgecolor='red', facecolor='none')
        ax.add_patch(rect)

    ax.set_title(imgid)
    ax.axis('off')

# masquer les axes restants si nécessaire
for i in range(len(image_ids), rows*cols):
    axes[i].axis('off')

fovea.plt.tight_layout()
fovea.plt.show()

## cropping and padding images


In [None]:

def bbox_to_square(xmin, ymin, xmax, ymax, margin: int = 0):
    """Retourne (cx, cy, size) du plus petit carré contenant la bbox + marge (en pixels)."""
    w = xmax - xmin
    h = ymax - ymin
    size = int(round(max(w, h))) + int(margin)
    cx = (xmin + xmax) / 2.0
    cy = (ymin + ymax) / 2.0
    return cx, cy, size


def crop_square_around_bbox(img, bbox, output_size=None, margin:int=0, fill=0):
    """
    Crop (ou pad) un carré autour de bbox.
    - img: PIL.Image ou torch.Tensor (C,H,W)
    - bbox: (xmin, ymin, xmax, ymax) en pixels (image coordinates)
    - output_size: None -> taille = taille du carré englobant ; sinon int ou (w,h)
    - margin: pixels à ajouter au carré avant crop
    - fill: valeur de remplissage pour le padding (int ou tuple)
    Retourne le patch (PIL.Image ou torch.Tensor selon l'input).
    """
    xmin, ymin, xmax, ymax = bbox
    cx, cy, size = bbox_to_square(xmin, ymin, xmax, ymax, margin=margin)

    if output_size is None:
        ow = oh = size
    elif isinstance(output_size, int):
        ow = oh = output_size
    else:
        ow, oh = output_size

    left = int(round(cx - ow / 2))
    top = int(round(cy - oh / 2))
    right = left + ow
    bottom = top + oh

    # PIL handling
    if isinstance(img, fovea.Image.Image):
        pad_left = max(0, -left)
        pad_top = max(0, -top)
        pad_right = max(0, right - img.width)
        pad_bottom = max(0, bottom - img.height)
        if any((pad_left, pad_top, pad_right, pad_bottom)):
            img = fovea.TF.pad(img, (pad_left, pad_top, pad_right, pad_bottom), fill=fill)
            left += pad_left
            top += pad_top
        return fovea.TF.crop(img, top, left, oh, ow)

    # Tensor handling (C,H,W)
    if isinstance(img, fovea.torch.Tensor):
        _, H, W = img.shape
        pad_left = max(0, -left)
        pad_top = max(0, -top)
        pad_right = max(0, right - W)
        pad_bottom = max(0, bottom - H)
        if any((pad_left, pad_top, pad_right, pad_bottom)):
            # TF.pad accepts tensors; padding tuple is (left, top, right, bottom)
            img = fovea.TF.pad(img, (pad_left, pad_top, pad_right, pad_bottom), fill=fill)
            left += pad_left
            top += pad_top
        return img[:, top:top + oh, left:left + ow]

    # fallback: convert to PIL and ré-appeler
    img_pil = fovea.TF.to_pil_image(img)
    return crop_square_around_bbox(img_pil, bbox, output_size=(ow,oh), margin=0, fill=fill)


In [None]:
fig, axes = fovea.plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
axes = axes.flatten()

rel_margin = .1 # margin computed relative to the diagonal length of the bnounding box

for ax_idx, imgid in enumerate(image_ids):
    ax = axes[ax_idx]
    # trouver le fichier d'image correspondant (quelque soit l'extension)
    files = list(val_dir.rglob(f'{imgid}.*'))
    if not files:
        ax.set_title(f'{imgid} not found')
        ax.axis('off')
        continue
    img_path = files[0]
    img = fovea.Image.open(img_path).convert('RGB')
    boxes = get_boxes(df_data, imgid)
    b = boxes[0]
    xmin, ymin, xmax, ymax = b['xmin'], b['ymin'], b['xmax'], b['ymax']
    margin = int( fovea.np.sqrt(((xmax-xmin)**2 + (ymax-ymin)**2)) * rel_margin )
    img_crop = crop_square_around_bbox(img, [xmin, ymin, xmax, ymax], margin=margin)
    ax.imshow(img_crop)

    ax.set_title(imgid)
    ax.axis('off')

# masquer les axes restants si nécessaire
for i in range(len(image_ids), rows*cols):
    axes[i].axis('off')

fovea.plt.tight_layout()
fovea.plt.show()

## Building the new dataset


In [None]:
def clean_list(list_dir, EXCLUDED_FILES={'.DS_Store', '.ipynb_checkpoints'}):
    return [ p for p in list_dir if p.is_file() and p.name not in EXCLUDED_FILES  ]
    
FULL_DATA_DIR = args.DATAROOT / 'Imagenet_full'
src_root = FULL_DATA_DIR / 'val'
for img_path in clean_list(list(src_root.rglob('*.*'))):
    print(f'File {img_path} decomposes into a class_id = {img_path.parent.name} and a imgid = {img_path.stem}')
    break

In [None]:
FULL_DATA_DIR = args.DATAROOT / 'Imagenet_full'
BBOX_DATA_DIR = args.DATAROOT / 'Imagenet_bbox'
BBOX_DATA_DIR.mkdir(exist_ok=True)
IMG_EXTS = {'.jpg', '.jpeg', '.JPEG', '.png', '.bmp'}

# parameters for the new dataset
format = 'png' # lossless encoding
rel_margin = .1 # margin computed relative to the diagonal length of the bnounding box

for folder in ['val', 'train']:
# for folder in ['train']:
    print(f'\n Scanning folder "{folder}"')
    annotation_file = args.DATAROOT / f'LOC_{folder}_solution.csv'
    with open(annotation_file, 'r') as csv_file:
        df_data = fovea.pd.read_csv(csv_file)

    src_root = FULL_DATA_DIR / folder
    tgt_root = BBOX_DATA_DIR / folder
    tgt_root.mkdir(parents=True, exist_ok=True)

    count_in = 0
    count_out = 0

    # parcours récursif avec pathlib
    for img_path in fovea.tqdm(clean_list(list(src_root.rglob('*.*')))):
        if not img_path.is_file() or img_path.suffix not in IMG_EXTS:
            print(f'File {img_path} is detected as an invalid image.')
            continue

        count_in += 1
        imgid = img_path.stem
        # Get the list of bounding boxes for a specific image. If that list is empty (meaning no boxes were found for this image), then skip to the next image and don't run any code that comes after this line.
        boxes = get_boxes(df_data, imgid)
        if not boxes:
            continue

        class_id = img_path.parent.name
        target_folder = tgt_root / class_id
        target_folder.mkdir(parents=True, exist_ok=True)

        original_image = None
        for i_obj, b in enumerate(boxes):
            no = '' if i_obj == 0 else f'_{i_obj}'
            out_path = target_folder / f'{imgid}{no}.{format}'
            if out_path.is_file():
                # the file already exists let's skip it
                count_out += 1
                continue

            if original_image is None:
                try:
                    original_image = Image.open(img_path).convert('RGB')
                except Exception as e:
                    print(f'  could not open {img_path}: {e}')
                    break

            xmin, ymin, xmax, ymax = b['xmin'], b['ymin'], b['xmax'], b['ymax']
            margin = int( fovea.np.sqrt(((xmax-xmin)**2 + (ymax-ymin)**2)) * rel_margin )
            crop_image = crop_square_around_bbox(original_image, (xmin, ymin, xmax, ymax), margin=margin)
            crop_image.save(out_path, format='PNG')
            count_out += 1

    print(f' - in: {count_in} / out: {count_out}')

## Showcasing the new dataset


In [None]:
val_dir = args.DATAROOT / 'Imagenet_bbox' / 'val'

fig, axes = fovea.plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
axes = axes.flatten()

for ax_idx, imgid in enumerate(image_ids):
    ax = axes[ax_idx]
    # trouver le fichier d'image correspondant (quelque soit l'extension)
    files = list(val_dir.rglob(f'{imgid}.*'))
    if not files:
        ax.set_title(f'{imgid} not found')
        ax.axis('off')
        continue
    img_path = files[0]
    img = Image.open(img_path).convert('RGB')
    ax.imshow(img)

    ax.set_title(imgid)
    ax.axis('off')

# masquer les axes restants si nécessaire
for i in range(len(image_ids), rows*cols):
    axes[i].axis('off')

fovea.plt.tight_layout()
fovea.plt.show()

## Debugging the new dataset for `from torchvision.io import read_image`



In [None]:
import retinoto_py as fovea
args = fovea.Params(batch_size=1, shuffle=False)
args

from PIL import Image
Image.MAX_TEXT_CHUNK = 10 * 1024 * 1024   # 10 MiB (choose a value > largest chunk)

from pathlib import Path

def clean_png(image_path):
    """Re-encode PNG cleanly without ICC profile"""
    try:
        img = Image.open(image_path)
        # Convert to RGB if needed, save cleanly
        if img.mode == 'RGBA':
            img = img.convert('RGB')
        img.save(image_path, 'PNG')
        return True
    except Exception as e:
        print(f"Error: {e}")
        return False

for data_dir in [args.DATAROOT / 'Imagenet_bbox' / 'val', args.DATAROOT / 'Imagenet_bbox' / 'train']:
    for png_file in fovea.tqdm(list(data_dir.rglob("*.png"))):
        clean_png(str(png_file))
        # print(f"Cleaned: {png_file}")

In [None]:
from pathlib import Path
def inspect_png_chunks(filepath):
    """Read PNG chunks and report text chunk sizes"""
    with open(filepath, 'rb') as f:
        # PNG signature
        if f.read(8) != b'\x89PNG\r\n\x1a\n':
            return None
        
        chunks = []
        while True:
            length_bytes = f.read(4)
            if len(length_bytes) < 4:
                break
            
            chunk_len = int.from_bytes(length_bytes, 'big')
            chunk_type = f.read(4).decode('ascii', errors='ignore')
            chunk_data = f.read(chunk_len)
            crc = f.read(4)
            
            chunks.append((chunk_type, chunk_len))
            
            if chunk_type == 'IEND':
                break
        
        return chunks

# # Find problematic files
# for png_file in data_dir.rglob("*.png"):
#     chunks = inspect_png_chunks(png_file)
#     if chunks:
#         for chunk_type, chunk_len in chunks:
#             if chunk_type in ['tEXt', 'zTXt', 'iTXt'] and chunk_len > 128 * 1024:
#                 print(f"{png_file}: {chunk_type} chunk = {chunk_len / 1024:.1f} KiB")

def find_large_text_chunks(directory, size_limit=128*1024):
    problematic = []
    for png_file in Path(directory).rglob("*.png"):
        chunks = inspect_png_chunks(png_file)
        if chunks:
            for chunk_type, chunk_len in chunks:
                if chunk_type in ['tEXt', 'zTXt', 'iTXt'] and chunk_len > size_limit:
                    problematic.append({
                        'file': str(png_file),
                        'chunk_type': chunk_type,
                        'size_kb': chunk_len / 1024
                    })
    return problematic

# Find all problematic files
for data_dir in [args.DATAROOT / 'Imagenet_bbox' / 'val', args.DATAROOT / 'Imagenet_bbox' / 'train']:
    issues = find_large_text_chunks(data_dir)
    for issue in issues:
        print(f"{issue['file']}: {issue['chunk_type']} = {issue['size_kb']:.1f} KiB")

: 

In [None]:
import os
from PIL import Image

def strip_icc_profiles(img_path):
    if img_path.name.lower().endswith('.png'):
        try:
            img = Image.open(img_path)

            # Remove ICC profile if it exists
            if 'icc_profile' in img.info:
                img.info.pop('icc_profile')

            img.save(img_path)
        except Exception as e:
            print(f"Error processing {img_path}: {e}")

for data_dir in [args.DATAROOT / 'Imagenet_bbox' / 'val', args.DATAROOT / 'Imagenet_bbox' / 'train']:
    for png_file in fovea.tqdm(list(data_dir.rglob("*.png"))):
        strip_icc_profiles(png_file)

In [None]:
# def convert_png_to_jpg(image_path):
#     """Re-encode PNG cleanly without ICC profile"""
#     try:
#         img = Image.open(image_path).convert('RGB')
#         imgid = img_path.stem
#         class_id = img_path.parent.name
#         target_folder = tgt_root / class_id
#         out_path = target_folder / f'{imgid}.jpg'        
#         # https://pc-pillow.readthedocs.io/en/latest/Image_class/Image_save.html
#         img.save(out_path, quality=95)
#         image_path.unlink()
#         return True
#     except Exception as e:
#         print(f"Error: {e}")
#         return False

# for data_dir in [args.DATAROOT / 'Imagenet_bbox' / 'val', args.DATAROOT / 'Imagenet_bbox' / 'train']:
#     for png_file in fovea.tqdm(list(data_dir.rglob("*.png"))):
#         convert_png_to_jpg(str(png_file))
#         # print(f"Cleaned: {png_file}")

In [21]:
"""
Fix malformed ICC profiles in PNG files by re-encoding them.
Run this once on your ImageNet dataset to permanently remove the warnings.
"""

from PIL import Image
from pathlib import Path
from joblib import Parallel, delayed
from tqdm import tqdm

def fix_png_profile(image_path):
    """
    Remove ICC profile from a single PNG by re-encoding it.
    Returns True if successful, False otherwise.
    """
    try:
        img = Image.open(image_path)
        
        # Convert RGBA to RGB if needed
        if img.mode == 'RGBA':
            rgb_img = Image.new('RGB', img.size, (255, 255, 255))
            rgb_img.paste(img, mask=img.split()[3])
            img = rgb_img
        elif img.mode != 'RGB':
            img = img.convert('RGB')
        
        # Save without ICC profile - this removes the malformed metadata
        temp_path = image_path.with_suffix('.temp.png')
        img.save(temp_path, 'PNG')
        temp_path.replace(image_path)
        return True
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return False

def fix_all_pngs(root_dir, num_workers=-1):
    """
    Recursively find and fix all PNG files in a directory.
    
    Parameters
    ----------
    root_dir : Path
        Root directory containing PNG files
    num_workers : int
        Number of parallel workers for processing
    """
    png_files = list(root_dir.rglob("*.png"))
    
    print(f"Found {len(png_files)} PNG files to process")
    
    if len(png_files) == 0:
        print("No PNG files found!")
        return
    
    # Process in parallel with progress bar
    results = Parallel(n_jobs=num_workers)(
        delayed(fix_png_profile)(f) for f in tqdm(png_files, desc="Fixing PNG profiles")
    )
    
    successful = sum(results)
    print(f"\nSuccessfully fixed {successful}/{len(png_files)} files")


for data_dir in [args.DATAROOT / 'Imagenet_bbox' / 'val', args.DATAROOT / 'Imagenet_bbox' / 'train']:
    print(f"\nProcessing: {data_dir}")
    fix_all_pngs(data_dir, num_workers=-1)


Processing: /Users/laurent/data/Imagenet/Imagenet_bbox/val
Found 80475 PNG files to process


Fixing PNG profiles: 100%|██████████| 80475/80475 [00:30<00:00, 2660.78it/s]



Successfully fixed 80475/80475 files

Processing: /Users/laurent/data/Imagenet/Imagenet_bbox/train
Found 615298 PNG files to process


Fixing PNG profiles:  23%|██▎       | 139200/615298 [00:56<03:28, 2284.28it/s]

Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n02087394/n02087394_22856.png: cannot identify image file '/Users/laurent/data/Imagenet/Imagenet_bbox/train/n02087394/n02087394_22856.png'


Fixing PNG profiles:  56%|█████▌    | 343104/615298 [02:30<01:37, 2802.04it/s]

Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n04536866/n04536866_12128_2.png: Decompressed data too large for PngImagePlugin.MAX_TEXT_CHUNK
Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n04536866/n04536866_12128.png: Decompressed data too large for PngImagePlugin.MAX_TEXT_CHUNK
Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n04536866/n04536866_12128_1.png: Decompressed data too large for PngImagePlugin.MAX_TEXT_CHUNK
Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n04536866/n04536866_12128_3.png: Decompressed data too large for PngImagePlugin.MAX_TEXT_CHUNK


Fixing PNG profiles:  95%|█████████▍| 583360/615298 [04:14<00:11, 2720.10it/s]

Error processing /Users/laurent/data/Imagenet/Imagenet_bbox/train/n02119789/n02119789_17624.png: cannot identify image file '/Users/laurent/data/Imagenet/Imagenet_bbox/train/n02119789/n02119789_17624.png'


Fixing PNG profiles: 100%|██████████| 615298/615298 [04:27<00:00, 2301.73it/s]



Successfully fixed 615292/615298 files


Voilà !