In [5]:
import os
import glob
from PIL import Image
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Folder holding the dataset files (path is relative to the notebook's working directory).
data_dir = '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/0325updated.task1train(626p)'

# Report banner: rule, title, rule -- one per line.
print("=" * 60, "IMAGE DATASET ANALYSIS", "=" * 60, sep="\n")

# Collect every file in data_dir whose name matches one of the image patterns.
# The result keeps the pattern order below; within one pattern the order is
# whatever glob returns.
image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.gif']
image_files = [
    path
    for pattern in image_extensions
    for path in glob.glob(os.path.join(data_dir, pattern))
]

print(f"\nTotal images found: {len(image_files)}")

IMAGE DATASET ANALYSIS

Total images found: 712


In [4]:
# Dataset audit: collect per-image statistics for every path in `image_files`
# (built by the discovery cell), then print a summary report followed by
# preprocessing recommendations.

# Label files are globbed unconditionally so `txt_files` is defined for the
# following cells even when no image was found.
txt_files = glob.glob(os.path.join(data_dir, '*.txt'))

# Per-image measurements. An entry is appended to ALL of these lists only once
# an image has been read successfully, so index i always refers to the same
# image in every list.
widths = []
heights = []
aspect_ratios = []
modes = []
file_sizes = []
channels = []

# (file name, error message) for every image that could not be analysed.
corrupted_files = []

if len(image_files) == 0:
    print("No images found! Please check the directory path.")
else:
    print("\nAnalyzing images...")

for img_path in image_files:
    try:
        file_size = os.path.getsize(img_path) / 1024  # KB
        # The context manager closes the file handle as soon as the header
        # has been read, so handles do not pile up over the whole dataset.
        with Image.open(img_path) as img:
            width, height = img.size
            mode = img.mode
            # One band per channel, e.g. 'RGB' -> 3, 'RGBA' -> 4, 'L' -> 1.
            n_channels = len(img.getbands())
        aspect_ratio = width / height
    except Exception as e:
        corrupted_files.append((os.path.basename(img_path), str(e)))
        continue

    widths.append(width)
    heights.append(height)
    aspect_ratios.append(aspect_ratio)
    modes.append(mode)
    file_sizes.append(file_size)
    channels.append(n_channels)

if widths:
    # Percentages below are relative to the images that were actually
    # analysed, so they add up to 100% even when some files were unreadable.
    n_loaded = len(widths)

    # Statistics
    print("\n" + "="*60)
    print("IMAGE STATISTICS")
    print("="*60)

    print("\nüìê Dimensions:")
    print(f"   Width  - Min: {min(widths):4d}px | Max: {max(widths):4d}px | Avg: {np.mean(widths):.1f}px")
    print(f"   Height - Min: {min(heights):4d}px | Max: {max(heights):4d}px | Avg: {np.mean(heights):.1f}px")

    print("\nüìä Aspect Ratios:")
    print(f"   Min: {min(aspect_ratios):.3f} | Max: {max(aspect_ratios):.3f} | Avg: {np.mean(aspect_ratios):.3f}")

    # Check if dimensions are consistent
    dim_counter = Counter(zip(widths, heights))
    unique_dims = set(dim_counter)
    if len(unique_dims) == 1:
        print(f"\n‚úì All images have SAME dimensions: {widths[0]}x{heights[0]}")
    else:
        print(f"\n‚ö† Images have DIFFERENT dimensions ({len(unique_dims)} unique sizes)")
        print("   Most common sizes:")
        for (w, h), count in dim_counter.most_common(5):
            print(f"     {w}x{h}: {count} images ({count/n_loaded*100:.1f}%)")

    print("\nüé® Color Modes:")
    mode_counter = Counter(modes)
    for mode, count in mode_counter.most_common():
        print(f"   {mode}: {count} images ({count/n_loaded*100:.1f}%)")

    print("\nüì¶ File Sizes:")
    print(f"   Min: {min(file_sizes):.1f} KB | Max: {max(file_sizes):.1f} KB | Avg: {np.mean(file_sizes):.1f} KB")

    if corrupted_files:
        print(f"\n‚ö† Corrupted/Problematic Images: {len(corrupted_files)}")
        for fname, error in corrupted_files[:5]:
            print(f"   - {fname}: {error}")
    else:
        print("\n‚úì All images loaded successfully!")

    # Recommendations
    print("\n" + "="*60)
    print("PREPROCESSING RECOMMENDATIONS")
    print("="*60)

    needs_resize = len(unique_dims) > 1
    needs_normalization = True
    needs_grayscale_conversion = 'RGB' in modes or 'RGBA' in modes

    print("\nüîß Required preprocessing steps:")

    step = 1
    if needs_resize:
        # Take the most frequent (width, height) PAIR. Choosing the most
        # frequent width and height independently can name a size that no
        # image in the dataset actually has.
        (common_w, common_h), _ = dim_counter.most_common(1)[0]
        print(f"\n{step}. RESIZE IMAGES ‚ö†Ô∏è")
        print(f"   Issue: {len(unique_dims)} different image sizes")
        print("   Action: Resize all images to consistent dimensions")
        print(f"   Suggested size: {common_w}x{common_h} (most common)")
        step += 1
    else:
        print(f"\n{step}. RESIZE IMAGES ‚úì")
        print(f"   All images already have consistent size: {widths[0]}x{heights[0]}")
        step += 1

    print(f"\n{step}. NORMALIZATION")
    print("   Action: Normalize pixel values to [0, 1] or [-1, 1]")
    print("   Current: Pixel values likely in [0, 255]")
    step += 1

    if needs_grayscale_conversion:
        print(f"\n{step}. COLOR MODE CONVERSION (Optional)")
        print(f"   Current modes: {', '.join(mode_counter.keys())}")
        print("   Consider: Converting to grayscale if color not needed")
        print("   Benefits: Reduces model complexity, faster training")
        step += 1

    print(f"\n{step}. DATA AUGMENTATION (Recommended)")
    print("   Techniques to consider:")
    print("   - Random rotation (¬±5-10¬∞)")
    print("   - Random brightness/contrast adjustment")
    print("   - Random scaling (90-110%)")
    print("   - Elastic distortions (for text/OCR tasks)")
    print("   Benefits: Improves model generalization")
    step += 1

    print(f"\n{step}. VERIFY IMAGE-TEXT PAIRS")
    print("   Action: Ensure each image has corresponding text file")
    print(f"   Images: {len(image_files)}")
    print(f"   Text files: {len(txt_files)}")

    # Only the totals are compared here; a by-name comparison needs the
    # basenames of both file lists.
    if len(image_files) != len(txt_files):
        print(f"   ‚ö†Ô∏è MISMATCH: {abs(len(image_files) - len(txt_files))} files difference")
    else:
        print("   ‚úì Same number of images and text files")
elif corrupted_files:
    # Files were found but none could be read: there is nothing to summarise,
    # so list the first errors instead of calling min()/max() on empty lists.
    print(f"\nNone of the {len(image_files)} images could be opened. First errors:")
    for fname, error in corrupted_files[:5]:
        print(f"   - {fname}: {error}")



Analyzing images...

IMAGE STATISTICS

üìê Dimensions:
   Width  - Min:  436px | Max: 4961px | Avg: 1269.9px
   Height - Min:  605px | Max: 7016px | Avg: 2283.6px

üìä Aspect Ratios:
   Min: 0.263 | Max: 0.971 | Avg: 0.511

‚ö† Images have DIFFERENT dimensions (500 unique sizes)
   Most common sizes:
     4961x7016: 74 images (10.4%)
     1080x1527: 17 images (2.4%)
     1080x1528: 13 images (1.8%)
     793x1373: 5 images (0.7%)
     619x1475: 4 images (0.6%)

üé® Color Modes:
   RGB: 708 images (99.4%)
   L: 4 images (0.6%)

üì¶ File Sizes:
   Min: 57.8 KB | Max: 3540.2 KB | Avg: 535.5 KB

‚úì All images loaded successfully!

PREPROCESSING RECOMMENDATIONS

üîß Required preprocessing steps:

1. RESIZE IMAGES ‚ö†Ô∏è
   Issue: 500 different image sizes
   Action: Resize all images to consistent dimensions
   Suggested size: 4961x7016 (most common)

2. NORMALIZATION
   Action: Normalize pixel values to [0, 1] or [-1, 1]
   Current: Pixel values likely in [0, 255]

3. COLOR MODE CONV

In [6]:
# Index both file lists by file name without its final extension (Path.stem),
# mapping each stem to its path. If two paths share a stem, the later one wins.
image_basenames = dict((Path(path).stem, path) for path in image_files)
txt_basenames = dict((Path(path).stem, path) for path in txt_files)

In [None]:
def _print_name_sample(names, suffix="", limit=10):
    """Print up to `limit` names in alphabetical order, then how many were left out.

    names:  collection of basenames to list.
    suffix: text appended to every printed name (e.g. ".txt").
    limit:  maximum number of names to print.
    """
    # Sort BEFORE slicing: a set has no stable order, so slicing first would
    # show an arbitrary, run-dependent selection of names.
    for basename in sorted(names)[:limit]:
        print(f"   - {basename}{suffix}")
    if len(names) > limit:
        print(f"   ... and {len(names) - limit} more")


# Find images without text
images_without_text = set(image_basenames.keys()) - set(txt_basenames.keys())
print(f"\nüì∑ Images WITHOUT corresponding text: {len(images_without_text)}")
_print_name_sample(images_without_text)

# Find text without images
text_without_images = set(txt_basenames.keys()) - set(image_basenames.keys())
print(f"\nüìù Text files WITHOUT corresponding images: {len(text_without_images)}")
_print_name_sample(text_without_images, suffix=".txt")


üì∑ Images WITHOUT corresponding text: 8
   - X51005433492(1)
   - X51005442384(1)
   - X51005605333(1)
   - X51005676539(1)
   - X51005685355(2)
   - X51005685357(2)
   - X51007339118(1)
   - X51007339647(1)

üìù Text files WITHOUT corresponding images: 131
   - X51005705804(1).txt
   - X51005719917(2).txt
   - X51005722668(1).txt
   - X51006332575(2).txt
   - X51006556838(1).txt
   - X51006557202(1).txt
   - X51006620186(1).txt
   - X51007225417(2).txt
   - X51007339166(1).txt
   - X51007339639(1).txt
   ... and 121 more


In [8]:
# Basenames present in both indexes are the image/label pairs.
matching_pairs = image_basenames.keys() & txt_basenames.keys()
print(f"\n‚úì Matching pairs: {len(matching_pairs)}")

print("\n" + "=" * 60)
print("RECOMMENDATIONS")
print("=" * 60)

# One entry per kind of orphan: (orphan set, headline, suggested options).
# An entry is printed only when its orphan set is non-empty.
orphan_advice = [
    (
        images_without_text,
        f"\n‚ö†Ô∏è {len(images_without_text)} images have no labels",
        [
            "   1. Remove these images (can't train without labels)",
            "   2. Manually create text files for them",
            "   3. Move to 'unlabeled' folder for later processing",
        ],
    ),
    (
        text_without_images,
        f"\n‚ö†Ô∏è {len(text_without_images)} text files have no images",
        [
            "   1. Remove orphaned text files (most common)",
            "   2. Check if images exist with different extensions",
            "   3. Move to 'orphaned_files' folder",
        ],
    ),
]

for orphans, headline, options in orphan_advice:
    if not orphans:
        continue
    print(headline)
    print("   Options:")
    for option in options:
        print(option)


‚úì Matching pairs: 704

RECOMMENDATIONS

‚ö†Ô∏è 8 images have no labels
   Options:
   1. Remove these images (can't train without labels)
   2. Manually create text files for them
   3. Move to 'unlabeled' folder for later processing

‚ö†Ô∏è 131 text files have no images
   Options:
   1. Remove orphaned text files (most common)
   2. Check if images exist with different extensions
   3. Move to 'orphaned_files' folder
