In [10]:
import os
import glob
from PIL import Image
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from pathlib import Path

In [11]:
data_dir = '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/0325updated.task1train(626p)'

print("="*60)
print("IMAGE DATASET ANALYSIS")
print("="*60)

# Find all image files
image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.gif']
image_files = []
for ext in image_extensions:
    image_files.extend(glob.glob(os.path.join(data_dir, ext)))

print(f"\nTotal images found: {len(image_files)}")

IMAGE DATASET ANALYSIS

Total images found: 712


In [None]:
if len(image_files) == 0:
    print("No images found! Please check the directory path.")
else:
    # Analyze images
    print("\nAnalyzing images...")
    
    widths = []
    heights = []
    aspect_ratios = []
    modes = []
    file_sizes = []
    channels = []
    
    corrupted_files = []
    
    for img_path in image_files:
        try:
            # Get file size
            file_size = os.path.getsize(img_path) / 1024  
            file_sizes.append(file_size)
            
            # Open and analyze image
            img = Image.open(img_path)
            widths.append(img.width)
            heights.append(img.height)
            aspect_ratios.append(img.width / img.height)
            modes.append(img.mode)
            
            # Count channels
            if img.mode == 'RGB':
                channels.append(3)
            elif img.mode == 'RGBA':
                channels.append(4)
            elif img.mode == 'L':
                channels.append(1)
            else:
                channels.append(len(img.getbands()))
                
        except Exception as e:
            corrupted_files.append((os.path.basename(img_path), str(e)))
    
    # Statistics
    print("\n" + "="*60)
    print("IMAGE STATISTICS")
    print("="*60)
    
    print(f"\nüìê Dimensions:")
    print(f"   Width  - Min: {min(widths):4d}px | Max: {max(widths):4d}px | Avg: {np.mean(widths):.1f}px")
    print(f"   Height - Min: {min(heights):4d}px | Max: {max(heights):4d}px | Avg: {np.mean(heights):.1f}px")
    
    print(f"\nüìä Aspect Ratios:")
    print(f"   Min: {min(aspect_ratios):.3f} | Max: {max(aspect_ratios):.3f} | Avg: {np.mean(aspect_ratios):.3f}")
    
    # Check if dimensions are consistent
    unique_dims = set(zip(widths, heights))
    if len(unique_dims) == 1:
        print(f"\n‚úì All images have SAME dimensions: {widths[0]}x{heights[0]}")
    else:
        print(f"\n‚ö† Images have DIFFERENT dimensions ({len(unique_dims)} unique sizes)")
        print(f"   Most common sizes:")
        dim_counter = Counter(zip(widths, heights))
        for (w, h), count in dim_counter.most_common(5):
            print(f"     {w}x{h}: {count} images ({count/len(image_files)*100:.1f}%)")
    
    print(f"\nüé® Color Modes:")
    mode_counter = Counter(modes)
    for mode, count in mode_counter.most_common():
        print(f"   {mode}: {count} images ({count/len(image_files)*100:.1f}%)")
    
    print(f"\nüì¶ File Sizes:")
    print(f"   Min: {min(file_sizes):.1f} KB | Max: {max(file_sizes):.1f} KB | Avg: {np.mean(file_sizes):.1f} KB")
    
    if corrupted_files:
        print(f"\n‚ö† Corrupted/Problematic Images: {len(corrupted_files)}")
        for fname, error in corrupted_files[:5]:
            print(f"   - {fname}: {error}")
    else:
        print(f"\n‚úì All images loaded successfully!")
    
    # Recommendations
    print("\n" + "="*60)
    print("PREPROCESSING RECOMMENDATIONS")
    print("="*60)
    
    needs_resize = len(unique_dims) > 1
    needs_normalization = True
    needs_grayscale_conversion = 'RGB' in modes or 'RGBA' in modes
    
    print("\nüîß Required preprocessing steps:")
    
    step = 1
    if needs_resize:
        print(f"\n{step}. RESIZE IMAGES ‚ö†Ô∏è")
        print(f"   Issue: {len(unique_dims)} different image sizes")
        print(f"   Suggested size: {max(set(widths), key=widths.count)}x{max(set(heights), key=heights.count)} (most common)")
        step += 1
    else:
        print(f"\n{step}. RESIZE IMAGES ‚úì")
        print(f"   All images already have consistent size: {widths[0]}x{heights[0]}")
        step += 1
    
    print(f"\n{step}. NORMALIZATION")
    step += 1
    
    if needs_grayscale_conversion:
        print(f"\n{step}. COLOR MODE CONVERSION (Optional)")
        print(f"   Current modes: {', '.join(mode_counter.keys())}")
        step += 1
    
    print(f"\n{step}. DATA AUGMENTATION (Recommended)")
    step += 1
    
    print(f"\n{step}. VERIFY IMAGE-TEXT PAIRS")
    print(f"   Action: Ensure each image has corresponding text file")
    print(f"   Images: {len(image_files)}")
    
    # Check for matching text files
    txt_files = glob.glob(os.path.join(data_dir, '*.txt'))
    print(f"   Text files: {len(txt_files)}")
    
    if len(image_files) != len(txt_files):
        print(f"   ‚ö†Ô∏è MISMATCH: {abs(len(image_files) - len(txt_files))} files difference")
    else:
        print(f"   ‚úì Same number of images and text files")



Analyzing images...

IMAGE STATISTICS

üìê Dimensions:
   Width  - Min:  436px | Max: 4961px | Avg: 1269.9px
   Height - Min:  605px | Max: 7016px | Avg: 2283.6px

üìä Aspect Ratios:
   Min: 0.263 | Max: 0.971 | Avg: 0.511

‚ö† Images have DIFFERENT dimensions (500 unique sizes)
   Most common sizes:
     4961x7016: 74 images (10.4%)
     1080x1527: 17 images (2.4%)
     1080x1528: 13 images (1.8%)
     793x1373: 5 images (0.7%)
     619x1475: 4 images (0.6%)

üé® Color Modes:
   RGB: 708 images (99.4%)
   L: 4 images (0.6%)

üì¶ File Sizes:
   Min: 57.8 KB | Max: 3540.2 KB | Avg: 535.5 KB

‚úì All images loaded successfully!

PREPROCESSING RECOMMENDATIONS

üîß Required preprocessing steps:

1. RESIZE IMAGES ‚ö†Ô∏è
   Issue: 500 different image sizes
   Suggested size: 4961x7016 (most common)

2. NORMALIZATION

3. COLOR MODE CONVERSION (Optional)
   Current modes: RGB, L

4. DATA AUGMENTATION (Recommended)

5. VERIFY IMAGE-TEXT PAIRS
   Action: Ensure each image has corresponding

In [13]:
# Extract basenames (without extensions)
image_basenames = {Path(img).stem: img for img in image_files}
txt_basenames = {Path(txt).stem: txt for txt in txt_files}

In [14]:
#  images without text
images_without_text = set(image_basenames.keys()) - set(txt_basenames.keys())
print(f"\nüì∑ Images WITHOUT corresponding text: {len(images_without_text)}")
if images_without_text:
    for basename in sorted(list(images_without_text)[:10]):
        print(f"   - {basename}")
    if len(images_without_text) > 10:
        print(f"   ... and {len(images_without_text) - 10} more")

# Find text without images
text_without_images = set(txt_basenames.keys()) - set(image_basenames.keys())
print(f"\nüìù Text files WITHOUT corresponding images: {len(text_without_images)}")
if text_without_images:
    for basename in sorted(list(text_without_images)[:10]):
        print(f"   - {basename}.txt")
    if len(text_without_images) > 10:
        print(f"   ... and {len(text_without_images) - 10} more")


üì∑ Images WITHOUT corresponding text: 8
   - X51005433492(1)
   - X51005442384(1)
   - X51005605333(1)
   - X51005676539(1)
   - X51005685355(2)
   - X51005685357(2)
   - X51007339118(1)
   - X51007339647(1)

üìù Text files WITHOUT corresponding images: 131
   - X51005447850(1).txt
   - X51005587254(3).txt
   - X51005587267(3).txt
   - X51005705804(2).txt
   - X51006334699(3).txt
   - X51006556646(1).txt
   - X51006556840(1).txt
   - X51006557213(1).txt
   - X51007339643(1).txt
   - X51008099049(1).txt
   ... and 121 more


In [15]:
# Find matching pairs
matching_pairs = set(image_basenames.keys()) & set(txt_basenames.keys())
print(f"\n‚úì Matching pairs: {len(matching_pairs)}")

print(f"\n" + "="*60)
print("RECOMMENDATIONS")
print("="*60)

if images_without_text:
    print(f"\n {len(images_without_text)} images have no labels")


if text_without_images:
    print(f"\n {len(text_without_images)} text files have no images")



‚úì Matching pairs: 704

RECOMMENDATIONS

 8 images have no labels

 131 text files have no images


In [14]:
images_without_text

{'X51005433492(1)',
 'X51005442384(1)',
 'X51005605333(1)',
 'X51005676539(1)',
 'X51005685355(2)',
 'X51005685357(2)',
 'X51007339118(1)',
 'X51007339647(1)'}

In [10]:
text_without_images

{'X51005361946(1)',
 'X51005442361(1)',
 'X51005447839(1)',
 'X51005447840(1)',
 'X51005447848(1)',
 'X51005447850(1)',
 'X51005447852(1)',
 'X51005568900(1)',
 'X51005587254(1)',
 'X51005587254(2)',
 'X51005587254(3)',
 'X51005587254(4)',
 'X51005587254(5)',
 'X51005587267(1)',
 'X51005587267(2)',
 'X51005587267(3)',
 'X51005587267(4)',
 'X51005587267(5)',
 'X51005605284(5)',
 'X51005605285(4)',
 'X51005605285(5)',
 'X51005605286(4)',
 'X51005605286(5)',
 'X51005663317(1)',
 'X51005677329(1)',
 'X51005677329(2)',
 'X51005677331(2)',
 'X51005677332(2)',
 'X51005677335(2)',
 'X51005705722(2)',
 'X51005705760(1)',
 'X51005705760(2)',
 'X51005705804(1)',
 'X51005705804(2)',
 'X51005719912(1)',
 'X51005719912(2)',
 'X51005719914(1)',
 'X51005719914(2)',
 'X51005719917(1)',
 'X51005719917(2)',
 'X51005722668(1)',
 'X51005722668(2)',
 'X51005722699(1)',
 'X51005757294(1)',
 'X51005757304(1)',
 'X51005757323(1)',
 'X51005757324(1)',
 'X51005757346(1)',
 'X51005806678(1)',
 'X51005806678(2)',


In [16]:
import shutil
def organize_data_files():
    parent_dir = os.path.dirname(data_dir)
    good_data_dir = os.path.join(parent_dir, 'good_data')
    bad_data_dir = os.path.join(parent_dir, 'bad_data')
    
    # Creating output directories if they don't exist
    os.makedirs(good_data_dir, exist_ok=True)
    os.makedirs(bad_data_dir, exist_ok=True)
    
    # Getting all files in the data directory
    all_files = os.listdir(data_dir)
    
    # Separating image and text files
    image_files = [f for f in all_files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))]
    text_files = [f for f in all_files if f.lower().endswith('.txt')]
    
    # Creating sets of base names (without extensions)
    image_basenames = {os.path.splitext(f)[0] for f in image_files}
    text_basenames = {os.path.splitext(f)[0] for f in text_files}
    
    # Finding matching pairs and orphans
    matching_pairs = image_basenames & text_basenames
    orphan_images = image_basenames - text_basenames
    orphan_texts = text_basenames - image_basenames
    
    print(f"Total images: {len(image_files)}")
    print(f"Total text files: {len(text_files)}")
    print(f"Matching pairs: {len(matching_pairs)}")
    print(f"Orphan images: {len(orphan_images)}")
    print(f"Orphan text files: {len(orphan_texts)}")
    
    # Copying matching pairs to good_data directory
    for base_name in matching_pairs:
        # Finding the actual files with extensions
        image_file = next((f for f in image_files if os.path.splitext(f)[0] == base_name), None)
        text_file = next((f for f in text_files if os.path.splitext(f)[0] == base_name), None)
        
        if image_file and text_file:
            shutil.copy2(os.path.join(data_dir, image_file), good_data_dir)
            shutil.copy2(os.path.join(data_dir, text_file), good_data_dir)
    
    # Copying orphan images to Bad_data directory
    for base_name in orphan_images:
        image_file = next((f for f in image_files if os.path.splitext(f)[0] == base_name), None)
        if image_file:
            shutil.copy2(os.path.join(data_dir, image_file), bad_data_dir)
    
    # Copying orphan text files to Bad_data directory
    for base_name in orphan_texts:
        text_file = next((f for f in text_files if os.path.splitext(f)[0] == base_name), None)
        if text_file:
            shutil.copy2(os.path.join(data_dir, text_file), bad_data_dir)
    
    # Verifing the results
    good_images = len([f for f in os.listdir(good_data_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))])
    good_texts = len([f for f in os.listdir(good_data_dir) if f.lower().endswith('.txt')])
    bad_images = len([f for f in os.listdir(bad_data_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))])
    bad_texts = len([f for f in os.listdir(bad_data_dir) if f.lower().endswith('.txt')])
    
    print("\n=== Organization Complete ===")
    print(f"Good_data directory: {good_images} images, {good_texts} text files")
    print(f"Bad_data directory: {bad_images} images, {bad_texts} text files")
    
    return {
        'matching_pairs': len(matching_pairs),
        'orphan_images': len(orphan_images),
        'orphan_texts': len(orphan_texts)
    }


if __name__ == "__main__":
    print("\nOrganizing files...")
    results = organize_data_files()
    
    print(f"\n=== FINAL RESULTS ===")
    print(f"Matching pairs: {results['matching_pairs']}")
    print(f"Orphan images: {results['orphan_images']}")
    print(f"Orphan text files: {results['orphan_texts']}")


Organizing files...
Total images: 712
Total text files: 835
Matching pairs: 704
Orphan images: 8
Orphan text files: 131

=== Organization Complete ===
Good_data directory: 704 images, 704 text files
Bad_data directory: 8 images, 131 text files

=== FINAL RESULTS ===
Matching pairs: 704
Orphan images: 8
Orphan text files: 131


In [17]:
good_data_dir = os.path.join(os.path.dirname(data_dir),"good_data")

In [18]:
import os
import numpy as np
from PIL import Image
import shutil
from pathlib import Path
from collections import Counter

def resize_images_grayscale_optimized(input_dir, target_height=64, max_width=512):
    """
    Resize images to grayscale for optimal text recognition
    """
    
    # Create output directory
    output_dir = os.path.join(os.path.dirname(input_dir), 'grayscale_standardized_data')
    
    # Clean previous output
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    
    # Get all image files
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif')
    image_files = [f for f in os.listdir(input_dir) 
                  if f.lower().endswith(image_extensions)]
    
    print(f"Processing {len(image_files)} images ‚Üí Grayscale + Standardized")
    print(f"Target: Height={target_height}px, Max Width={max_width}px")
    
    stats = {
        'processed': 0,
        'widths': [],
        'heights': [],
        'aspect_ratios': [],
        'original_sizes': [],
        'new_sizes': []
    }
    
    for image_file in image_files:
        try:
            image_path = os.path.join(input_dir, image_file)
            
            # Open image and get original stats
            with Image.open(image_path) as img:
                original_width, original_height = img.size
                stats['original_sizes'].append((original_width, original_height))
                
                # Convert to grayscale
                grayscale_img = img.convert('L')
                
                # Calculate new dimensions maintaining aspect ratio
                new_height = target_height
                new_width = int(original_width * (target_height / original_height))
                
                # Apply width constraint if needed
                if new_width > max_width:
                    new_width = max_width
                    new_height = int(original_height * (max_width / original_width))
                
                # High-quality resizing
                resized_img = grayscale_img.resize((new_width, new_height), Image.Resampling.LANCZOS)
                stats['new_sizes'].append((new_width, new_height))
                
                # Save as optimized PNG
                base_name = os.path.splitext(image_file)[0]
                output_path = os.path.join(output_dir, f"{base_name}.png")
                resized_img.save(output_path, 'PNG', optimize=True)
                
                # Copy corresponding text file
                txt_file = f"{base_name}.txt"
                txt_path = os.path.join(input_dir, txt_file)
                shutil.copy2(txt_path, output_dir)
                
                stats['processed'] += 1
                stats['widths'].append(new_width)
                stats['heights'].append(new_height)
                stats['aspect_ratios'].append(new_width / new_height)
                
        except Exception as e:
            print(f"Error with {image_file}: {str(e)}")
    
    return stats, output_dir

def analyze_transformation(stats):
    """Analyze the transformation results"""
    print("\n" + "="*60)
    print("TRANSFORMATION ANALYSIS")
    print("="*60)
    
    # Original statistics
    original_widths = [w for w, h in stats['original_sizes']]
    original_heights = [h for w, h in stats['original_sizes']]
    original_aspects = [w/h for w, h in stats['original_sizes']]
    
    print("üìä BEFORE Transformation:")
    print(f"   Width  - Min: {min(original_widths):4.0f}px | Max: {max(original_widths):4.0f}px | Avg: {np.mean(original_widths):6.1f}px")
    print(f"   Height - Min: {min(original_heights):4.0f}px | Max: {max(original_heights):4.0f}px | Avg: {np.mean(original_heights):6.1f}px")
    print(f"   Aspect - Min: {min(original_aspects):.3f} | Max: {max(original_aspects):.3f} | Avg: {np.mean(original_aspects):.3f}")
    print(f"   Unique sizes: {len(set(stats['original_sizes']))}")
    
    # New statistics
    print("\nüìä AFTER Transformation:")
    print(f"   Width  - Min: {min(stats['widths']):4.0f}px | Max: {max(stats['widths']):4.0f}px | Avg: {np.mean(stats['widths']):6.1f}px")
    print(f"   Height - Min: {min(stats['heights']):4.0f}px | Max: {max(stats['heights']):4.0f}px | Avg: {np.mean(stats['heights']):6.1f}px")
    print(f"   Aspect - Min: {min(stats['aspect_ratios']):.3f} | Max: {max(stats['aspect_ratios']):.3f} | Avg: {np.mean(stats['aspect_ratios']):.3f}")
    print(f"   Unique sizes: {len(set(stats['new_sizes']))}")
    
    # Size reduction analysis
    original_pixels = sum(w * h for w, h in stats['original_sizes'])
    new_pixels = sum(w * h for w, h in stats['new_sizes'])
    pixel_reduction = ((original_pixels - new_pixels) / original_pixels) * 100
    
    print(f"\nüíæ Size Reduction:")
    print(f"   Total pixels: {original_pixels:,} ‚Üí {new_pixels:,}")
    print(f"   Pixel reduction: {pixel_reduction:.1f}%")
    print(f"   Memory saving (grayscale): ~75% additional savings")

def show_size_distribution(stats):
    """Show distribution of new sizes"""
    size_counts = Counter(stats['new_sizes'])
    
    print(f"\nüìà Most Common New Sizes:")
    for (width, height), count in size_counts.most_common(10):
        percentage = (count / len(stats['new_sizes'])) * 100
        print(f"   {width:3d}x{height:2d}: {count:3d} images ({percentage:5.1f}%)")
    
    # Aspect ratio distribution
    aspect_bins = {
        'Very Narrow (<0.3)': 0,
        'Narrow (0.3-0.4)': 0,
        'Medium (0.4-0.6)': 0,
        'Wide (0.6-0.8)': 0,
        'Very Wide (>0.8)': 0
    }
    
    for aspect in stats['aspect_ratios']:
        if aspect < 0.3:
            aspect_bins['Very Narrow (<0.3)'] += 1
        elif aspect < 0.4:
            aspect_bins['Narrow (0.3-0.4)'] += 1
        elif aspect < 0.6:
            aspect_bins['Medium (0.4-0.6)'] += 1
        elif aspect < 0.8:
            aspect_bins['Wide (0.6-0.8)'] += 1
        else:
            aspect_bins['Very Wide (>0.8)'] += 1
    
    print(f"\nüìê Aspect Ratio Distribution:")
    for category, count in aspect_bins.items():
        if count > 0:
            percentage = (count / len(stats['aspect_ratios'])) * 100
            print(f"   {category}: {count:3d} images ({percentage:5.1f}%)")

def verify_data_integrity(original_dir, processed_dir):
    """Verify all files were processed correctly"""
    print(f"\n" + "="*60)
    print("DATA INTEGRITY CHECK")
    print("="*60)
    
    # Check image files
    original_images = set(f for f in os.listdir(original_dir) 
                         if not f.lower().endswith('.txt'))
    processed_images = set(f for f in os.listdir(processed_dir) 
                          if not f.lower().endswith('.txt'))
    
    # Check text files
    original_texts = set(f for f in os.listdir(original_dir) 
                        if f.lower().endswith('.txt'))
    processed_texts = set(f for f in os.listdir(processed_dir) 
                         if f.lower().endswith('.txt'))
    
    # Verify counts
    print(f"‚úÖ Images: {len(original_images)} ‚Üí {len(processed_images)}")
    print(f"‚úÖ Text files: {len(original_texts)} ‚Üí {len(processed_texts)}")
    
    # Verify matching pairs
    original_pairs = set(os.path.splitext(f)[0] for f in original_images)
    processed_pairs = set(os.path.splitext(f)[0] for f in processed_images)
    
    missing = original_pairs - processed_pairs
    if missing:
        print(f"‚ö†Ô∏è  Missing files: {len(missing)}")
    else:
        print(f"üéâ All 704 image-text pairs preserved!")

def main():
    # Define input directory (your organized good_data)
    input_dir = '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/good_data/'
    
    if not os.path.exists(input_dir):
        print(f"Error: Input directory not found: {input_dir}")
        return
    
    print(" Starting Grayscale Standardization")
    print("="*60)
    
    # Process images
    stats, output_dir = resize_images_grayscale_optimized(
        input_dir, 
        target_height=64, 
        max_width=512
    )
    
    # Analysis
    analyze_transformation(stats)
    show_size_distribution(stats)
    verify_data_integrity(input_dir, output_dir)
    
    print(f"\nTRANSFORMATION COMPLETE")
    print("="*60)
    print(f" Output: {output_dir}")

if __name__ == "__main__":
    main()

 Starting Grayscale Standardization
Processing 704 images ‚Üí Grayscale + Standardized
Target: Height=64px, Max Width=512px

TRANSFORMATION ANALYSIS
üìä BEFORE Transformation:
   Width  - Min:  436px | Max: 4961px | Avg: 1275.5px
   Height - Min:  605px | Max: 7016px | Avg: 2291.4px
   Aspect - Min: 0.263 | Max: 0.971 | Avg: 0.511
   Unique sizes: 500

üìä AFTER Transformation:
   Width  - Min:   16px | Max:   62px | Avg:   32.3px
   Height - Min:   64px | Max:   64px | Avg:   64.0px
   Aspect - Min: 0.250 | Max: 0.969 | Avg: 0.504
   Unique sizes: 32

üíæ Size Reduction:
   Total pixels: 3,702,530,594 ‚Üí 1,454,464
   Pixel reduction: 100.0%
   Memory saving (grayscale): ~75% additional savings

üìà Most Common New Sizes:
    45x64: 123 images ( 17.5%)
    32x64:  61 images (  8.7%)
    33x64:  55 images (  7.8%)
    26x64:  53 images (  7.5%)
    30x64:  44 images (  6.2%)
    31x64:  43 images (  6.1%)
    27x64:  40 images (  5.7%)
    28x64:  37 images (  5.3%)
    25x64:  29 

In [19]:
output_dir = "../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data"

In [3]:
os.listdir(output_dir)

['X51006557193.png',
 'X51005757235.png',
 'X51005677334.png',
 'X51006311764(1).txt',
 'X51005447861.png',
 'X51005200938.txt',
 'X51005605285.txt',
 'X51005719856.txt',
 'X51005717526.png',
 'X51007103675.txt',
 'X51006334926.txt',
 'X51007339135.png',
 'X51005442384.png',
 'X51007339111.png',
 'X51008164525.txt',
 'X51006556828.txt',
 'X51005719896.txt',
 'X51005711456.png',
 'X51006334926(3).txt',
 'X51005677332(1).png',
 'X51008142068.txt',
 'X51005605286(3).png',
 'X51006619862.txt',
 'X51005677339(2).txt',
 'X51006556831.txt',
 'X51008145450.txt',
 'X51005715451.png',
 'X51005442361.txt',
 'X51007339166.txt',
 'X51005677335.png',
 'X51005724627.txt',
 'X51005361950(1).png',
 'X51006557199.png',
 'X51006557213.txt',
 'X51007846307.txt',
 'X51005711456.txt',
 'X51007846301.txt',
 'X51008164525.png',
 'X51005663273.png',
 'X51005361907(1).png',
 'X51008099054.png',
 'X51006008057.png',
 'X51005663293.png',
 'X51007339167(1).txt',
 'X51007339166.png',
 'X51006557188.txt',
 'X5100638

In [20]:

image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff', '*.gif']

image_files = []

for ext in image_extensions:
    pattern = os.path.join(output_dir, ext)
    image_files.extend(glob.glob(pattern))

print("Total images:", len(image_files))


Total images: 704


In [21]:
image_files

['../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51006557193.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005757235.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005677334.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005447861.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005717526.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51007339135.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005442384.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51007339111.png',
 '../Data/0325updated.task1train(626p)-20251119T175414Z-1-001/grayscale_standardized_data/X51005711456.png',
 '../Data/0325updat

In [22]:
if len(image_files) == 0:
    print("No images found! Please check the directory path.")
else:
    # Analyze images
    print("\nAnalyzing images...")
    
    widths = []
    heights = []
    aspect_ratios = []
    modes = []
    file_sizes = []
    channels = []
    
    corrupted_files = []
    
    for img_path in image_files:
        try:
            # Get file size
            file_size = os.path.getsize(img_path) / 1024  
            file_sizes.append(file_size)
            
            # Open and analyze image
            img = Image.open(img_path)
            widths.append(img.width)
            heights.append(img.height)
            aspect_ratios.append(img.width / img.height)
            modes.append(img.mode)
            
            # Count channels
            if img.mode == 'RGB':
                channels.append(3)
            elif img.mode == 'RGBA':
                channels.append(4)
            elif img.mode == 'L':
                channels.append(1)
            else:
                channels.append(len(img.getbands()))
                
        except Exception as e:
            corrupted_files.append((os.path.basename(img_path), str(e)))
    
    # Statistics
    print("\n" + "="*60)
    print("IMAGE STATISTICS")
    print("="*60)
    
    print(f"\nüìê Dimensions:")
    print(f"   Width  - Min: {min(widths):4d}px | Max: {max(widths):4d}px | Avg: {np.mean(widths):.1f}px")
    print(f"   Height - Min: {min(heights):4d}px | Max: {max(heights):4d}px | Avg: {np.mean(heights):.1f}px")
    
    print(f"\nüìä Aspect Ratios:")
    print(f"   Min: {min(aspect_ratios):.3f} | Max: {max(aspect_ratios):.3f} | Avg: {np.mean(aspect_ratios):.3f}")
    
    # Check if dimensions are consistent
    unique_dims = set(zip(widths, heights))
    if len(unique_dims) == 1:
        print(f"\n‚úì All images have SAME dimensions: {widths[0]}x{heights[0]}")
    else:
        print(f"\n‚ö† Images have DIFFERENT dimensions ({len(unique_dims)} unique sizes)")
        print(f"   Most common sizes:")
        dim_counter = Counter(zip(widths, heights))
        for (w, h), count in dim_counter.most_common(5):
            print(f"     {w}x{h}: {count} images ({count/len(image_files)*100:.1f}%)")
    
    print(f"\nüé® Color Modes:")
    mode_counter = Counter(modes)
    for mode, count in mode_counter.most_common():
        print(f"   {mode}: {count} images ({count/len(image_files)*100:.1f}%)")
    
    print(f"\nüì¶ File Sizes:")
    print(f"   Min: {min(file_sizes):.1f} KB | Max: {max(file_sizes):.1f} KB | Avg: {np.mean(file_sizes):.1f} KB")
    
    if corrupted_files:
        print(f"\n‚ö† Corrupted/Problematic Images: {len(corrupted_files)}")
        for fname, error in corrupted_files[:5]:
            print(f"   - {fname}: {error}")
    else:
        print(f"\n‚úì All images loaded successfully!")
    
    # Recommendations
    print("\n" + "="*60)
    print("PREPROCESSING RECOMMENDATIONS")
    print("="*60)
    
    needs_resize = len(unique_dims) > 1
    needs_normalization = True
    needs_grayscale_conversion = 'RGB' in modes or 'RGBA' in modes
    
    print("\nüîß Required preprocessing steps:")
    
    step = 1
    if needs_resize:
        print(f"\n{step}. RESIZE IMAGES ‚ö†Ô∏è")
        print(f"   Issue: {len(unique_dims)} different image sizes")
        print(f"   Action: Resize all images to consistent dimensions")
        print(f"   Suggested size: {max(set(widths), key=widths.count)}x{max(set(heights), key=heights.count)} (most common)")
        step += 1
    else:
        print(f"\n{step}. RESIZE IMAGES ‚úì")
        print(f"   All images already have consistent size: {widths[0]}x{heights[0]}")
        step += 1
    
    print(f"\n{step}. NORMALIZATION")
    print(f"   Action: Normalize pixel values to [0, 1] or [-1, 1]")
    print(f"   Current: Pixel values likely in [0, 255]")
    step += 1
    
    if needs_grayscale_conversion:
        print(f"\n{step}. COLOR MODE CONVERSION (Optional)")
        print(f"   Current modes: {', '.join(mode_counter.keys())}")
        print(f"   Consider: Converting to grayscale if color not needed")
        print(f"   Benefits: Reduces model complexity, faster training")
        step += 1
    
    
    print(f"\n{step}. VERIFY IMAGE-TEXT PAIRS")
    print(f"   Action: Ensure each image has corresponding text file")
    print(f"   Images: {len(image_files)}")
    
    # Check for matching text files
    txt_files = glob.glob(os.path.join(output_dir, '*.txt')) 
    print(f"   Text files: {len(txt_files)}")
    
    if len(image_files) != len(txt_files):
        print(f"   ‚ö†Ô∏è MISMATCH: {abs(len(image_files) - len(txt_files))} files difference")
    else:
        print(f"   ‚úì Same number of images and text files")



Analyzing images...

IMAGE STATISTICS

üìê Dimensions:
   Width  - Min:   16px | Max:   62px | Avg: 32.3px
   Height - Min:   64px | Max:   64px | Avg: 64.0px

üìä Aspect Ratios:
   Min: 0.250 | Max: 0.969 | Avg: 0.504

‚ö† Images have DIFFERENT dimensions (32 unique sizes)
   Most common sizes:
     45x64: 123 images (17.5%)
     32x64: 61 images (8.7%)
     33x64: 55 images (7.8%)
     26x64: 53 images (7.5%)
     30x64: 44 images (6.2%)

üé® Color Modes:
   L: 704 images (100.0%)

üì¶ File Sizes:
   Min: 0.5 KB | Max: 2.6 KB | Avg: 1.1 KB

‚úì All images loaded successfully!

PREPROCESSING RECOMMENDATIONS

üîß Required preprocessing steps:

1. RESIZE IMAGES ‚ö†Ô∏è
   Issue: 32 different image sizes
   Action: Resize all images to consistent dimensions
   Suggested size: 45x64 (most common)

2. NORMALIZATION
   Action: Normalize pixel values to [0, 1] or [-1, 1]
   Current: Pixel values likely in [0, 255]

3. VERIFY IMAGE-TEXT PAIRS
   Action: Ensure each image has correspondin