In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import random
from PIL import Image
from collections import Counter
from tqdm import tqdm
import imagehash

# Directory that holds the image dataset (relative path, resolved against the current working directory)
data_dir = "../data"

# Load all images from the directory and display a random sample

In [None]:
def get_images(path):
    """
    List the image files located directly inside a directory.

    Only regular files whose name ends with one of the accepted extensions
    (compared case-insensitively) are returned. Sub-directories are not
    traversed.

    Parameters:
    path (str): Directory path containing images.

    Return:
    image_paths (list): Sorted list with the image paths (``path`` joined
        with each matching file name).
    """
    # Extensions include the leading dot so that names such as "fakejpg" or
    # "archivepng" are not mistaken for image files.
    valid_extensions = ('.jpg', '.jpeg', '.png')
    image_paths = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result reproducible between runs.
    for img_file in sorted(os.listdir(path)):
        if not img_file.lower().endswith(valid_extensions):
            continue
        full_path = os.path.join(path, img_file)
        # Skip directories that merely carry an image-like name.
        if os.path.isfile(full_path):
            image_paths.append(full_path)
    return image_paths

# Collect the paths of all images found in the dataset directory
image_paths = get_images(data_dir)

# Build a one-column DataFrame ("image_path") from the collected paths
df = pd.DataFrame({"image_path": image_paths})
print("Total images:", len(df), "\n")
print(df.head())

In [None]:
# Display random images for visual inspection
def show_random_images(df, num_images=6, sample_pool=1000):
    """
    Displays a set of random images from the dataset.

    Parameters:
    df (DataFrame): DataFrame containing the image paths in an "image_path" column.
    num_images (int): Number of images to display. When fewer candidates are
        available, all of them are shown.
    sample_pool (int): Only the first `sample_pool` rows of `df` are candidates
        for the random draw.
    """
    candidates = list(df["image_path"][:sample_pool])
    # random.sample raises ValueError when asked for more items than exist,
    # so the request is capped at the number of candidates.
    count = min(num_images, len(candidates))
    if count <= 0:
        # Nothing to draw; a figure with zero columns cannot be created.
        return
    random_imgs = random.sample(candidates, count)
    # squeeze=False always returns a 2-D array of Axes, so a single image
    # (count == 1) can be handled by the same loop.
    fig, axes = plt.subplots(1, count, figsize=(30, 25), squeeze=False)
    for img_path, ax in zip(random_imgs, axes.flatten()):
        # The context manager releases the file handle once the image is drawn.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(os.path.basename(img_path))
        ax.axis("off")
    plt.show()

# Show five randomly chosen images from the dataset
show_random_images(df, 5)

# Image size analysis

In [None]:
# Gather the (width, height) of a sample of images
size_records = []
sample_paths = df["image_path"][:1000]  # Analyze a sample to avoid overprocessing
for img_path in tqdm(sample_paths):
    with Image.open(img_path) as img:
        size_records.append(img.size)

# Tabulate the collected sizes for analysis
sizes = pd.DataFrame(size_records, columns=["width", "height"])

# Overlay the width and height distributions on a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))
for column, label, color in (("width", "Width", "blue"), ("height", "Height", "red")):
    sns.histplot(sizes[column], bins=30, kde=True, label=label, color=color, ax=ax)
ax.legend()
ax.set_title("Distribución de tamaños de imagen")
ax.set_xlabel("Pixeles")
ax.set_ylabel("Frecuencia")
plt.show()

## Interpretation of a Graph with Variations and Peaks

If you notice that the graph shows multiple variations with four distinct peaks, this may indicate several things about the distribution of image sizes in your dataset.

### Interpretation of the Peaks:

1. **Modes**:
   - Each peak in the graph represents a "mode" in the size distribution. This means that there are several image sizes that are more common in your dataset.
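   - Because widths and heights are plotted as separate histograms, each peak marks a common width or height value. For example, if there is a peak near 300 pixels, another near 800 pixels, and so on, this suggests that there are groups of images that share similar dimensions.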

2. **Image Categories**:
   - The peaks may indicate that the images come from different sources or categories. For example, if you are working with a dataset that includes product images, portraits, and landscapes, each type may have a typical size reflected in the peaks.

3. **Quality and Resolution**:
   - Variations in sizes may also be related to image quality and resolution. High-quality images may have larger dimensions, while low-quality images may be smaller.

4. **Possible Issues**:
   - If the peaks are very pronounced and there is a large number of images at extreme sizes, this could indicate an issue in data collection, such as images that were not properly resized.

### Conclusion:
The presence of multiple peaks in the graph suggests that your dataset is heterogeneous in terms of image sizes. This can be useful for segmenting the data or applying different processing techniques based on image size. Additionally, you might consider normalizing or resizing the images if you plan to conduct analyses that require consistent sizes.


# Color histogram to analyze the distribution of RGB values

In [None]:
def plot_color_histogram(image_path, index):
    """
    Generates an RGB color histogram for a given image.

    Parameters:
    image_path (str): Path of the image to be analyzed.
    index (int): Index of the image in the dataset. It is not used by the
        plot; the parameter is kept so existing calls keep working.

    Raises:
    OSError: If OpenCV cannot read the image at `image_path`.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")
    # OpenCV loads channels as BGR; convert so channel i matches colors[i].
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    colors = ("r", "g", "b")
    plt.figure(figsize=(9, 6))
    for i, color in enumerate(colors):
        # 256 bins over [0, 256): one bin per intensity value of channel i.
        hist = cv2.calcHist([img], [i], None, [256], [0, 256])
        plt.plot(hist, color=color)

    plt.title(f"Histograma de colores\n(File: {image_path})")
    plt.xlabel("Intensidad de color")
    plt.ylabel("Frecuencia")
    plt.show()

random_index = random.randint(0, min(10000, len(image_paths) - 1))  # Pick a random index (randint bounds are inclusive), capped at 10000
plot_color_histogram(image_paths[random_index], random_index)  # Plot the histogram of the randomly chosen image

## Interpretation of the Color Histogram

The graph generated by the code is a color histogram that shows the distribution of RGB (red, green, and blue) values for a specific image. Here’s how to interpret it:

### 1. Graph Axes
- **X-Axis (Color Intensity)**: This axis represents the intensity of colors, ranging from 0 to 255. A value of 0 means no presence of that color, while 255 indicates maximum intensity.
- **Y-Axis (Frequency)**: This axis shows how many pixels in the image have a specific color intensity.

### 2. Color Curves
- The graph has three curves, each representing one of the primary colors: red, green, and blue.
- The height of each curve at a specific point on the X-axis indicates how many pixels have that color intensity.

### 3. Interpretation of the Curves
- **Peaks in the Curve**: If there are high peaks in a particular color curve, it means that many pixels in the image have that color intensity. For example, a high peak near the right end of the red curve (close to 255) indicates many pixels with a high red value.
- **Color Distribution**: The general shape of the curves can give you an idea of the image's color palette. If the red curve has a high peak while the others are lower, the image may have a reddish tone.

### 4. Image Analysis
- You can use the histogram to understand the image’s lighting and contrast. For example, if all curves are concentrated on the left side, the image may be dark. If they are evenly distributed, the image may have a good dynamic range.

### Conclusion
The color histogram is a useful tool for analyzing an image’s color composition. It helps identify dominant colors and their distribution, which can be valuable for image processing tasks such as color correction or segmentation.


## Interpretation and Necessary Processing

If you notice that the color histograms for different images are quite different, with curves that look distinct and peaks in various parts of the graph, this may indicate several things about the images and their content. Let’s explore what this could mean and some recommendations on how to proceed:

### Interpretation of Different Histograms

1. **Variety in Lighting**:
   - Differences in histograms may indicate that the images were taken under different lighting conditions. For example, an overexposed image may have peaks on the right, while an underexposed one may have peaks on the left.

2. **Different Image Sources**:
   - If the images come from different sources or categories (e.g., portraits, landscapes, products), they likely have different color palettes and lighting characteristics.

3. **Image Quality**:
   - Low-quality or highly compressed images may have histograms with a reduced dynamic range, meaning that color values are more concentrated in certain areas.

### Adjustments and Corrections

1. **Color Correction**:
   - You can apply color correction techniques to adjust lighting and color balance, including:
     - **Brightness and Contrast Adjustment**: Increasing brightness and contrast can help enhance detail visibility in underexposed images.
     - **White Balance Adjustment**: Adjusting white balance can help correct unwanted color tones.

2. **Normalization**:
   - Consider normalizing the images so they all have a similar range of color values. This can be useful for comparative analysis.

3. **Segmentation**:
   - If the images represent different categories or types of content, segmentation may be useful. You can classify the images into groups based on their color characteristics and then apply different processing techniques to each group.

4. **Histogram Adjustments**:
   - You can apply histogram equalization techniques to improve image contrast. This redistributes color intensity values so they occupy a broader range.

### Work Strategy

1. **Preliminary Analysis**:
   - Conduct an initial analysis of the histograms to identify significant patterns and differences.

2. **Apply Corrections**:
   - Apply the necessary corrections to the images based on the analysis results. This may include brightness, contrast, and color adjustments.

3. **Re-evaluation**:
   - After applying corrections, reassess the histograms to ensure the images now have more consistent characteristics.

4. **Documentation**:
   - Document the changes made and the results obtained for future reference and process improvement.

### Conclusion
Variability in color histograms can provide valuable insights into the capture conditions and quality of the images. By applying appropriate corrections and segmentations, you can enhance image quality and facilitate further analysis.



---

# Aspect Ratio Analysis

In [None]:
# Aspect ratio = width divided by height, for every sampled image
sizes["aspect_ratio"] = sizes["width"].div(sizes["height"])

# Histogram (with KDE) of the aspect ratios
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(sizes["aspect_ratio"], bins=30, kde=True, color="purple", ax=ax)
ax.set_title("Distribución del Aspect Ratio de las imágenes")
ax.set_xlabel("Aspect Ratio")
ax.set_ylabel("Frecuencia")
plt.show()


## Interpretation of the Aspect Ratio Distribution Chart

1. **Graph Axes**:
   - **X-Axis (Aspect Ratio)**: This axis represents the aspect ratio of the images, calculated as width divided by height. A value of 1 indicates a square image, values less than 1 indicate images that are taller than they are wide, and values greater than 1 indicate images that are wider than they are tall.
   - **Y-Axis (Frequency)**: This axis shows how many images have a specific aspect ratio.

2. **Distribution Curve**:
   - The shape of the curve in the graph indicates how aspect ratios are distributed in your dataset.
   - If the curve has a peak at a specific value, it means that many images share that aspect ratio. For example, a peak at 16:9 (about 1.78) would indicate that many of your images are in widescreen format, which is common in videos and photographs.

3. **Interpretation of Peaks**:
   - **Peaks in the Curve**: If there are multiple peaks, this may indicate different categories of images in your dataset. For example, portrait images have a lower aspect ratio (below 1, taller than wide), while landscape images have a higher aspect ratio (above 1, wider than tall).
   - **Uniform Distribution**: If the curve is relatively flat, this suggests a variety of aspect ratios in the images, indicating a diverse dataset.

4. **Image Quality Analysis**:
   - Aspect ratio can also influence the visual quality of images. For example, images with unusual aspect ratios may require cropping or adjustments to be used in certain contexts, such as presentations or publications.

### Conclusion
The aspect ratio distribution chart provides valuable insights into the dimensions of the images in your dataset. By analyzing this distribution, you can identify patterns and characteristics that may be useful for image processing, data segmentation, or preparing images for use in different applications.


# Duplicate Analysis with Perceptual Hashing

- Compare image histograms to find possible duplicates.
- Use perceptual hashing to find similar images.

In [None]:
def find_duplicates(image_paths):
    """
    Identify duplicate images using perceptual hashing.

    Two images are reported as duplicates when their average hash is
    identical. Files that cannot be opened or decoded are reported and
    skipped, so a single bad file does not abort the whole scan.

    Parameters:
    image_paths (list): List of image paths.

    Return:
    duplicates (list): List of (duplicate_path, first_path) tuples, where
        first_path is the first image seen with the same hash.
    """
    hashes = {}  # hash string -> first path seen with that hash
    duplicates = []
    for img_path in tqdm(image_paths):
        try:
            with Image.open(img_path) as img:
                h = str(imagehash.average_hash(img))
        except OSError:
            # Pillow documents its unreadable-image errors as OSError subclasses.
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"Skipping unreadable image: {img_path}")
            continue
        if h in hashes:
            duplicates.append((img_path, hashes[h]))
        else:
            hashes[h] = img_path
    return duplicates

# Run duplicate detection over every collected image and report the counts
print(f"Total de imágenes que vamos a analizar: {len(image_paths)}")
duplicates = find_duplicates(image_paths)
print(f"Total de imágenes duplicadas detectadas: {len(duplicates)}")

# Corrupt Image Detection

In [None]:

def detect_corrupt_images(image_paths):
    """
    Detects corrupted images by attempting to open and verify them with PIL.

    Parameters:
    image_paths (list): List of image paths.

    Returns:
    corrupt_images (list): List of images that could not be opened or that
        failed verification.
    """
    corrupt_images = []
    for img_path in tqdm(image_paths):
        try:
            with Image.open(img_path) as img:
                img.verify()  # Check if the image is corrupted
        # The previous handler named UnidentifiedImageError, which is never
        # imported in this notebook, so the first corrupt file raised NameError.
        # Pillow documents UnidentifiedImageError as a subclass of OSError, so
        # catching OSError covers it without a new import.
        # NOTE(review): some Pillow format plugins appear to raise SyntaxError
        # from verify() for broken files - confirm against the installed version.
        except (OSError, SyntaxError):
            corrupt_images.append(img_path)
    return corrupt_images

# Run corruption detection over every collected image and report the counts
print(f"Total number of images we are going to analyze: {len(image_paths)}")
corrupt_images = detect_corrupt_images(image_paths)
print(f"Total corrupt images detected: {len(corrupt_images)}")