
# Enhanced EDA for Image Classification Dataset

This notebook performs an enhanced exploratory data analysis (EDA) for an image classification dataset. The following analyses are included:
1. **Basic Insights**: Checking image dimensions, aspect ratios, and pixel intensity distributions.
2. **Dataset Quality Checks**: Checking for missing data, assessing class balance, and detecting outliers.
3. **Advanced Visualizations**: PCA and t-SNE projections for feature-space visualization.
4. **Data Augmentation Preview**: Demonstrating applied augmentations on sample images.
5. **Correlation Analysis**: Analyzing metadata and class label correlations.
6. **Class Heatmaps**: Visualizing average pixel intensity per class.


In [1]:
import tensorflow as tf
import numpy as np

# Report the versions of the core numeric libraries in use, one per line.
for label, module in (("TensorFlow version:", tf), ("NumPy version:", np)):
    print(label, module.__version__)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/py

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/py

AttributeError: _ARRAY_API not found

ImportError: numpy.core._multiarray_umath failed to import

ImportError: numpy.core.umath failed to import

TypeError: Unable to convert function return value to a Python type! The signature was
	() -> handle

In [6]:

# Import required libraries
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/py

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [4]:

# Define dataset paths. The root defaults to '/dataset' and can be overridden
# without editing the notebook by setting the EDA_DATASET_ROOT environment variable.
DATASET_ROOT = os.environ.get('EDA_DATASET_ROOT', '/dataset')
train_images_dir = os.path.join(DATASET_ROOT, 'train')
test_images_dir = os.path.join(DATASET_ROOT, 'test')

# Warn early: the analysis cells below list these directories with os.listdir,
# which raises FileNotFoundError when a directory does not exist.
for _images_dir in (train_images_dir, test_images_dir):
    if not os.path.isdir(_images_dir):
        print(f"WARNING: dataset directory not found: {_images_dir}")



In [5]:

# Analyze image dimensions and aspect ratios
def analyze_image_dimensions(image_dir):
    """Collect the pixel dimensions and aspect ratio of every image in a directory.

    Args:
        image_dir: Path of a directory containing image files.

    Returns:
        A tuple ``(dimensions, aspect_ratios)`` of two parallel lists:
        ``dimensions`` holds ``(width, height)`` tuples and ``aspect_ratios``
        holds the matching ``width / height`` values, in sorted filename order.
        Sub-directories and files PIL cannot open are skipped.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist (raised by ``os.listdir``).
    """
    dimensions = []
    aspect_ratios = []
    # Sort so results are reproducible; os.listdir returns entries in arbitrary order.
    for image_name in sorted(os.listdir(image_dir)):
        image_path = os.path.join(image_dir, image_name)
        # Skip anything that is not a regular file (e.g. nested folders).
        if not os.path.isfile(image_path):
            continue
        try:
            # Image.open keeps the underlying file open; the context manager
            # releases the handle as soon as the size has been read.
            with Image.open(image_path) as image:
                width, height = image.size
        except OSError as exc:
            # PIL reports unreadable / non-image files with OSError subclasses.
            print(f"Skipping unreadable image {image_path}: {exc}")
            continue
        dimensions.append((width, height))
        aspect_ratios.append(width / height)
    return dimensions, aspect_ratios

# Gather dimension statistics for both dataset splits.
train_dimensions, train_aspect_ratios = analyze_image_dimensions(train_images_dir)
test_dimensions, test_aspect_ratios = analyze_image_dimensions(test_images_dir)

# Overlay the train and test aspect-ratio histograms on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_aspect_ratios, kde=True, label='Train Aspect Ratios', color='blue', bins=30, ax=ax)
sns.histplot(test_aspect_ratios, kde=True, label='Test Aspect Ratios', color='orange', bins=30, ax=ax)
ax.set_title("Aspect Ratio Distribution")
ax.set_xlabel("Aspect Ratio (Width/Height)")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/dataset/train'

In [None]:

# Report the columns of `metadata` that contain missing values.
# NOTE(review): `metadata` is not defined in any cell shown in this notebook;
# presumably it is a pandas DataFrame loaded elsewhere — confirm before running.
print("Checking for missing data...")
missing_data = metadata.isnull().sum()
has_missing = missing_data > 0
print(missing_data[has_missing])

# Visualize class balance, ordering classes from most to least frequent.
if 'labels' not in metadata.columns:
    print("No labels found in metadata for class balance analysis.")
else:
    label_order = metadata['labels'].value_counts().index
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(y=metadata['labels'], order=label_order, ax=ax)
    ax.set_title('Class Distribution')
    ax.set_xlabel('Number of Images')
    ax.set_ylabel('Classes')
    fig.tight_layout()
    plt.show()


In [None]:

# Analyze pixel intensity distributions
def pixel_intensity_distribution(image_dir, limit=500):
    """Collect grayscale pixel intensities from the images in a directory.

    Args:
        image_dir: Path of a directory containing image files.
        limit: Maximum number of files to read (default 500, for efficiency).
            ``None`` reads every file.

    Returns:
        A flat list with one entry per pixel of every image that was read,
        taken in sorted filename order. Sub-directories and files OpenCV
        cannot decode are skipped.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist (raised by ``os.listdir``).
    """
    # Sort for reproducibility and keep regular files only, then apply the limit.
    candidate_paths = (
        os.path.join(image_dir, image_name)
        for image_name in sorted(os.listdir(image_dir))
    )
    image_paths = [path for path in candidate_paths if os.path.isfile(path)][:limit]

    intensities = []
    for image_path in image_paths:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            print(f"Skipping unreadable image: {image_path}")
            continue
        # ravel yields the same pixel sequence as flatten, avoiding a copy where possible.
        intensities.extend(image.ravel())
    return intensities

# Sample pixel intensities from both dataset splits.
train_intensities = pixel_intensity_distribution(train_images_dir)
test_intensities = pixel_intensity_distribution(test_images_dir)

# Overlay the train and test intensity histograms on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_intensities, kde=True, label='Train Intensities', color='blue', bins=50, ax=ax)
sns.histplot(test_intensities, kde=True, label='Test Intensities', color='orange', bins=50, ax=ax)
ax.set_title("Pixel Intensity Distribution")
ax.set_xlabel("Pixel Intensity (0-255)")
ax.set_ylabel("Frequency")
ax.legend()
plt.show()


In [None]:

# Extract features for T-SNE and PCA (placeholder logic; replace with actual feature extraction)
def extract_features(image_dir, limit=500):
    """Build a raw-pixel feature matrix from the images in a directory.

    Each image is read as grayscale, resized to 64x64 and flattened into a
    4096-element row. This is placeholder logic; replace it with real feature
    extraction when available.

    Args:
        image_dir: Path of a directory containing image files.
        limit: Maximum number of files to read (default 500). ``None`` reads
            every file.

    Returns:
        A 2-D NumPy array with one row per image that was read, in sorted
        filename order. When no image could be read the result has shape
        ``(0, 4096)`` so downstream code still receives a 2-D matrix.

    Raises:
        FileNotFoundError: If ``image_dir`` does not exist (raised by ``os.listdir``).
    """
    target_size = (64, 64)

    # Sort for reproducibility and keep regular files only, then apply the limit.
    candidate_paths = (
        os.path.join(image_dir, image_name)
        for image_name in sorted(os.listdir(image_dir))
    )
    image_paths = [path for path in candidate_paths if os.path.isfile(path)][:limit]

    features = []
    for image_path in image_paths:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None; cv2.resize would crash on it.
        if image is None:
            print(f"Skipping unreadable image: {image_path}")
            continue
        features.append(cv2.resize(image, target_size).flatten())

    if not features:
        # np.array([]) would be 1-D; keep the (n_samples, n_features) layout.
        # dtype assumes 8-bit grayscale reads — TODO confirm for the dataset.
        return np.empty((0, target_size[0] * target_size[1]), dtype=np.uint8)
    return np.array(features)

train_features = extract_features(train_images_dir)
test_features = extract_features(test_images_dir)

# Apply PCA
# Fit the projection on the training features only, then reuse that same
# projection for the test features. Calling fit_transform on the test set would
# learn a second, unrelated set of axes, and the two point clouds could not be
# compared on one plot.
pca = PCA(n_components=2)
train_pca = pca.fit_transform(train_features)
test_pca = pca.transform(test_features)

plt.figure(figsize=(12, 6))
plt.scatter(train_pca[:, 0], train_pca[:, 1], label='Train', alpha=0.5, s=5)
plt.scatter(test_pca[:, 0], test_pca[:, 1], label='Test', alpha=0.5, s=5, color='orange')
plt.title('PCA Visualization')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

# Apply T-SNE
# Embed the training features in two dimensions; the fixed random_state keeps
# the layout repeatable between runs.
tsne = TSNE(n_components=2, random_state=42)
train_tsne = tsne.fit_transform(train_features)

fig, ax = plt.subplots(figsize=(12, 6))
tsne_x, tsne_y = train_tsne[:, 0], train_tsne[:, 1]
ax.scatter(tsne_x, tsne_y, label='Train', alpha=0.5, s=5)
ax.set_title('T-SNE Visualization')
ax.legend()
plt.show()


In [None]:

# Preview data augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Define augmentations
augmentations = A.Compose([
    A.RandomRotate90(),
    A.HorizontalFlip(),
    A.VerticalFlip(),
    A.RandomBrightnessContrast(),
    A.GaussianBlur(),
])

# Apply augmentations to a sample image.
# Sort the listing so the same sample is chosen on every run, and fail with a
# clear message when there is nothing usable to preview.
sample_image_names = sorted(os.listdir(train_images_dir))
if not sample_image_names:
    raise FileNotFoundError(f"No files found in {train_images_dir!r} to preview augmentations.")
sample_image_path = os.path.join(train_images_dir, sample_image_names[0])
sample_image = cv2.imread(sample_image_path)
# cv2.imread returns None (rather than raising) when the file cannot be decoded.
if sample_image is None:
    raise ValueError(f"Could not read sample image: {sample_image_path}")

plt.figure(figsize=(15, 10))
for i in range(6):
    # Each call draws a fresh random combination of the transforms above.
    augmented = augmentations(image=sample_image)['image']
    plt.subplot(2, 3, i + 1)
    # OpenCV loads images as BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(augmented, cv2.COLOR_BGR2RGB))
    plt.axis('off')
plt.suptitle("Augmented Images")
plt.tight_layout()
plt.show()
