# Exploratory Data Analysis
In this notebook, we perform an exploratory data analysis (EDA) of the cervical cancer dataset. The goal is to understand the structure of the data, the distribution of the classes, and to visualize some of the images to make sure that the dataset is well organized. This analysis will also help us identify potential problems before training the model.

In [None]:
# Imports
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from torchvision import transforms
from torchvision.datasets import ImageFolder
from PIL import Image

In [None]:
# Root directory that holds the train/val/test split folders
data_dir = "D:/Data/cervical_cancer_data"

# Build one ImageFolder per split, in train -> val -> test order
train_dataset, val_dataset, test_dataset = (
    ImageFolder(root=os.path.join(data_dir, split))
    for split in ("train", "val", "test")
)

# Report the detected classes and the size of every split
print(f"Detected classes: {train_dataset.classes}")
for split_label, split_dataset in (
    ("train", train_dataset),
    ("validation", val_dataset),
    ("test", test_dataset),
):
    print(f"Total images in {split_label}: {len(split_dataset)}")

In [None]:
# Show random images from the training folder.
# random.sample draws without replacement, so the same image is never shown
# twice; the sample size is capped in case fewer than 5 images are available.
num_preview = min(5, len(train_dataset.imgs))
sample_images = random.sample(train_dataset.imgs, k=num_preview)

for img_path, label in sample_images:
    # The context manager releases the underlying file handle after display
    with Image.open(img_path) as img:
        plt.imshow(img)
        plt.title(f"Clase: {train_dataset.classes[label]}")
        plt.show()


In [None]:
def plot_class_distribution(dataset, title):
    """Draw a bar chart with the number of images per class.

    Args:
        dataset: Object exposing ``classes`` (sequence of class names) and
            ``imgs`` (iterable of ``(path, class_index)`` pairs).
        title: Text used as the chart title.
    """
    # One counter slot per class, indexed by the class index stored in ``imgs``
    images_per_class = [0 for _ in dataset.classes]
    for _path, class_idx in dataset.imgs:
        images_per_class[class_idx] += 1

    _fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        x=dataset.classes,
        y=images_per_class,
        hue=dataset.classes,
        palette="viridis",
        legend=False,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel('Classes')
    ax.set_ylabel('Number of images')
    plt.xticks(rotation=45)
    plt.show()

# Plot the class distribution for each split
for split_dataset, plot_title in (
    (train_dataset, "Train class distribution"),
    (val_dataset, "Validation class distribution"),
    (test_dataset, "Test class distribution"),
):
    plot_class_distribution(split_dataset, plot_title)


In [None]:
def get_image_statistics(dataset, num_samples=100):
    """Print and plot basic statistics on image dimensions.

    Reads the size of up to ``num_samples`` distinct images drawn at random
    from ``dataset.imgs``, prints the mean and standard deviation of their
    widths and heights, and shows both distributions as overlaid histograms.

    Args:
        dataset: Object exposing ``imgs``, a list of ``(image_path, label)``
            pairs.
        num_samples: Maximum number of images to inspect. Capped at the
            number of images available.
    """
    # Sample without replacement so duplicated images cannot skew the
    # statistics, and never ask for more images than the dataset holds.
    sample_size = max(0, min(num_samples, len(dataset.imgs)))
    if sample_size == 0:
        print("No images available to analyse.")
        return
    sample_images = random.sample(dataset.imgs, k=sample_size)

    widths, heights = [], []
    for img_path, _ in sample_images:
        # The context manager releases the file handle as soon as the size
        # has been read.
        with Image.open(img_path) as img:
            width, height = img.size
        widths.append(width)
        heights.append(height)

    print(f"Media de ancho: {np.mean(widths):.2f}, Desviación estándar: {np.std(widths):.2f}")
    print(f"Media de alto: {np.mean(heights):.2f}, Desviación estándar: {np.std(heights):.2f}")

    plt.figure(figsize=(12, 5))
    sns.histplot(widths, kde=True, color="blue", label="Width")
    sns.histplot(heights, kde=True, color="orange", label="Height")
    plt.title("Width and Height Distribution of Images")
    plt.xlabel('Size (px)')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

# Get image-size statistics for the train dataset
get_image_statistics(train_dataset)


# Conclusion
- The dataset contains 5 classes distributed in a perfectly balanced manner.
- All sampled images have a resolution of 512x512, which suits the ResNet50 model once the preprocessing transforms are applied.
- The folder structure is correct and no formatting errors were detected.