# Baixando dados

In [16]:
# Load the project-local dataset helper.
from get_data_from_db import DataLoader

# Build the loader used by the rest of this notebook.
data_loader = DataLoader()

# Dataset root directory as exposed by the loader.
path = data_loader.path
print(f"Dataset path: {path}")

# Fetch the test and train listings (same call order as before).
test_data = data_loader.get_test_data()
train_data = data_loader.get_train_data()

# Report how many entries each split holds.
n_test, n_train = len(test_data), len(train_data)
print(f"Number of test images: {n_test}")
print(f"Number of train images: {n_train}")

# Preview at most the first five entries of each split.
test_preview = test_data if n_test <= 5 else test_data[:5]
train_preview = train_data if n_train <= 5 else train_data[:5]
print(f"Sample test files: {test_preview}")
print(f"Sample train files: {train_preview}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/francismon/curated-colon-dataset-for-deep-learning?dataset_version_number=1...


100%|██████████| 1.41G/1.41G [02:38<00:00, 9.55MB/s]

Extracting files...





Dataset downloaded/cached at: C:\Users\mathe\.cache\kagglehub\datasets\francismon\curated-colon-dataset-for-deep-learning\versions\1
Dataset path: C:\Users\mathe\.cache\kagglehub\datasets\francismon\curated-colon-dataset-for-deep-learning\versions\1
Number of test images: 200
Number of train images: 3200
Sample test files: ['test_normal_ (1).jpg', 'test_normal_ (10).jpg', 'test_normal_ (100).jpg', 'test_normal_ (101).jpg', 'test_normal_ (102).jpg']
Sample train files: [('train_normal_ (1).jpg', 0), ('train_normal_ (10).jpg', 0), ('train_normal_ (100).jpg', 0), ('train_normal_ (101).jpg', 0), ('train_normal_ (102).jpg', 0)]


Os arquivos já vêm separados em pastas de treino, validação e teste.

Obtendo os arquivos de teste e de treino por categoria

In [19]:
# List every category folder directly from disk, starting at the dataset
# root `path` obtained from the DataLoader in the first cell.
import os


def _list_category(split, folder):
    """Return ``(directory, filenames)`` for ``<path>/<split>/<folder>``."""
    directory = os.path.join(path, split, folder)
    return directory, os.listdir(directory)


# Test split: only the "0_normal" folder is listed here.
test_normal_path, test_normal_files = _list_category("test", "0_normal")

# Train split: one folder per class.
train_normal_path, train_normal_files = _list_category("train", "0_normal")
train_ulcerative_colitis_path, train_ulcerative_colitis_files = _list_category(
    "train", "1_ulcerative_colitis"
)
train_polyps_path, train_polyps_files = _list_category("train", "2_polyps")
train_esophagitis_path, train_esophagitis_files = _list_category("train", "3_esophagitis")

# Per-category summary, one line per folder.
print("Dataset Summary:")
for title, files in (
    ("Test Normal", test_normal_files),
    ("Train Normal", train_normal_files),
    ("Train Ulcerative Colitis", train_ulcerative_colitis_files),
    ("Train Polyps", train_polyps_files),
    ("Train Esophagitis", train_esophagitis_files),
):
    print(f"{title}: {len(files)} images")

# Aggregate counts per split and overall.
total_test = len(test_normal_files)
total_train = sum(
    len(files)
    for files in (
        train_normal_files,
        train_ulcerative_colitis_files,
        train_polyps_files,
        train_esophagitis_files,
    )
)
print(f"\nTotal Test Images: {total_test}")
print(f"Total Train Images: {total_train}")
print(f"Total Dataset: {total_test + total_train} images")

Dataset Summary:
Test Normal: 200 images
Train Normal: 800 images
Train Ulcerative Colitis: 800 images
Train Polyps: 800 images
Train Esophagitis: 800 images

Total Test Images: 200
Total Train Images: 3200
Total Dataset: 3400 images


In [22]:
# Complete dataset analysis including validation data
import importlib

# `importlib.reload` needs the module object itself. The earlier
# `from get_data_from_db import DataLoader` binds only `DataLoader`, so without
# this import the reload below raises NameError after a kernel restart.
import get_data_from_db

importlib.reload(get_data_from_db)
from get_data_from_db import DataLoader  # re-bind the name to the reloaded class

# Create a new DataLoader instance from the reloaded module
enhanced_loader = DataLoader()

# Class names, indexed by integer label (used for lookups further down)
category_names = ['normal', 'ulcerative_colitis', 'polyps', 'esophagitis']

# Per-folder image counts, keyed as '<split>_<category>' (e.g. 'train_polyps')
counts = enhanced_loader.get_category_counts()
print("Complete Dataset Counts:")
for category, count in counts.items():
    print(f"  {category}: {count} images")

# Calculate totals by split; only 'test_normal' is read for the test split
test_total = counts['test_normal']
train_total = sum(counts[f'train_{name}'] for name in category_names)
val_total = sum(counts[f'val_{name}'] for name in category_names)
total_images = test_total + train_total + val_total

print("\nDataset Split Summary:")
print(f"  Training: {train_total} images")
print(f"  Validation: {val_total} images")
print(f"  Testing: {test_total} images")
print(f"  Total: {total_images} images")

# Get validation data organized by category
validation_by_category = enhanced_loader.get_validation_data()
print(f"\nValidation data categories: {list(validation_by_category.keys())}")

# Get validation data as (filename, label) pairs
validation_with_labels = enhanced_loader.get_validation_data_with_labels()
print(f"Total validation samples with labels: {len(validation_with_labels)}")
print("Sample validation data with labels:")
for i in range(min(5, len(validation_with_labels))):
    filename, label = validation_with_labels[i]
    print(f"  {filename} -> {category_names[label]} (label: {label})")

# Show distribution across categories
print("\nCategory Distribution:")
for category in category_names:
    train_count = counts[f'train_{category}']
    val_count = counts[f'val_{category}']
    # Only the 'normal' class has a test count in `counts`; others report 0
    test_count = counts['test_normal'] if category == 'normal' else 0
    print(f"  {category.upper()}:")
    print(f"    Train: {train_count}, Val: {val_count}, Test: {test_count}")
    print(f"    Total: {train_count + val_count + test_count}")

Dataset downloaded/cached at: C:\Users\mathe\.cache\kagglehub\datasets\francismon\curated-colon-dataset-for-deep-learning\versions\1
Complete Dataset Counts:
  test_normal: 200 images
  train_normal: 800 images
  train_ulcerative_colitis: 800 images
  train_polyps: 800 images
  train_esophagitis: 800 images
  val_normal: 500 images
  val_ulcerative_colitis: 500 images
  val_polyps: 500 images
  val_esophagitis: 500 images

Dataset Split Summary:
  Training: 3200 images
  Validation: 2000 images
  Testing: 200 images
  Total: 5400 images

Validation data categories: ['normal', 'ulcerative_colitis', 'polyps', 'esophagitis']
Total validation samples with labels: 2000
Sample validation data with labels:
  val_normal_ (1).jpg -> normal (label: 0)
  val_normal_ (10).jpg -> normal (label: 0)
  val_normal_ (100).jpg -> normal (label: 0)
  val_normal_ (101).jpg -> normal (label: 0)
  val_normal_ (102).jpg -> normal (label: 0)

Category Distribution:
  NORMAL:
    Train: 800, Val: 500, Test: 200

# Análise Completa do Dataset

O dataset está organizado em três divisões:

## Estrutura do Dataset:
- **Treino (train/)**: 3.200 imagens (800 por categoria)
- **Validação (val/)**: 2.000 imagens (500 por categoria)  
- **Teste (test/)**: 200 imagens (apenas categoria normal)

## Categorias:
1. **Normal (0)**: Imagens normais do cólon
2. **Ulcerative Colitis (1)**: Colite ulcerativa
3. **Polyps (2)**: Pólipos
4. **Esophagitis (3)**: Esofagite

## Total: 5.400 imagens

**Nota**: O conjunto de teste contém apenas imagens da categoria "normal", enquanto treino e validação contêm todas as 4 categorias balanceadas.

In [23]:
# Example: feeding the DataLoader splits into a machine-learning workflow.
# Relies on `enhanced_loader` and `category_names` from the previous cell.
import numpy as np


def _label_line(lbl, n_samples):
    """Format one row of a label-distribution listing."""
    return f"  Label {lbl} ({category_names[lbl]}): {n_samples} samples"


# Labelled train/validation samples plus the test file listing.
train_data_with_labels = enhanced_loader.get_train_data()
val_data_with_labels = enhanced_loader.get_validation_data_with_labels()
test_data_files = enhanced_loader.get_test_data()  # normal images only

print("Data for Machine Learning:")
print(f"Training samples: {len(train_data_with_labels)} (with labels)")
print(f"Validation samples: {len(val_data_with_labels)} (with labels)")
print(f"Test samples: {len(test_data_files)} (normal only)")

# Keep only the label from each (filename, label) pair.
train_labels = [lbl for (_fname, lbl) in train_data_with_labels]
val_labels = [lbl for (_fname, lbl) in val_data_with_labels]

# Count how often each label occurs in the training split.
print("\nLabel distribution in training:")
unique_labels, train_counts = np.unique(train_labels, return_counts=True)
for pair in zip(unique_labels, train_counts):
    print(_label_line(*pair))

# Same tally for the validation split (`unique_labels` is rebound here).
print("\nLabel distribution in validation:")
unique_labels, val_counts = np.unique(val_labels, return_counts=True)
for pair in zip(unique_labels, val_counts):
    print(_label_line(*pair))

# Example: per-category file listings for the 'normal' class.
normal_train_files = enhanced_loader.get_train_data_by_category()['normal']
normal_val_files = enhanced_loader.get_validation_data()['normal']

train_head = normal_train_files[:3]
val_head = normal_val_files[:3]
print("\nExample file paths for 'normal' category:")
print(f"  Train path: {enhanced_loader.train_normal_path}")
print(f"  Validation path: {enhanced_loader.validation_normal_path}")
print(f"  Sample files: {train_head} (train), {val_head} (val)")

Data for Machine Learning:
Training samples: 3200 (with labels)
Validation samples: 2000 (with labels)
Test samples: 200 (normal only)

Label distribution in training:
  Label 0 (normal): 800 samples
  Label 1 (ulcerative_colitis): 800 samples
  Label 2 (polyps): 800 samples
  Label 3 (esophagitis): 800 samples

Label distribution in validation:
  Label 0 (normal): 500 samples
  Label 1 (ulcerative_colitis): 500 samples
  Label 2 (polyps): 500 samples
  Label 3 (esophagitis): 500 samples

Example file paths for 'normal' category:
  Train path: C:\Users\mathe\.cache\kagglehub\datasets\francismon\curated-colon-dataset-for-deep-learning\versions\1\train\0_normal
  Validation path: C:\Users\mathe\.cache\kagglehub\datasets\francismon\curated-colon-dataset-for-deep-learning\versions\1\val\0_normal
  Sample files: ['train_normal_ (1).jpg', 'train_normal_ (10).jpg', 'train_normal_ (100).jpg'] (train), ['val_normal_ (1).jpg', 'val_normal_ (10).jpg', 'val_normal_ (100).jpg'] (val)
