In [None]:
# Report whether PyTorch can see a CUDA GPU and, if so, which one.
import torch

has_gpu = torch.cuda.is_available()
print(f"GPU disponible: {has_gpu}")
if has_gpu:
    # Index 0 is the first CUDA device visible to this process
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# Only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Instalar dependencias
!pip install SimpleITK scikit-learn

In [None]:
import copy
import json
import random
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.utils.data import DataLoader, Subset
from tqdm import tqdm

# Configuration: device selection and training hyperparameters
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
RANDOM_SEED = 42
BATCH_SIZE = 32
NUM_EPOCHS = 50
LEARNING_RATE = 0.001
PATIENCE = 10  # epochs without val_loss improvement tolerated before early stopping
K_FOLDS = 5

# Seed every global RNG the pipeline may touch, not only torch's:
# data-augmentation code may draw from Python's `random` or NumPy's global
# generator, and leaving those unseeded makes runs non-reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
print(f"Device: {DEVICE}")

In [None]:
# Configurar proyecto en Colab: clonar repo y preparar imports
import os, sys, shutil
from pathlib import Path

# Ruta del dataset en Drive (fija)
DATASET_PATH = Path('/content/drive/MyDrive/dataset_node21')

# Clonar repo en Colab si no existe
if not Path('/content/aprendizaje_automatico').exists():
    !git clone https://github.com/joacoesperon/aprendizaje_automatico

# Ruta del proyecto en Colab
PROJECT_PATH = Path('/content/aprendizaje_automatico')
SRC_IN_COLAB = PROJECT_PATH / 'src'

# Asegurar que src est√© en el PYTHONPATH
sys.path.insert(0, '/content/aprendizaje_automatico')
sys.path.insert(0, '/content')

from src.models import SimpleCNN
from src.data_loader import NODE21Dataset, get_train_transforms, get_val_test_transforms
from src.evaluate import evaluate_model, calculate_metrics

print(f"Repo listo en: {PROJECT_PATH}")
print(f"Dataset en Drive: {DATASET_PATH}")

In [None]:
# Load the metadata CSV from the Drive dataset and build a stratified
# train/test split at image level.
processed_dir = DATASET_PATH / 'cxr_images' / 'proccessed_data'
METADATA_FILE = processed_dir / 'metadata.csv'
IMAGE_DIR = processed_dir / 'images'

meta = pd.read_csv(METADATA_FILE)
# Note: this counts metadata rows, before de-duplication
print(f"Total imagenes: {len(meta)}")

# Collapse repeated (img_name, label) rows so each pair is split only once
unique_imgs = meta.loc[:, ['img_name', 'label']].drop_duplicates()

# 80/20 split, stratified by label and reproducible through RANDOM_SEED
split_options = dict(
    test_size=0.20,
    stratify=unique_imgs['label'],
    random_state=RANDOM_SEED,
)
train_imgs, test_imgs = train_test_split(unique_imgs, **split_options)
print(f"Train: {len(train_imgs)}, Test: {len(test_imgs)}")

In [None]:
# Build the train and test datasets from their respective splits
def _make_dataset(split_frame, transform):
    """Wrap one split (a frame with 'img_name' and 'label' columns) in a NODE21Dataset."""
    return NODE21Dataset(
        image_dir=IMAGE_DIR,
        image_names=split_frame['img_name'].tolist(),
        labels=split_frame['label'].tolist(),
        transform=transform,
    )


# The training split gets the training transforms, the test split the val/test ones
train_dataset = _make_dataset(train_imgs, get_train_transforms())
test_dataset = _make_dataset(test_imgs, get_val_test_transforms())

In [None]:
# K-fold cross-validation over the training split.
# Each fold trains a fresh SimpleCNN with early stopping on validation loss;
# the weights of the fold with the lowest validation loss end up in
# `best_fold_model` (a state_dict), and that loss in `best_fold_val_loss`.
kfold = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_SEED)
best_fold_model = None
best_fold_val_loss = float('inf')

# Validation view of the training images: same images and order as
# train_dataset, but with the val/test transforms. Validating through
# train_dataset would apply the training transforms to the held-out fold
# (presumably random augmentation; confirm in src.data_loader), which makes
# val_loss noisy and not comparable with the test-time pipeline.
val_source_dataset = NODE21Dataset(
    image_dir=IMAGE_DIR,
    image_names=train_imgs['img_name'].tolist(),
    labels=train_imgs['label'].tolist(),
    transform=get_val_test_transforms()
)

# kfold.split yields positional indices into train_imgs, which is the same
# order used to build train_dataset / val_source_dataset.
for fold, (train_idx, val_idx) in enumerate(kfold.split(train_imgs, train_imgs['label'])):
    print(f"\n{'='*50}")
    print(f"Fold {fold + 1}/{K_FOLDS}")
    print(f"{'='*50}")

    train_subset = Subset(train_dataset, train_idx)
    val_subset = Subset(val_source_dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

    # Fresh model and optimizer per fold so folds do not share learned weights
    model = SimpleCNN(num_classes=2).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(NUM_EPOCHS):
        # Training
        model.train()
        train_loss = 0.0
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; weight by batch size so the
            # epoch figure is a per-sample mean even with a short last batch
            train_loss += loss.item() * images.size(0)
        train_loss = train_loss / len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                _, predicted = outputs.max(1)
                correct += predicted.eq(labels).sum().item()
        val_loss = val_loss / len(val_loader.dataset)
        val_acc = correct / len(val_loader.dataset)

        print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy is required: state_dict() returns references to the
            # live parameter tensors, so a shallow dict.copy() would keep
            # changing as training continues and the "best" weights would
            # silently become the last weights.
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore this fold's best epoch (None only if no epoch ever improved)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    if best_val_loss < best_fold_val_loss:
        best_fold_val_loss = best_val_loss
        # best_model_state is already an independent snapshot of these weights
        best_fold_model = best_model_state
        print(f"Nuevo mejor fold (val_loss={best_val_loss:.4f})")

In [None]:
# Evaluate the best cross-validation model on the held-out test set
if best_fold_model is None:
    # Happens when no fold ever improved on the initial infinite loss
    # (e.g. every validation loss was NaN); fail with a clear message
    # instead of an opaque load_state_dict error.
    raise RuntimeError("best_fold_model is None: no fold produced a usable validation loss")

final_model = SimpleCNN(num_classes=2).to(DEVICE)
final_model.load_state_dict(best_fold_model)
# A freshly constructed nn.Module is in training mode; switch to inference
# mode explicitly so the test metrics do not depend on evaluate_model doing it.
final_model.eval()

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
y_true, y_pred, y_probs = evaluate_model(final_model, test_loader, DEVICE)
metrics = calculate_metrics(y_true, y_pred, y_probs)

print("\nMetricas en Test Set:")
for metric_name, value in metrics.items():
    print(f"  {metric_name}: {value:.4f}")

In [None]:
# Persist the final weights and the test metrics under the project checkout
save_dir = PROJECT_PATH / 'models' / 'SimpleCNN'
save_dir.mkdir(parents=True, exist_ok=True)

# Model weights (state_dict only, not the full module)
weights_path = save_dir / 'best_model.pth'
torch.save(final_model.state_dict(), weights_path)

# Test-set metrics as indented JSON next to the weights
metrics_path = save_dir / 'metrics.json'
with metrics_path.open('w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print(f"Modelo guardado en: {save_dir}")