# Overfitting y Underfitting

## Objetivos

- Comprender qué es el **overfitting** (sobreajuste) y el **underfitting** (subajuste)
- Identificar visualmente estos fenómenos en curvas de entrenamiento
- Aplicar técnicas para prevenir overfitting en PyTorch

## Importar librerías

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.datasets import make_friedman3
from tqdm import tqdm

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## Cargar y preparar datos

Usaremos un dataset sintético (Friedman #3, generado con `make_friedman3` de scikit-learn) y **crearemos un escenario propenso al overfitting**:
- Entrenaremos con solo el 20% de los datos (dataset pequeño → fácil overfitting)
- Usaremos modelos complejos
- Sin regularización inicialmente

In [None]:
def prepare_data(n_samples, train_ratio, batch_size, seed=42):
    """Build train/validation DataLoaders from a synthetic Friedman #3 dataset.

    Args:
        n_samples: Total number of samples to generate.
        train_ratio: Fraction of the samples used for training; the remaining
            rows become the validation set.
        batch_size: Batch size used by both loaders.
        seed: Seed for the data generation and for the train/validation split.

    Returns:
        A ``(train_loader, val_loader)`` tuple. Each loader wraps a
        ``TensorDataset`` of float32 features and targets (targets reshaped to
        ``(n, 1)``) placed on CUDA when available, otherwise on the CPU.
        Only the training loader shuffles.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): the noise-free Friedman #3 target is an arctan, so a noise
    # standard deviation of 40 dwarfs the signal - confirm this is intended.
    X, y = make_friedman3(n_samples=n_samples, noise=40, random_state=seed)

    # Min-max scale each feature with the sampling ranges documented for
    # make_friedman3; feature 1 is drawn from [40*pi, 560*pi].
    feature_min = np.array([0.0, 40.0 * np.pi, 0.0, 1.0])
    feature_max = np.array([100.0, 560.0 * np.pi, 1.0, 11.0])
    X_scaled = (X - feature_min) / (feature_max - feature_min)

    dataset = pd.DataFrame(
        {f"X_{i}": X_scaled[:, i] for i in range(X_scaled.shape[-1])}
    )
    dataset["y"] = y

    # Use the caller's seed for the split too, so `seed` controls the whole pipeline.
    X_train = dataset.sample(frac=train_ratio, random_state=seed)
    X_val = dataset.drop(X_train.index)

    # Split features from the target column.
    y_train = X_train.pop("y")
    y_val = X_val.pop("y")

    def _features(frame):
        # float32 feature matrix on the target device
        return torch.tensor(frame.to_numpy(), dtype=torch.float32, device=device)

    def _target(series):
        # float32 column vector (n, 1), the layout nn.MSELoss is compared against
        return torch.tensor(
            series.to_numpy(), dtype=torch.float32, device=device
        ).reshape(-1, 1)

    train_dataset = TensorDataset(_features(X_train), _target(y_train))
    val_dataset = TensorDataset(_features(X_val), _target(y_val))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    print(f"Train set: {X_train.shape[0]} muestras")
    print(f"Val set: {X_val.shape[0]} muestras")

    return train_loader, val_loader

In [None]:
# Experiment configuration.
NSAMPLES = 1000    # total number of generated samples
TRAIN_RATIO = 0.2  # fraction of the samples used for training
BATCH_SIZE = 64    # mini-batch size for both loaders

train_loader, val_loader = prepare_data(
    n_samples=NSAMPLES,
    train_ratio=TRAIN_RATIO,
    batch_size=BATCH_SIZE,
)

## Demostración 1: Underfitting vs Overfitting (sin regularización)

Entrenaremos dos modelos:
1. **Modelo pequeño** (entrada → 8 → 1): Probablemente underfitting
2. **Modelo grande** (entrada → 256 → 256 → 1): Probablemente overfitting

In [None]:
class SmallModel(nn.Module):
    """Deliberately small regressor used to observe underfitting.

    One hidden layer of 8 ReLU units followed by a single linear output unit.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_units = 8
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        output = self.net(x)
        return output

class LargeModel(nn.Module):
    """Over-parameterized regressor used to provoke overfitting.

    Two hidden layers of 256 ReLU units followed by a single linear output unit.
    """

    def __init__(self, input_size):
        super().__init__()
        widths = (input_size, 256, 256)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        output = self.net(x)
        return output

# Build both models on the same device as the training tensors.
input_size = train_loader.dataset.tensors[0].shape[1]
device = train_loader.dataset.tensors[0].device
small = SmallModel(input_size).to(device)
large = LargeModel(input_size).to(device)

# Report the number of parameters of each network
# (output strings restored from mis-encoded UTF-8).
print(f"Parámetros modelo pequeño: {sum(p.numel() for p in small.parameters())}")
print(f"Parámetros modelo grande: {sum(p.numel() for p in large.parameters())}")

In [None]:
def train_model(model, train_loader, val_loader, epochs=200, lr=0.01, early_stopping=False,
                patience=20, min_delta=0.001, restore_best_weights=False):
    """Train ``model`` with mini-batch SGD on an MSE loss and record per-epoch metrics.

    Args:
        model: Network to train; it is updated in place.
        train_loader: Iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation batches.
        epochs: Maximum number of epochs.
        lr: Learning rate for SGD.
        early_stopping: If True, stop once the validation loss has not improved
            for ``patience`` consecutive epochs.
        patience: Number of epochs to wait without improvement before stopping.
        min_delta: Minimum decrease of the validation loss that counts as an
            improvement.
        restore_best_weights: If True (and ``early_stopping`` is True), reload
            the weights from the epoch with the best validation loss before
            returning. Defaults to False, which leaves the final weights.

    Returns:
        A dict with the lists ``'train_loss'``, ``'val_loss'``, ``'train_mae'``
        and ``'val_mae'``, one entry per completed epoch. Each entry is a
        sample-weighted average over the corresponding loader.
    """
    loss_fcn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    history = {'train_loss': [], 'val_loss': [], 'train_mae': [], 'val_mae': []}
    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in tqdm(range(epochs), desc="Training loop"):
        model.train()
        train_loss, train_mae = _run_epoch(model, train_loader, loss_fcn, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_mae = _run_epoch(model, val_loader, loss_fcn)

        # The epoch metrics are already Python floats (accumulated through
        # .item()), so they are stored directly.
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_mae'].append(train_mae)
        history['val_mae'].append(val_mae)

        if early_stopping:
            if val_loss < best_val_loss - min_delta:
                best_val_loss = val_loss
                patience_counter = 0
                if restore_best_weights:
                    # Clone: state_dict() returns references to the live parameters.
                    best_state = {
                        key: value.detach().clone()
                        for key, value in model.state_dict().items()
                    }
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping en época {epoch}")
                    break

    if best_state is not None:
        model.load_state_dict(best_state)

    return history


def _run_epoch(model, loader, loss_fcn, optimizer=None):
    """Run one pass over ``loader``; return ``(mean_loss, mean_mae)`` as floats.

    When ``optimizer`` is given, each batch also performs a gradient step.
    Batch means are weighted by batch size so a short final batch does not
    skew the epoch average. An empty loader yields ``(nan, nan)``.
    """
    loss_sum = 0.0
    mae_sum = 0.0
    n_seen = 0
    for X_batch, y_batch in loader:
        y_pred = model(X_batch)
        loss = loss_fcn(y_pred, y_batch)

        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        n_batch = y_batch.shape[0]
        loss_sum += loss.item() * n_batch
        mae_sum += torch.abs(y_pred.detach() - y_batch).mean().item() * n_batch
        n_seen += n_batch

    if n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_seen, mae_sum / n_seen

In [None]:
# Training hyperparameters shared by the experiments in this notebook.
EPOCHS: int = 400  # maximum number of training epochs
LR: float = 0.01   # optimizer learning rate

In [None]:
# --- Small model (moderate underfitting expected) ---
small = SmallModel(input_size).to(device)
history_small = train_model(
    small, train_loader, val_loader,
    epochs=EPOCHS, lr=LR, early_stopping=False,
)
print(f"\nTrain loss final: {history_small['train_loss'][-1]:.4f}")
print(f"Val loss final: {history_small['val_loss'][-1]:.4f}")

# --- Large model (overfitting expected) ---
large = LargeModel(input_size).to(device)
history_large = train_model(
    large, train_loader, val_loader,
    epochs=EPOCHS, lr=LR, early_stopping=False,
)
print(f"\nTrain loss final: {history_large['train_loss'][-1]:.4f}")
print(f"Val loss final: {history_large['val_loss'][-1]:.4f}")

In [None]:
# Visualize underfitting vs. overfitting through the loss curves of both models.
# (User-facing strings restored from mis-encoded UTF-8; the two panels share one loop.)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

loss_panels = [
    # small model (underfitting)
    (history_small, 'Modelo Pequeño (UNDERFITTING)\nAmbas pérdidas altas y similares'),
    # large model (overfitting)
    (history_large, 'Modelo Grande (OVERFITTING)\nVal loss diverge de train loss'),
]
for panel_ax, (panel_history, panel_title) in zip(axes, loss_panels):
    panel_ax.plot(panel_history['train_loss'], label='Train Loss', linewidth=2, color="c")
    panel_ax.plot(panel_history['val_loss'], label='Val Loss', linewidth=2, color="r")
    panel_ax.set_xlabel('Época', fontsize=12)
    panel_ax.set_ylabel('MSE Loss', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Final gap between validation and training loss for each model.
print("\n📊 OBSERVACIONES:")
print(f"Modelo pequeño: brecha final = {history_small['val_loss'][-1] - history_small['train_loss'][-1]:.4f}")
print(f"Modelo grande: brecha final = {history_large['val_loss'][-1] - history_large['train_loss'][-1]:.4f}")
print("\n➡️ El modelo grande tiene OVERFITTING (brecha > 0)")
print("➡️ El modelo pequeño tiene UNDERFITTING (ambas pérdidas altas)")

In [None]:
# Parity plot: true vs. predicted targets on the training data.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# The training tensors only exist inside the loader's TensorDataset
# (prepare_data returns loaders, not the raw tensors).
X_train, y_train = train_loader.dataset.tensors

small.eval()
large.eval()
with torch.no_grad():
    y_pred_small = small(X_train).cpu().numpy().flatten()
    y_pred_large = large(X_train).cpu().numpy().flatten()

y_train_np = y_train.cpu().numpy().flatten()
# Common axis limits so the y = x diagonal spans both panels.
lims = [min(y_train_np.min(), y_pred_large.min(), y_pred_small.min()),
        max(y_train_np.max(), y_pred_large.max(), y_pred_small.max())]

parity_panels = [
    # small model (underfitting)
    (y_pred_small, 'blue', 'Modelo Pequeño (UNDERFITTING)'),
    # large model (overfitting)
    (y_pred_large, 'red', 'Modelo Grande (OVERFITTING)'),
]
for panel_ax, (panel_pred, panel_color, panel_title) in zip(axes, parity_panels):
    panel_ax.scatter(y_train_np, panel_pred, alpha=0.6, s=50, color=panel_color, label='Predicciones')
    panel_ax.plot(lims, lims, 'k--', linewidth=2, label='y = x')
    panel_ax.set_xlabel('y real', fontsize=12)
    panel_ax.set_ylabel('y predicho', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n📊 INTERPRETACIÓN:")
print("- Puntos cercanos a la diagonal indican buenas predicciones.")
print("- Dispersión alta indica underfitting o overfitting.")


## Técnica 1: Early Stopping

### ¿Qué es Early Stopping?

**Early Stopping** detiene automáticamente el entrenamiento cuando la pérdida de validación (validation loss) deja de mejorar.

**Ventajas:**
- ✅ Simple de implementar
- ✅ Muy efectivo para prevenir overfitting
- ✅ Ahorra tiempo de entrenamiento
- ✅ No añade complejidad computacional

**Parámetros clave:**
- `patience`: Cuántas épocas esperar sin mejora antes de parar
- `min_delta`: Cambio mínimo para contar como "mejora"
- `restore_best_weights`: Restaurar los mejores pesos (no los finales)

**Referencia:** [`torch.optim` - PyTorch Optimizers](https://pytorch.org/docs/stable/optim.html)

In [None]:
# Train a fresh large model with early stopping enabled.
print("\n=== Entrenando modelo GRANDE con Early Stopping ===")
# `device` is the name defined earlier in the notebook (there is no `DEVICE`),
# and train_model expects the two DataLoaders rather than raw tensors.
large_es = LargeModel(input_size).to(device)
history_large_es = train_model(large_es, train_loader, val_loader,
                               epochs=EPOCHS, lr=LR, early_stopping=True, patience=20)

print(f"\nTrain loss final: {history_large_es['train_loss'][-1]:.4f}")
print(f"Val loss final: {history_large_es['val_loss'][-1]:.4f}")
print(f"Épocas de entrenamiento: {len(history_large_es['train_loss'])} (de {EPOCHS})")

In [None]:
# Compare the large model trained with and without early stopping.
# (User-facing strings restored from mis-encoded UTF-8.)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

es_panels = [
    # without early stopping: no stop marker
    (history_large, 'SIN Early Stopping\nVal loss continúa aumentando', None),
    # with early stopping: mark the last recorded epoch
    (history_large_es, 'CON Early Stopping\nSe detiene cuando val_loss no mejora',
     len(history_large_es['val_loss']) - 1),
]
for panel_ax, (panel_history, panel_title, stop_epoch) in zip(axes, es_panels):
    panel_ax.plot(panel_history['train_loss'], label='Train Loss', linewidth=2)
    panel_ax.plot(panel_history['val_loss'], label='Val Loss', linewidth=2)
    if stop_epoch is not None:
        # Drawn before legend() so the marker gets its own legend entry.
        panel_ax.axvline(stop_epoch, color='red', linestyle='--', linewidth=2, label='Parada')
    panel_ax.set_xlabel('Época', fontsize=12)
    panel_ax.set_ylabel('MSE Loss', fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ Early Stopping evita que el modelo continúe empeorando en validación")

## Técnica 2: Dropout

### ¿Qué es Dropout?

**Dropout** desactiva aleatoriamente una fracción de neuronas durante el entrenamiento.

**Intuición:**
- Previene que el modelo se vuelva codependiente de ciertas neuronas
- Fuerza al modelo a aprender características redundantes y robustas
- Es como entrenar múltiples modelos débiles en paralelo

**Características:**
- Solo se aplica durante entrenamiento (`.train()`)
- Durante evaluación, todas las neuronas están activas (`.eval()`)
- Típicamente 20-50% de tasa de dropout

**Referencia:** [`torch.nn.Dropout`](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)

In [None]:
class LargeModelWithDropout(nn.Module):
    """MLP regressor with dropout after every hidden activation, used to curb overfitting.

    Args:
        input_size: Number of input features.
        dropout_rate: Drop probability passed to every ``nn.Dropout`` layer.
        hidden_sizes: Widths of the hidden layers, in order. The default keeps
            the 128 -> 64 -> 32 stack; pass ``(256, 256)`` to match the layer
            widths of ``LargeModel`` so that dropout is the only difference
            when comparing the two.
    """

    def __init__(self, input_size, dropout_rate=0.3, hidden_sizes=(128, 64, 32)):
        super().__init__()
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            # Linear -> ReLU -> Dropout for each hidden layer
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout_rate)]
            in_features = width
        # Single-unit linear output, no dropout after it
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        return self.net(x)

# Train the dropout model under the same settings as the unregularized runs.
print("\n=== Entrenando modelo GRANDE con Dropout (p=0.3) ===")
# `device` is the name defined earlier in the notebook (there is no `DEVICE`),
# and train_model expects the two DataLoaders rather than raw tensors.
large_dropout = LargeModelWithDropout(input_size, dropout_rate=0.3).to(device)
history_large_dropout = train_model(large_dropout, train_loader, val_loader,
                                    epochs=EPOCHS, lr=LR, early_stopping=False)

print(f"\nTrain loss final: {history_large_dropout['train_loss'][-1]:.4f}")
print(f"Val loss final: {history_large_dropout['val_loss'][-1]:.4f}")

In [None]:
# Compare the loss curves without and with dropout.
# (User-facing strings restored from mis-encoded UTF-8.)
fig, ax = plt.subplots(figsize=(12, 5))

# Solid lines: large model without dropout.
ax.plot(history_large['train_loss'], label='Modelo Grande - Train', linewidth=2, linestyle='-')
ax.plot(history_large['val_loss'], label='Modelo Grande - Val (OVERFITTING)', linewidth=2, linestyle='-')

# Dashed lines: model with dropout.
ax.plot(history_large_dropout['train_loss'], label='Con Dropout - Train', linewidth=2, linestyle='--', alpha=0.8)
ax.plot(history_large_dropout['val_loss'], label='Con Dropout - Val', linewidth=2, linestyle='--', alpha=0.8)

ax.set_xlabel('Época', fontsize=12)
ax.set_ylabel('MSE Loss', fontsize=12)
ax.set_title('Efecto del Dropout en Overfitting', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Final gap between validation and training loss for each run.
print("\n✅ Dropout reduce la brecha entre train y validation loss")
print(f"Sin Dropout - brecha final: {history_large['val_loss'][-1] - history_large['train_loss'][-1]:.4f}")
print(f"Con Dropout - brecha final: {history_large_dropout['val_loss'][-1] - history_large_dropout['train_loss'][-1]:.4f}")

## Técnica 3: Regularización L1/L2

### ¿Qué es Regularización?

**Regularización** añade un término de penalización a la función de pérdida basado en el tamaño de los pesos.

**L2 (Ridge) Regularization:**
$$\text{Loss}_{total} = \text{Loss}_{original} + \lambda \sum_i w_i^2$$

- Penaliza pesos grandes
- Favorece pesos pequeños pero no necesariamente cero
- Muy común en deep learning

**L1 (Lasso) Regularization:**
$$\text{Loss}_{total} = \text{Loss}_{original} + \lambda \sum_i |w_i|$$

- Tiende a hacer algunos pesos exactamente cero
- Útil para selección de características

**En PyTorch:**
- `weight_decay` en el optimizador implementa L2 regularization
- Se aplica automáticamente a todos los parámetros entrenables

**Referencia:** [`torch.optim.Adam` - weight_decay parameter](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html)

In [None]:
def train_model_with_regularization(model, X_train, y_train, X_val, y_val, epochs=200, lr=0.01, weight_decay=0.0):
    """Train ``model`` full-batch with Adam, exposing ``weight_decay`` for L2 regularization.

    Every epoch performs one optimizer step on the whole ``X_train``/``y_train``
    tensors and then evaluates on ``X_val``/``y_val``.

    Args:
        model: Network to train; it is updated in place.
        X_train, y_train: Training inputs and targets.
        X_val, y_val: Validation inputs and targets.
        epochs: Number of epochs (one optimizer step each).
        lr: Learning rate for Adam.
        weight_decay: L2 regularization coefficient (typically 0.0001 - 0.01).

    Returns:
        A dict with the lists ``'train_loss'``, ``'val_loss'``, ``'train_mae'``
        and ``'val_mae'``, one entry per epoch.
    """
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    history = {key: [] for key in ('train_loss', 'val_loss', 'train_mae', 'val_mae')}

    for _ in tqdm(range(epochs), desc="Training loop"):
        # Optimization step on the full training set.
        model.train()
        train_pred = model(X_train)
        train_loss = mse(train_pred, y_train)

        adam.zero_grad()
        train_loss.backward()
        adam.step()

        # Validation metrics; the train MAE reuses the predictions made before the step.
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val)
            val_loss = mse(val_pred, y_val)

            train_mae = torch.abs(train_pred - y_train).mean().item()
            val_mae = torch.abs(val_pred - y_val).mean().item()

        epoch_metrics = {
            'train_loss': train_loss.item(),
            'val_loss': val_loss.item(),
            'train_mae': train_mae,
            'val_mae': val_mae,
        }
        for key, value in epoch_metrics.items():
            history[key].append(value)

    return history

# Train the large model with L2 regularization (weight decay).
print("\n=== Modelo GRANDE con L2 Regularization (weight_decay=0.001) ===")
# `device` is the name defined earlier in the notebook (there is no `DEVICE`).
large_l2 = LargeModel(input_size).to(device)

# train_model_with_regularization takes full tensors, which only exist inside
# the loaders' TensorDatasets (prepare_data returns loaders, not tensors).
X_train, y_train = train_loader.dataset.tensors
X_val, y_val = val_loader.dataset.tensors
history_large_l2 = train_model_with_regularization(large_l2, X_train, y_train, X_val, y_val,
                                                    epochs=EPOCHS, lr=LR, weight_decay=0.001)

print(f"\nTrain loss final: {history_large_l2['train_loss'][-1]:.4f}")
print(f"Val loss final: {history_large_l2['val_loss'][-1]:.4f}")

In [None]:
# Compare validation loss without and with L2 regularization.
# (User-facing strings restored from mis-encoded UTF-8.)
fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(history_large['val_loss'], label='Sin Regularización', linewidth=2.5, marker='o', markersize=3, alpha=0.7)
ax.plot(history_large_l2['val_loss'], label='L2 Regularization (λ=0.001)', linewidth=2.5, marker='s', markersize=3, alpha=0.7)

ax.set_xlabel('Época', fontsize=12)
ax.set_ylabel('Validation Loss', fontsize=12)
ax.set_title('Efecto de L2 Regularization en Validation Loss', fontsize=13, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ L2 Regularization suaviza las curvas y reduce overfitting")

## Comparación: Todas las técnicas

Resumamos el rendimiento de todas las técnicas en el conjunto de validación (este notebook no reserva un conjunto de test aparte).

In [None]:
# Evaluate every trained model on the held-out validation data.
models_to_eval = {
    'Pequeño (Underfitting)': (small, 'orange'),
    'Grande (Overfitting)': (large, 'red'),
    'Grande + Early Stopping': (large_es, 'green'),
    'Grande + Dropout': (large_dropout, 'blue'),
    'Grande + L2': (large_l2, 'purple'),
}

# No separate test split exists (prepare_data only builds train and validation
# loaders), so the validation tensors serve as the evaluation set.
X_eval, y_eval = val_loader.dataset.tensors
y_eval_np = y_eval.cpu().numpy().flatten()

results = {}

for name, (model, _color) in models_to_eval.items():
    model.eval()
    with torch.no_grad():
        y_pred_eval = model(X_eval).cpu().numpy().flatten()

    results[name] = {
        'MAE': mean_absolute_error(y_eval_np, y_pred_eval),
        'MSE': mean_squared_error(y_eval_np, y_pred_eval),
        'R2': r2_score(y_eval_np, y_pred_eval),
    }

# Results table: one row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\n" + "="*70)
print("RENDIMIENTO EN CONJUNTO DE VALIDACIÓN")
print("="*70)
print(results_df.to_string())
print("="*70)

## Conclusiones

### Resumen de técnicas para prevenir Overfitting

| Técnica | Ventajas | Desventajas | Cuándo usar |
|---------|----------|------------|-------------|
| **Early Stopping** | Simple, efectivo, rápido | Requiere conjunto de validación | Casi siempre ✅ |
| **Dropout** | Muy efectivo, robusto | Aumenta tiempo entrenamiento | Modelos profundos/complejos |
| **L1/L2 Regularización** | Suave, controlable | Hiperparámetro a ajustar | Modelos simples/medianos |
| **Reducir complejidad** | Soluciona el problema raíz | Puede causar underfitting | Si el modelo es muy grande |
| **Más datos** | Solución ideal | Costoso/difícil de obtener | Si es posible |

### Recomendaciones prácticas

1. **Siempre usar Early Stopping** - Es prácticamente gratis y muy efectivo
2. **Monitorear train vs val loss** - La brecha indica overfitting
3. **Combinar técnicas** - Early Stopping + Dropout + L2 es lo más robusto
4. **Ajustar hiperparámetros** - `patience`, `dropout_rate`, `weight_decay`
5. **Usar conjunto de validación** - Crítico para detectar overfitting temprano

### Conceptos clave

- **Underfitting:** Modelo demasiado simple → Solución: más complejidad
- **Overfitting:** Modelo memoriza ruido → Solución: regularización
- **Trade-off:** Balance entre bias y varianza
- **Generalización:** El verdadero objetivo del machine learning