In [None]:
# %% [markdown]
# # Actividad 2 - Machine Learning Supervisado
# ## Análisis completo del dataset Heart Disease
# %%
# Importaciones
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                            classification_report, roc_curve, auc)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tensorflow import keras

# Configuration
plt.style.use('ggplot')
# Seed both RNGs used by this notebook. NumPy's seed alone does not cover
# TensorFlow, which draws the Keras weight initialisation and dropout masks
# from its own generator; without the second call the neural-network results
# change on every run.
np.random.seed(42)
tf.random.set_seed(42)
# %%
# =============================================================================
# 1. DATA LOADING AND EXPLORATION
# =============================================================================
print("1. CARGANDO Y EXPLORANDO DATOS...")
# NOTE(review): the path is relative to the working directory the notebook is
# run from — confirm it matches where the `results/` output is expected.
df = pd.read_csv('../data/heart.csv')
print(f"Dataset shape: {df.shape}")
print("\nInformación del dataset:")
# DataFrame.info() writes its summary to stdout and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(f"\nValores faltantes:\n{df.isnull().sum()}")
print(f"\nEstadísticas descriptivas:\n{df.describe()}")
# %%
# =============================================================================
# 2. PREPROCESSING (Item 4)
# =============================================================================
print("\n2. PREPROCESANDO DATOS...")

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Train/test split FIRST, on the unscaled features, so that the held-out rows
# cannot influence any statistic learned during preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling: the mean/std are estimated on the training rows only and then
# applied unchanged to the test rows. Fitting on the full dataset would leak
# test-set information into training and make the reported metrics optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so that any
# later cell that still refers to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

print(f"✓ Datos escalados y divididos")
print(f"  Train: {X_train.shape}, Test: {X_test.shape}")
# %%
# =============================================================================
# 3. MODEL TRAINING (Items 5, 6, 7)
# =============================================================================
print("\n3. ENTRENANDO MODELOS...")

# Model 1: SVM with an RBF kernel.
# probability=True additionally fits the calibration needed for predict_proba.
print("Entrenando SVM...")
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Model 2: Neural network (two hidden layers with dropout, sigmoid output for
# binary classification).
print("Entrenando Red Neuronal...")
model_nn = keras.Sequential([
    # Explicit Input layer instead of `input_shape=` on the first Dense layer,
    # which newer Keras versions flag as deprecated for Sequential models.
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid')
])

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# `validation_split` is documented for NumPy arrays / tensors, so the target
# is handed over as a plain array rather than a pandas Series. The last 20% of
# the training rows are held out for validation.
history = model_nn.fit(X_train, np.asarray(y_train), epochs=100, batch_size=32, 
                      validation_split=0.2, verbose=0)

# Model 3: XGBoost
print("Entrenando XGBoost...")
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=6, 
                             learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

print("✓ Todos los modelos entrenados")
# %%
# =============================================================================
# 4. EVALUATION AND COMPARISON (Item 8)
# =============================================================================
print("\n4. EVALUANDO MODELOS...")

# Predictions
y_pred_svm = svm_model.predict(X_test)
# The network outputs one sigmoid probability per row with shape (n, 1).
# Threshold at 0.5 and flatten to (n,) so the labels have the same 1-D shape
# as the SVM/XGBoost predictions and cannot broadcast against y_test by
# accident in later comparisons.
y_pred_nn = (model_nn.predict(X_test) > 0.5).astype("int32").ravel()
y_pred_xgb = xgb_model.predict(X_test)

# Metrics
def evaluar_modelo(y_true, y_pred, nombre_modelo):
    """Print the accuracy and classification report of one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        nombre_modelo: Display name printed as the heading of the report.

    Returns:
        The accuracy computed by ``accuracy_score(y_true, y_pred)``.
    """
    exactitud = accuracy_score(y_true, y_pred)
    encabezado = (
        f"\n{nombre_modelo}:",
        f"Accuracy: {exactitud:.3f}",
        "Classification Report:",
    )
    # One line per heading entry, followed by the full per-class report.
    print(*encabezado, sep="\n")
    print(classification_report(y_true, y_pred))
    return exactitud

# Evaluate each model against the same held-out labels; the returned
# accuracies feed the comparison chart and the summary table.
acc_svm = evaluar_modelo(y_true=y_test, y_pred=y_pred_svm, nombre_modelo="SVM")
acc_nn = evaluar_modelo(y_true=y_test, y_pred=y_pred_nn, nombre_modelo="Red Neuronal")
acc_xgb = evaluar_modelo(y_true=y_test, y_pred=y_pred_xgb, nombre_modelo="XGBoost")
# %%
# =============================================================================
# 5. COMPARATIVE VISUALIZATIONS
# =============================================================================
print("\n5. CREANDO VISUALIZACIONES...")

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail with FileNotFoundError.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)

# Comparative accuracy bar chart
plt.figure(figsize=(10, 6))
models = ['SVM', 'Red Neuronal', 'XGBoost']
accuracies = [acc_svm, acc_nn, acc_xgb]

bars = plt.bar(models, accuracies, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
plt.title('Comparación de Accuracy entre Modelos', fontsize=16, fontweight='bold')
plt.ylabel('Accuracy', fontsize=12)
plt.ylim(0, 1)

# Annotate each bar with its value, centred just above the bar top
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{acc:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig(results_dir / 'accuracy_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Confusion matrices, one panel per model
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for y_pred, nombre, ax in zip(
    [y_pred_svm, y_pred_nn, y_pred_xgb],
    ['SVM', 'Red Neuronal', 'XGBoost'],
    axes
):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Matriz de Confusión - {nombre}')
    ax.set_xlabel('Predicho')
    ax.set_ylabel('Real')

plt.tight_layout()
plt.savefig(results_dir / 'confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Gráficas guardadas en la carpeta 'results'")
# %%
# =============================================================================
# 6. COMPARATIVE ANALYSIS (for the README)
# =============================================================================
print("\n6. RESUMEN DE RESULTADOS:")

# One record per model: (name, accuracy, advantages, disadvantages)
filas_resumen = [
    (
        'SVM',
        acc_svm,
        'Bueno con pocos datos, efectivo en alta dimensión',
        'Sensible a parámetros, lento con muchos datos',
    ),
    (
        'Red Neuronal',
        acc_nn,
        'Aprende patrones complejos, flexible',
        'Requiere más datos, costo computacional alto',
    ),
    (
        'XGBoost',
        acc_xgb,
        'Alto rendimiento, maneja missing values',
        'Puede overfittear sin regularización',
    ),
]

# Build the comparison DataFrame from the records
resultados_comparativos = pd.DataFrame(
    filas_resumen,
    columns=['Modelo', 'Accuracy', 'Ventajas', 'Desventajas'],
)

print(resultados_comparativos)
print("\n✓ Análisis listo para el README.md")