In [None]:
from evopt import EvolutionaryOptimizer

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore')
# Load the dataset from a CSV file in the working directory.
df = pd.read_csv('diabetes.csv')

# DIABETES case: the column named 'target' is the label, every other column
# is a feature.
# CALIFORNIA case (other dataset): set target_column = 'MedHouseVal' instead.
target_column = 'target'
y = df[target_column].values
X = df.drop(target_column, axis=1).values

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Banner with the dataset and split sizes.
rule = '=' * 70
n_rows, n_features = X.shape
print(f"\n{rule}", "EVALUACIÓN DEL SISTEMA", rule, sep="\n")
print(f"Dataset: {n_rows} instancias, {n_features} features")
print(f"Train: {len(X_train)} | Test: {len(X_test)}")
print(f"{rule}\n")

# ========================================================================
# BASELINE: model without optimization
# ========================================================================
rule = '=' * 70
print(f"\n{rule}", "BASELINE (Sin Optimización)", rule, sep="\n")

# NOTE(review): the scaled matrices are computed here but nothing in this
# block reads them -- the Ridge baseline below is fitted and scored on the
# raw X_train / X_test. Confirm whether the baseline was meant to use the
# scaled data or whether these three lines can be dropped.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the reference Ridge model and predict on the held-out split.
baseline = Ridge(alpha=1.0, random_state=42)
baseline.fit(X_train, y_train)
baseline_preds = baseline.predict(X_test)

# Test-set errors of the baseline: MAE first, then MSE.
baseline_mae, baseline_mse = (
    metric(y_test, baseline_preds)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(
    f"MAE: {baseline_mae:.4f}",
    f"MSE: {baseline_mse:.4f}",
    f"Features utilizadas: {X_train.shape[1]}",
    sep="\n",
)

# ========================================================================
# OPTIMIZATION WITH GENETIC PROGRAMMING + FEATURE SELECTION
# ========================================================================
rule = '=' * 70
print(f"\n{rule}", "OPTIMIZACIÓN EVOLUTIVA", rule, sep="\n")

# Build the optimizer with a budget of 60 * 60 = 3600. The recorded run
# reports 42.0min (2520.0s) for GP plus 18.0min (1080.0s) for feature
# selection, i.e. 60 minutes in total -- adjust as needed.
gp_optimizer = EvolutionaryOptimizer(maxtime=60 * 60)

# Fit the optimizer on the training split only.
gp_optimizer.fit(X_train, y_train)

# Apply the fitted optimizer to both splits (train first, then test).
X_train_optimized, X_test_optimized = (
    gp_optimizer.transform(split) for split in (X_train, X_test)
)

# Report the feature-selection outcome when the optimizer exposes a mask.
if gp_optimizer.feature_selection_ is not None:
    selection_mask = gp_optimizer.feature_selection_
    n_selected = np.sum(selection_mask)
    n_total = len(selection_mask)
    print(f"Features seleccionadas: {n_selected}/{n_total}")

    # List every selected column. Indices below the original column count
    # are printed as X<idx>; higher indices are looked up in best_trees_.
    print("\nFeatures seleccionadas:")
    n_original = X.shape[1]
    selected_indices = np.nonzero(selection_mask)[0]
    for idx in selected_indices:
        if idx < n_original:
            print(f"  X{idx} (original)")
            continue
        tree_idx = idx - n_original
        # Indices past the end of best_trees_ are skipped without output.
        if tree_idx < len(gp_optimizer.best_trees_):
            print(f"  {gp_optimizer.best_trees_[tree_idx].to_string()} (generada)")


EVALUACIÓN DEL SISTEMA
Dataset: 442 instancias, 10 features
Train: 353 | Test: 89


BASELINE (Sin Optimización)
MAE: 46.1389
MSE: 3077.4159
Features utilizadas: 10

OPTIMIZACIÓN EVOLUTIVA

PROGRAMACIÓN GENÉTICA
Población: 100 | Features a crear: 4
Profundidad máxima: 5
Modelo de evaluación: RIDGE
Tiempo asignado GP: 42.0min (2520.0s)
Tiempo asignado FS: 18.0min (1080.0s)

Gen 1 - MEJORA! Val: 2383.0965 | Train: 3397.9592
Gen 2 - MEJORA! Val: 2239.7171 | Train: 3307.4348
Gen 4 - MEJORA! Val: 2225.5233 | Train: 3332.9018
Gen 5 - MEJORA! Val: 2174.9740 | Train: 3226.5790
Gen 6 - MEJORA! Val: 2143.9258 | Train: 3236.3550
Gen 10 - MEJORA! Val: 2095.4673 | Train: 3248.0385
Gen 50 | Val: 2095.4673 | Train: 3248.0385 | Tiempo GP: 2.2min | Early stop: 40
Gen 100 | Val: 2095.4673 | Train: 3248.0385 | Tiempo GP: 4.4min | Early stop: 90
GP Early stopping en generación 110

Programación Genética completada en 110 generaciones (4.8min)
Mejor MSE: 2143.8848 | Mejor MAE: 37.7053
Mejores árboles encon

In [None]:
# Importar las funciones y librerías adicionales
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.linear_model import Lasso, ElasticNet, BayesianRidge, HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import clone
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
import pandas as pd

# Extra regressors to benchmark, keyed by the name shown in the report.
# Insertion order is the order in which the models are evaluated.
SEED = 42
additional_models = dict(
    Ridge=Ridge(alpha=1.0, random_state=SEED),
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(n_jobs=-1, random_state=SEED),
    SVR=SVR(),
    XGBoost=XGBRegressor(n_jobs=-1, random_state=SEED, verbosity=0),
    GradientBoosting=GradientBoostingRegressor(random_state=SEED),
    Lasso=Lasso(alpha=1.0, random_state=SEED),
    ElasticNet=ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=SEED),
    BayesianRidge=BayesianRidge(),
    HuberRegressor=HuberRegressor(),
    KNeighbors=KNeighborsRegressor(n_neighbors=5),
    DecisionTree=DecisionTreeRegressor(random_state=SEED),
    ExtraTrees=ExtraTreesRegressor(n_estimators=100, random_state=SEED, n_jobs=-1),
    AdaBoost=AdaBoostRegressor(n_estimators=100, random_state=SEED),
    Bagging=BaggingRegressor(n_estimators=100, random_state=SEED, n_jobs=-1),
    MLP=MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=SEED),
    KernelRidge=KernelRidge(alpha=1.0),
)

rule = '=' * 80
print(f"\n{rule}", "PROBANDO MODELOS ADICIONALES", rule, sep="\n")

# One dict per successfully evaluated model; consumed by the summary below.
results = []

for name, model in additional_models.items():
    print(f"\nProbando {name}...")
    try:
        # Baseline: a fresh clone trained on the untransformed features.
        model_baseline = clone(model)
        model_baseline.fit(X_train, y_train)
        baseline_preds = model_baseline.predict(X_test)

        baseline_mae = mean_absolute_error(y_test, baseline_preds)
        baseline_mse = mean_squared_error(y_test, baseline_preds)

        # Optimized: another fresh clone trained on the transformed features.
        model_optimized = clone(model)
        model_optimized.fit(X_train_optimized, y_train)
        optimized_preds = model_optimized.predict(X_test_optimized)

        optimized_mae = mean_absolute_error(y_test, optimized_preds)
        optimized_mse = mean_squared_error(y_test, optimized_preds)

        # Relative error reduction in percent; positive means the
        # transformed features gave a lower error than the baseline.
        mae_improvement = (baseline_mae - optimized_mae) / baseline_mae * 100
        mse_improvement = (baseline_mse - optimized_mse) / baseline_mse * 100

        row = {
            'Modelo': name,
            'MAE_Base': baseline_mae,
            'MAE_Opt': optimized_mae,
            'Mejora_MAE': mae_improvement,
            'MSE_Base': baseline_mse,
            'MSE_Opt': optimized_mse,
            'Mejora_MSE': mse_improvement,
        }
        results.append(row)

        print(f"  MAE: {baseline_mae:.4f} → {optimized_mae:.4f} ({mae_improvement:+.2f}%)")
        print(f"  MSE: {baseline_mse:.4f} → {optimized_mse:.4f} ({mse_improvement:+.2f}%)")
    except Exception as e:
        # Best effort: report the failure and move on to the next model.
        print(f"  Error: {e}")

# Print the summary table and overall statistics for all evaluated models.
print(f"\n{'='*100}")
print("RESUMEN COMPLETO - TODOS LOS MODELOS")
print(f"{'='*100}")

df_results = pd.DataFrame(results)

if df_results.empty:
    # Every model failed during evaluation, so the frame has no columns and
    # sorting by 'Mejora_MSE' would raise KeyError. Report it and stop here.
    df_sorted = df_results
    print("No hay resultados: ningún modelo pudo evaluarse.")
else:
    # Largest MSE improvement first.
    df_sorted = df_results.sort_values('Mejora_MSE', ascending=False)

    # Keep at least the original 15-character name column, but widen it when
    # a model name is longer (e.g. 'LinearRegression' has 16 characters).
    name_width = max(15, int(df_sorted['Modelo'].str.len().max()) + 1)

    print(f"{'Modelo':<{name_width}} {'MAE Base':<10} {'MAE Opt':<10} {'Mejora MAE':<12} "
          f"{'MSE Base':<10} {'MSE Opt':<10} {'Mejora MSE':<12}")
    print("-" * 100)

    for _, row in df_sorted.iterrows():
        # Format the signed percentage first, then pad the finished text.
        # The previous spec ':+<12.2f' used '+' as the FILL character, which
        # printed values such as '5.23++++++++%' with no sign at all.
        mae_gain = f"{row['Mejora_MAE']:+.2f}%"
        mse_gain = f"{row['Mejora_MSE']:+.2f}%"
        print(f"{row['Modelo']:<{name_width}} {row['MAE_Base']:<10.4f} {row['MAE_Opt']:<10.4f} "
              f"{mae_gain:<12} {row['MSE_Base']:<10.4f} {row['MSE_Opt']:<10.4f} "
              f"{mse_gain:<12}")

    # Overall statistics across the evaluated models.
    mae_gains = df_sorted['Mejora_MAE']
    mse_gains = df_sorted['Mejora_MSE']
    n_models = len(df_sorted)
    best_mae_model = df_sorted.loc[mae_gains.idxmax(), 'Modelo']
    best_mse_model = df_sorted.loc[mse_gains.idxmax(), 'Modelo']

    print(f"\n{'='*60}")
    print("ESTADÍSTICAS GENERALES")
    print(f"{'='*60}")
    print(f"Modelos que mejoraron MAE: {int((mae_gains > 0).sum())}/{n_models}")
    print(f"Modelos que mejoraron MSE: {int((mse_gains > 0).sum())}/{n_models}")
    print(f"Mejor mejora MAE: {mae_gains.max():.2f}% ({best_mae_model})")
    print(f"Mejor mejora MSE: {mse_gains.max():.2f}% ({best_mse_model})")
    print(f"Mejora promedio MAE: {mae_gains.mean():.2f}%")
    print(f"Mejora promedio MSE: {mse_gains.mean():.2f}%")