# 📊 Parcial 1 TAM 2025-1 – Pregunta 2
**Nombre:** Edgar Ivan Calpa Cuacialpud  
**Universidad Nacional de Colombia - Sede Manizales**

---

## 🧩 1. Cargar datos y bibliotecas
## 🔍 2. Análisis exploratorio y limpieza
## 🧼 3. Preprocesamiento
## 📦 4. Modelado base (Lasso como ejemplo)
## 🔁 5. Replicación para otros modelos
## 📈 6. Tabla de comparación de resultados
## 🧠 7. Conclusiones



Código completo para un modelo: Lasso con GridSearchCV

In [None]:
# 1. Cargar librerías
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Load the data (the Kaggle CSV is expected to be uploaded to Colab already)
df = pd.read_csv("/content/sample_data/AmesHousing.csv")  # Adjust if your file has a different name

# 3. Drop columns with too many missing values (optional)
null_percent = df.isnull().mean()
df = df.drop(columns=null_percent[null_percent > 0.3].index)

# Rows without a target cannot be used to train or evaluate a regressor;
# drop them instead of imputing the label.
df = df.dropna(subset=['SalePrice'])

# 4. Split inputs and target
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# 5. Collect numeric and categorical feature names
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

# 6. Train/test split. This happens BEFORE imputation so that no statistic
#    computed on the test rows leaks into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Basic imputation (simplified; KNN or similar could improve it).
#    Numeric medians are computed on the training rows only and reused for
#    the test rows; categorical gaps become the explicit level "Missing".
#    Note: `df` and `X` keep their missing values; only X_train / X_test are
#    imputed. Metrics recorded from runs that imputed before splitting may
#    differ slightly from what this cell now produces.
train_medians = X_train[num_features].median()
fill_values = {**train_medians.to_dict(), **{col: "Missing" for col in cat_features}}
X_train = X_train.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

# 8. Preprocessing with ColumnTransformer: scale numeric columns, one-hot
#    encode categorical ones (categories unseen during fit are ignored)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# 9. Pipeline with a Lasso model
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', Lasso(max_iter=10000))  # More iterations to avoid convergence warnings
])

# 10. Search space
param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
}

# 11. GridSearchCV with cross-validation on the training split
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_train)

# 12. Evaluate the best pipeline on the held-out test data
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Mean absolute percentage error, expressed in percent
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("📊 Resultados con Lasso:")
print(f"Mejor alpha: {grid.best_params_['model__alpha']}")
print(f"MAE:  {mae:.2f}")
print(f"MSE:  {mse:.2f}")
print(f"R2:   {r2:.4f}")
print(f"MAPE: {mape:.2f}%")


📊 Resultados con Lasso:
Mejor alpha: 10.0
MAE:  16025.79
MSE:  823773334.65
R2:   0.8973
MAPE: 8.71%



🧩 CÓDIGO ADAPTADO – RandomForestRegressor


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest regressor chained after the shared preprocessing step
pipeline_rf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# Hyperparameter candidates; only max_depth has more than one option
param_grid_rf = dict(
    model__n_estimators=[100],
    model__max_depth=[10, None],
    model__min_samples_split=[2],
    model__min_samples_leaf=[1],
)

# Cross-validated grid search (cv=5) on the training split, scored by negative MSE
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

rel_err_rf = (y_test - y_pred_rf) / y_test  # signed relative error per test row
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mape_rf = np.mean(np.abs(rel_err_rf)) * 100  # in percent

print("🌲 Resultados con Random Forest:")
print(f"Mejores hiperparámetros: {grid_rf.best_params_}")
print(f"MAE:  {mae_rf:.2f}")
print(f"MSE:  {mse_rf:.2f}")
print(f"R2:   {r2_rf:.4f}")
print(f"MAPE: {mape_rf:.2f}%")


🌲 Resultados con Random Forest:
Mejores hiperparámetros: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
MAE:  15895.48
MSE:  698050695.01
R2:   0.9129
MAPE: 8.56%


MODELO 3: SVR (Support Vector Regression)
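El SVR requiere que los datos estén bien escalados. Las variables de entrada ya se escalan con StandardScaler en el pipeline; ten en cuenta que SVR también es sensible a la escala de la variable objetivo, porque epsilon se mide en las unidades de y (aquí, dólares de SalePrice) y el valor adecuado de C depende de su magnitud. Vamos a definir un espacio de búsqueda para C, gamma y epsilon, que son los hiperparámetros más relevantes en SVR.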

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.svm import SVR

# Pipeline with SVR.
# SVR is sensitive to the scale of the target as well as of the inputs:
# epsilon is measured in units of y, and C bounds how far the fit can move
# away from a flat function. With SalePrice left in raw dollars, the small
# C / epsilon values searched below barely let the model fit anything (the
# run recorded with the unscaled target scored R2 = -0.0872). Wrapping SVR
# in TransformedTargetRegressor standardizes y for fitting and maps the
# predictions back to the original units, so every metric below is still
# expressed in the units of y_test.
pipeline_svr = Pipeline([
    ('preprocess', preprocessor),
    ('model', TransformedTargetRegressor(regressor=SVR(), transformer=StandardScaler()))
])

# Search space. The SVR parameters now sit one level deeper
# (model -> regressor); epsilon is expressed in standardized target units.
param_grid_svr = {
    'model__regressor__C': [0.1, 1, 10],
    'model__regressor__gamma': ['scale', 0.01, 0.001],
    'model__regressor__epsilon': [0.1, 1.0]
}

# GridSearchCV with cross-validation on the training split
grid_svr = GridSearchCV(pipeline_svr, param_grid_svr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_svr.fit(X_train, y_train)

# Evaluate the best pipeline on the held-out test data
best_svr = grid_svr.best_estimator_
y_pred_svr = best_svr.predict(X_test)

mae_svr = mean_absolute_error(y_test, y_pred_svr)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)
# Mean absolute percentage error, expressed in percent
mape_svr = np.mean(np.abs((y_test - y_pred_svr) / y_test)) * 100

print("🔧 Resultados con SVR:")
print(f"Mejores hiperparámetros: {grid_svr.best_params_}")
print(f"MAE:  {mae_svr:.2f}")
print(f"MSE:  {mse_svr:.2f}")
print(f"R2:   {r2_svr:.4f}")
print(f"MAPE: {mape_svr:.2f}%")


🔧 Resultados con SVR:
Mejores hiperparámetros: {'model__C': 10, 'model__epsilon': 0.1, 'model__gamma': 0.01}
MAE:  62841.78
MSE:  8716924294.03
R2:   -0.0872
MAPE: 32.39%


🧩 CÓDIGO ADAPTADO – BayesianRidge

In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Preprocessor variant whose one-hot step returns a dense array
# (sparse_output=False) instead of a sparse matrix
preprocessor_dense = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
])

# Bayesian Ridge with its default settings on top of the dense preprocessor
pipeline_br = Pipeline(steps=[
    ('preprocess', preprocessor_dense),
    ('model', BayesianRidge()),
])

# Fit on the training split, then predict the held-out test split
y_pred_br = pipeline_br.fit(X_train, y_train).predict(X_test)

# Test-set metrics
rel_err_br = (y_test - y_pred_br) / y_test  # signed relative error per test row
mae_br = mean_absolute_error(y_test, y_pred_br)
mse_br = mean_squared_error(y_test, y_pred_br)
r2_br = r2_score(y_test, y_pred_br)
mape_br = np.mean(np.abs(rel_err_br)) * 100  # in percent

# Report
print("Resultados con Bayesian Ridge:")
print(f"MAE:  {mae_br:.2f}")
print(f"MSE:  {mse_br:.2f}")
print(f"R2:   {r2_br:.4f}")
print(f"MAPE: {mape_br:.2f}%")


Resultados con Bayesian Ridge:
MAE:  16665.23
MSE:  846936893.58
R2:   0.8944
MAPE: 8.86%


🧩 CÓDIGO ADAPTADO – ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

# ElasticNet after the shared preprocessing step; max_iter is raised to help convergence
pipeline_en = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', ElasticNet(max_iter=10000)),
])

# Candidates: regularization strength and the L1/L2 mixing ratio
param_grid_en = dict(
    model__alpha=[0.0001, 0.001, 0.01, 0.1, 1.0],
    model__l1_ratio=[0.1, 0.5, 0.9],
)

# Cross-validated grid search (cv=5) on the training split, scored by negative MSE
grid_en = GridSearchCV(
    estimator=pipeline_en,
    param_grid=param_grid_en,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the best pipeline found by the search on the held-out test split
best_en = grid_en.best_estimator_
y_pred_en = best_en.predict(X_test)

rel_err_en = (y_test - y_pred_en) / y_test  # signed relative error per test row
mae_en = mean_absolute_error(y_test, y_pred_en)
mse_en = mean_squared_error(y_test, y_pred_en)
r2_en = r2_score(y_test, y_pred_en)
mape_en = np.mean(np.abs(rel_err_en)) * 100  # in percent

print("🧪 Resultados con ElasticNet:")
print(f"Mejores hiperparámetros: {grid_en.best_params_}")
print(f"MAE:  {mae_en:.2f}")
print(f"MSE:  {mse_en:.2f}")
print(f"R2:   {r2_en:.4f}")
print(f"MAPE: {mape_en:.2f}%")


🧪 Resultados con ElasticNet:
Mejores hiperparámetros: {'model__alpha': 0.01, 'model__l1_ratio': 0.5}
MAE:  16616.11
MSE:  846759063.81
R2:   0.8944
MAPE: 8.82%


🧩 CÓDIGO – LinearRegression

LinearRegression (Regresión Lineal Múltiple por mínimos cuadrados ordinarios)
Este modelo no tiene hiperparámetros que ajustar, por lo que no hace falta una búsqueda con validación cruzada. Solo usamos el pipeline con preprocesamiento y entrenamos directamente.

In [None]:
from sklearn.linear_model import LinearRegression

# Plain linear regression after the shared preprocessing step
pipeline_lr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
])

# Fit on the training split, then predict the held-out test split
y_pred_lr = pipeline_lr.fit(X_train, y_train).predict(X_test)

# Test-set metrics
rel_err_lr = (y_test - y_pred_lr) / y_test  # signed relative error per test row
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
mape_lr = np.mean(np.abs(rel_err_lr)) * 100  # in percent

print("📏 Resultados con Linear Regression:")
print(f"MAE:  {mae_lr:.2f}")
print(f"MSE:  {mse_lr:.2f}")
print(f"R2:   {r2_lr:.4f}")
print(f"MAPE: {mape_lr:.2f}%")


📏 Resultados con Linear Regression:
MAE:  16360.42
MSE:  850872789.23
R2:   0.8939
MAPE: 9.24%


🧩 KernelRidge con validación cruzada
Hiperparámetros clave:
alpha: regularización (igual que en Ridge)

kernel: tipo de kernel ('linear', 'poly', 'rbf', etc.)

gamma: parámetro del kernel RBF (no tiene efecto cuando kernel='linear')

In [None]:
from sklearn.kernel_ridge import KernelRidge

# Pipeline with Kernel Ridge
pipeline_kr = Pipeline([
    ('preprocess', preprocessor),
    ('model', KernelRidge())
])

# Search space, one sub-grid per kernel.
# gamma only affects the 'rbf' kernel, so crossing it with 'linear' would
# refit the same linear model once per gamma value and report a gamma in
# best_params_ that had no influence on the result.
param_grid_kr = [
    {
        'model__kernel': ['linear'],
        'model__alpha': [0.01, 0.1, 1.0],
    },
    {
        'model__kernel': ['rbf'],
        'model__alpha': [0.01, 0.1, 1.0],
        'model__gamma': [0.01, 0.1, 1.0],
    },
]

# GridSearchCV with cross-validation on the training split
grid_kr = GridSearchCV(pipeline_kr, param_grid_kr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_kr.fit(X_train, y_train)

# Evaluate the best pipeline on the held-out test data
best_kr = grid_kr.best_estimator_
y_pred_kr = best_kr.predict(X_test)

mae_kr = mean_absolute_error(y_test, y_pred_kr)
mse_kr = mean_squared_error(y_test, y_pred_kr)
r2_kr = r2_score(y_test, y_pred_kr)
# Mean absolute percentage error, expressed in percent
mape_kr = np.mean(np.abs((y_test - y_pred_kr) / y_test)) * 100

print("💠 Resultados con Kernel Ridge:")
print(f"Mejores hiperparámetros: {grid_kr.best_params_}")
print(f"MAE:  {mae_kr:.2f}")
print(f"MSE:  {mse_kr:.2f}")
print(f"R2:   {r2_kr:.4f}")
print(f"MAPE: {mape_kr:.2f}%")


💠 Resultados con Kernel Ridge:
Mejores hiperparámetros: {'model__alpha': 1.0, 'model__gamma': 0.01, 'model__kernel': 'linear'}
MAE:  16528.96
MSE:  840916628.10
R2:   0.8951
MAPE: 8.91%


GaussianProcessRegressor

Consideraciones:
No escala bien con muchos datos (pero Ames Housing está en un rango manejable).

Solo acepta matrices densas, así que como hicimos con BayesianRidge, usaremos un OneHotEncoder con sparse_output=False.

El kernel más común y flexible es el RBF, combinado con WhiteKernel para el ruido.

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel

# Preprocessor whose one-hot step returns a dense array (sparse_output=False)
preprocessor_dense = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
])

# Kernel = constant * RBF + white noise, assembled from named parts
amplitude_term = C(1.0, (1e-3, 1e3))
rbf_term = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
noise_term = WhiteKernel()
kernel = amplitude_term * rbf_term + noise_term

# Gaussian process regressor on top of the dense preprocessor
pipeline_gpr = Pipeline(steps=[
    ('preprocess', preprocessor_dense),
    ('model', GaussianProcessRegressor(kernel=kernel, alpha=0.0, normalize_y=True)),
])

# Fit on the training split, then predict the held-out test split
y_pred_gpr = pipeline_gpr.fit(X_train, y_train).predict(X_test)

# Test-set metrics
rel_err_gpr = (y_test - y_pred_gpr) / y_test  # signed relative error per test row
mae_gpr = mean_absolute_error(y_test, y_pred_gpr)
mse_gpr = mean_squared_error(y_test, y_pred_gpr)
r2_gpr = r2_score(y_test, y_pred_gpr)
mape_gpr = np.mean(np.abs(rel_err_gpr)) * 100  # in percent

print("📈 Resultados con Gaussian Process Regressor:")
print(f"MAE:  {mae_gpr:.2f}")
print(f"MSE:  {mse_gpr:.2f}")
print(f"R2:   {r2_gpr:.4f}")
print(f"MAPE: {mape_gpr:.2f}%")


📈 Resultados con Gaussian Process Regressor:
MAE:  13528.09
MSE:  879248832.48
R2:   0.8903
MAPE: 7.41%


| Modelo                | MAE       | MSE               | R²      | MAPE     | Hiperparámetros principales                    |
|-----------------------|-----------|-------------------|---------|----------|-----------------------------------------------|
| **Lasso**             | 16,025.79 | 823,773,334.65    | 0.8973  | 8.71%    | alpha=10.0                                 |
| **Random Forest**     | 15,895.48 | **698,050,695.01** | **0.9129** | 8.56%    | n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1 |
| **SVR**              | 62,841.78 | 8,716,924,294.03  | -0.0872 | 32.39%   | C=10, gamma=0.01, epsilon=0.1          |
| **Bayesian Ridge**   | 16,665.23 | 846,936,893.58    | 0.8944  | 8.86%    | —                                           |
| **ElasticNet**       | 16,616.11 | 846,759,063.81    | 0.8944  | 8.82%    | alpha=0.01, l1_ratio=0.5                |
| **Linear Regression**| 16,360.42 | 850,872,789.23    | 0.8939  | 9.24%    | —                                           |
| **Kernel Ridge**     | 16,528.96 | 840,916,628.10    | 0.8951  | 8.91%    | alpha=1.0, kernel='linear' (gamma=0.01 no tiene efecto con kernel lineal) |
| **Gaussian Process** | **13,528.09** | 879,248,832.48 | 0.8903  | **7.41%**| kernel=RBF + WhiteKernel, normalize_y=True |