# Importaciones necesarias

In [2]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, max_error, r2_score

# Proceso

In [4]:
# Download the Ames Housing dataset and load it into a DataFrame.
path = kagglehub.dataset_download("prevek18/ames-housing-dataset")
df = pd.read_csv(f"{path}/AmesHousing.csv")

# Separate the regression target from the feature columns.
y = df['SalePrice']
df = df.drop(columns='SalePrice')

# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = [c for c in df.columns if df[c].dtype == "object"]
numerical_cols = [c for c in df.columns if df[c].dtype in ["int64", "float64"]]

# Quick overview: frame shape, number of columns of each kind, number of
# columns containing at least one missing value, and total missing cells.
print(df.shape)
print(len(categorical_cols))
print(len(numerical_cols))
print(df.isnull().any().sum())
print(df.isnull().sum().sum())

# NOTE(review): the imputers and the encoder below are fitted on the full
# dataset *before* the train/validation split, so validation rows influence
# the imputed means/modes and the set of encoded categories. Fitting them on
# the training split only (e.g. with a Pipeline + ColumnTransformer) would
# avoid this leakage; the order is left unchanged here so the reported
# numbers stay reproducible.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# The imputers return plain arrays. Rebuild the frames with df's own index so
# the column assignment aligns row-by-row even if df's index is not the
# default 0..n-1 range (e.g. after filtering rows).
df[numerical_cols] = pd.DataFrame(
    num_imputer.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)
df[categorical_cols] = pd.DataFrame(
    cat_imputer.fit_transform(df[categorical_cols]),
    columns=categorical_cols,
    index=df.index,
)

# One-hot encode the categorical columns into a dense frame whose column
# names come from the encoder and whose index matches df.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cat = pd.DataFrame(
    encoder.fit_transform(df[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Final design matrix: imputed numerical columns followed by the one-hot
# encoded categorical columns.
X = pd.concat([df[numerical_cols], encoded_cat], axis=1)
print(X.shape)

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=0)
print(X_train.shape)

def evaluar_modelo(modelo):
    """Fit ``modelo`` on the training split and score it on the validation split.

    The model is trained with the module-level ``X_train``/``y_train`` and its
    predictions on ``X_valid`` are compared against ``y_valid``.

    Args:
        modelo: An estimator object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A list with five rounded metrics, in this order: max error (0
        decimals), median absolute error, mean absolute error, R2 score and
        RMSE (2 decimals each).
    """
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_valid)

    # Each entry pairs a metric value with the number of decimals to keep.
    # The order here defines the order of the returned list.
    puntajes = (
        (max_error(y_valid, y_pred), 0),
        (median_absolute_error(y_valid, y_pred), 2),
        (mean_absolute_error(y_valid, y_pred), 2),
        (r2_score(y_valid, y_pred), 2),
        # RMSE is the square root of the mean squared error.
        (np.sqrt(mean_squared_error(y_valid, y_pred)), 2),
    )
    return [round(valor, decimales) for valor, decimales in puntajes]

# Single catalogue of (display name, estimator) pairs. Keeping both in one
# place guarantees that each name stays matched to its model.
_catalogo = [
    ('Decision Tree', DecisionTreeRegressor(random_state=0)),
    ('Linear Regression', LinearRegression()),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=0)),
    ('Random Forest', RandomForestRegressor(random_state=0)),
    ('SVR', SVR()),
    ('K-Neighbors', KNeighborsRegressor()),
]

# Model display names, in evaluation order.
nombres_modelos = [nombre for nombre, _ in _catalogo]

# Estimator instances, in the same order as nombres_modelos.
modelos = [estimador for _, estimador in _catalogo]

# Metric labels, in the same order as the list returned by evaluar_modelo.
metricas = ['Max Error', 'Median AE', 'MAE', 'R2 Score', 'RMSE']

# Train and score every model; one list of metrics per model.
resultados = list(map(evaluar_modelo, modelos))

# Results table: one row per model, one column per metric.
df_resultados = pd.DataFrame(resultados, index=nombres_modelos, columns=metricas)

print(df_resultados)

(2930, 81)
43
38
27
15749
(2930, 305)
(2344, 305)
                   Max Error  Median AE       MAE  R2 Score      RMSE
Decision Tree       425250.0   19000.00  27610.66      0.71  43366.33
Linear Regression   575528.0   10961.52  17118.22      0.79  37249.79
Gradient Boosting   408928.0    9920.03  15476.49      0.87  29198.25
Random Forest       340620.0   11511.44  17139.56      0.86  30111.59
SVR                 464887.0   35499.00  57204.49     -0.08  84260.99
K-Neighbors         184400.0   17077.00  27694.45      0.74  41086.36
