<a href="https://colab.research.google.com/github/jdmartinev/ST1613-AppliedML-/blob/main/Semana03/Regresi%C3%B3n_lineal_housing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1woWEpcwkCYSdRSTfSTT2j17JinrTMJOu' -O housing.csv

--2023-04-19 22:14:26--  https://docs.google.com/uc?export=download&id=1woWEpcwkCYSdRSTfSTT2j17JinrTMJOu
Resolving docs.google.com (docs.google.com)... 172.253.122.101, 172.253.122.100, 172.253.122.139, ...
Connecting to docs.google.com (docs.google.com)|172.253.122.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-10-6c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/q9ge8g16dk1n10eu4t74cipklqmg22ms/1681942425000/15315348669826032119/*/1woWEpcwkCYSdRSTfSTT2j17JinrTMJOu?e=download&uuid=190642a8-972b-4ed9-b96c-499dda903f03 [following]
--2023-04-19 22:14:26--  https://doc-10-6c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/q9ge8g16dk1n10eu4t74cipklqmg22ms/1681942425000/15315348669826032119/*/1woWEpcwkCYSdRSTfSTT2j17JinrTMJOu?e=download&uuid=190642a8-972b-4ed9-b96c-499dda903f03
Resolving doc-10-6c-docs.googleusercontent.com (doc-10-6c-docs.googleusercontent.com)... 142.251.16.132, 2607:f

In [None]:
import sys
import time

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import numpy as np
import pandas as pd
from IPython.display import HTML

sys.path.append("code/.")

#import mglearn
from IPython.display import display
#from plotting_functions import *


# Preprocessing and pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import reciprocal

# train test split and cross validation
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    PolynomialFeatures,
)
pd.set_option("display.max_colwidth", 200)

In [None]:
# Seed value for the notebook; applied here to NumPy's legacy global random
# number generator.
random_state = 42
np.random.seed(seed=random_state)

In [None]:
def add_household_ratios(df):
    """Return a copy of ``df`` with per-household ratio features appended.

    Adds, in this order, the columns ``rooms_per_household``,
    ``bedrooms_per_household`` and ``population_per_household``, each computed
    by dividing the corresponding total column by ``households``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``total_rooms``, ``total_bedrooms``, ``population`` and
        ``households``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    households = df["households"]
    return df.assign(
        rooms_per_household=df["total_rooms"] / households,
        bedrooms_per_household=df["total_bedrooms"] / households,
        population_per_household=df["population"] / households,
    )


# Load the raw data
housing_df = pd.read_csv("housing.csv")

# Train/test partitions (80% / 20%)
train_df, test_df = train_test_split(housing_df, test_size=0.20, random_state=123)

# Engineer the same derived features on both partitions through a single
# helper, so train and test can never drift apart.
train_df = add_household_ratios(train_df)
test_df = add_household_ratios(test_df)

# Separate the features from the target variable
X_train = train_df.drop(columns=["median_house_value"])
y_train = train_df["median_house_value"]

X_test = test_df.drop(columns=["median_house_value"])
y_test = test_df["median_house_value"]

# Identify categorical and numeric feature columns from the training dtypes
cat_cols = X_train.select_dtypes(include=object).columns
num_cols = X_train.select_dtypes(include=np.number).columns

In [None]:
# Pre-processing pipeline.

# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_steps = [("encoder", OneHotEncoder(handle_unknown="ignore"))]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)


In [None]:
# Ridge regressor on polynomial features of the pre-processed data
ridge_base = Ridge()
clf_ridge = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(include_bias=True)),
        ("regressor", ridge_base),
    ]
)
# Parameter distributions to sample from: polynomial degree 1-3 and a
# log-uniform regularization strength.
param_distributions = {
    "poly__degree": list(range(1, 4)),
    "regressor__alpha": reciprocal(1e-5, 1e3),
}
# Randomized search with 5-fold cross-validation. random_state is passed
# explicitly so the sampled candidates do not depend on the global NumPy RNG
# state (and therefore not on the order in which cells were executed).
search_ridge = RandomizedSearchCV(
    clf_ridge,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=random_state,
)
# Fit every sampled candidate and refit the best one on the full training set
search_ridge.fit(X_train, y_train)
print(search_ridge)
print(search_ridge.best_params_)
# Test-set R^2 and MAE for the ridge model
print('Modelo ridge')
print(f'R^2: {search_ridge.score(X_test, y_test)}')
print(f'MAE: {mean_absolute_error(y_test, search_ridge.predict(X_test))}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Modelo ridge
R^2: 0.6476159989658351
MAE: 48978.12634480735


In [None]:
# Lasso regressor on polynomial features of the pre-processed data
lasso_base = Lasso()
clf_lasso = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(include_bias=True)),
        ("regressor", lasso_base),
    ]
)
# Parameter grid to sample from
param_distributions = {
    "poly__degree": list(range(1, 4)),
    # Regularization strength; smaller values = weaker regularization
    "regressor__alpha": np.logspace(-4, 4, 20),
}
# Randomized search with 5-fold cross-validation. random_state is passed
# explicitly so the sampled candidates do not depend on the global NumPy RNG
# state (and therefore not on the order in which cells were executed).
search_lasso = RandomizedSearchCV(
    clf_lasso,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    verbose=1,
    n_jobs=4,
    random_state=random_state,
)
# Fit every sampled candidate and refit the best one on the full training set.
# NOTE(review): the recorded output of this cell shows a warning raised from
# the coordinate-descent solver; consider raising Lasso's max_iter if it
# persists (at the cost of longer fits).
search_lasso.fit(X_train, y_train)
print(search_lasso)
print(search_lasso.best_params_)
# Test-set R^2 and MAE for the Lasso model
print('Modelo Lasso')
print(f'R^2: {search_lasso.score(X_test, y_test)}')
print(f'MAE: {mean_absolute_error(y_test, search_lasso.predict(X_test))}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Modelo ridge
R^2: 0.6476177498261906
MAE: 48976.93150054957


  model = cd_fast.enet_coordinate_descent(


In [None]:
# Elastic-net regressor on polynomial features of the pre-processed data
elnet_base = ElasticNet()
clf_elnet = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("poly", PolynomialFeatures(include_bias=True)),
        ("regressor", elnet_base),
    ]
)
# Parameter grid to sample from
param_distributions = {
    "poly__degree": list(range(1, 4)),
    # Regularization strength; smaller values = weaker regularization
    "regressor__alpha": np.logspace(-4, 4, 20),
    # Mix between the L2 penalty (0) and the L1 penalty (1)
    "regressor__l1_ratio": np.linspace(0, 1, 10),
}
# Randomized search with 5-fold cross-validation. random_state is passed
# explicitly so the sampled candidates do not depend on the global NumPy RNG
# state (and therefore not on the order in which cells were executed).
search_elnet = RandomizedSearchCV(
    clf_elnet,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=random_state,
)
# Fit every sampled candidate and refit the best one on the full training set
search_elnet.fit(X_train, y_train)
print(search_elnet)
print(search_elnet.best_params_)
# Test-set R^2 and MAE for the elastic-net model
print('Modelo Elastic net')
print(f'R^2: {search_elnet.score(X_test, y_test)}')
print(f'MAE: {mean_absolute_error(y_test, search_elnet.predict(X_test))}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Modelo ridge
R^2: 0.6476671572815367
MAE: 48984.56459126855


  model = cd_fast.enet_coordinate_descent(


In [None]:
# Neural-network (multi-layer perceptron) regressor on the pre-processed data.
# random_state fixes the weight initialization so reruns are comparable.
mlp_base = MLPRegressor(random_state=random_state)
clf_mlp = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", mlp_base)])
# Parameter grid to sample from. The solver is left at MLPRegressor's default
# ('adam'); `learning_rate` only takes effect with solver='sgd', so it is not
# searched here. To explore SGD, add "regressor__solver" and
# "regressor__learning_rate" together.
param_distributions = {
    "regressor__hidden_layer_sizes": [(50, 50), (100, 100), (50, 100, 50)],
    "regressor__activation": ["relu", "tanh", "logistic"],
    "regressor__alpha": np.logspace(-5, 3, 9),
    "regressor__max_iter": [200, 500, 1000],
}
# Randomized search with 5-fold cross-validation over the MLP pipeline
# (clf_mlp). random_state makes the sampled candidates reproducible.
search_mlp = RandomizedSearchCV(
    clf_mlp,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=random_state,
)
# Fit every sampled candidate and refit the best one on the full training set
search_mlp.fit(X_train, y_train)
# Best hyper-parameters found for the MLP model (printed explicitly: a bare
# expression in the middle of a cell is not displayed)
print(search_mlp.best_params_)
# Test-set R^2 and MAE for the MLP model
print('Modelo MLP')
print(f'R^2: {search_mlp.score(X_test, y_test)}')
print(f'MAE: {mean_absolute_error(y_test, search_mlp.predict(X_test))}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Modelo ridge
R^2: 0.7285636349519824
MAE: 41705.67151704049


