In [2]:
from pycaret.regression import setup, compare_models, predict_model, save_model
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Cargamos el dataset de California Housing. 
data = fetch_california_housing(as_frame=True)
df = data.frame

In [4]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# inicializamos el entorno de PyCaret para un problema de regresión
reg = setup(data=train_df, target='MedHouseVal')

Unnamed: 0,Description,Value
0,Session id,1189
1,Target,MedHouseVal
2,Target type,Regression
3,Original data shape,"(16512, 9)"
4,Transformed data shape,"(16512, 9)"
5,Transformed train set shape,"(11558, 9)"
6,Transformed test set shape,"(4954, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [7]:
# para entrenar modelos, compararlos y obtener el mejor
best_model = compare_models()
print(best_model)

LGBMRegressor(n_jobs=-1, random_state=1189)


In [9]:
best_model

In [8]:
# hacer predicciones sobre los datos de prueba
predictions = predict_model(best_model, data=test_df)
print(predictions.head())

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.3135,0.2186,0.4676,0.8332,0.1426,0.1825


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup   Latitude  \
20046  1.6812      25.0  4.192201   1.022284      1392.0  3.877437  36.060001   
3024   2.5313      30.0  5.039383   1.193493      1565.0  2.679795  35.139999   
15663  3.4801      52.0  3.977155   1.185877      1310.0  1.360332  37.799999   
20484  5.7376      17.0  6.163636   1.020202      1705.0  3.444444  34.279999   
9814   3.7250      34.0  5.492990   1.028037      1063.0  2.483645  36.619999   

        Longitude  MedHouseVal  prediction_label  
20046 -119.010002      0.47700          0.616732  
3024  -119.459999      0.45800          0.939236  
15663 -122.440002      5.00001          4.816205  
20484 -118.720001      2.18600          2.308346  
9814  -121.930000      2.78000          2.359973  


In [None]:
## metricas en test
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_test = test_df['MedHouseVal']
y_pred = predictions['prediction_label']

print("MSE:", round(mean_squared_error(y_test, y_pred), 4))
print("MAE:", round(mean_absolute_error(y_test, y_pred), 4))
print("R2:", round(r2_score(y_test, y_pred), 4))

MSE: 0.2204
MAE: 0.3147
R2: 0.8318


In [None]:
# Guardar el modelo entrenado
save_model(best_model, 'best_california_housing_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['MedInc', 'HouseAge', 'AveRooms',
                                              'AveBedrms', 'Population',
                                              'AveOccup', 'Latitude',
                                              'Longitude'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model', LGBMRegressor(n_jobs=-1, random_state=520))]),
 'best_california_housing_model.pkl')

In [None]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
###### agregamos nans al dataset para ver como se comporta el modelo
import numpy as np

def add_nan_values(df, column_name, num_nans):
    """
    Añade valores NaN a una columna específica de un DataFrame.
    
    :param df: DataFrame al que se le añadirán los NaN
    :param column_name: Nombre de la columna a la que se le añadirán los NaN
    :param num_nans: Número de NaN a añadir
    """
    # sselecccionamos índices aleatorios para añadir NaN
    nan_indices = np.random.choice(df.index, size=num_nans, replace=False)
    df.loc[nan_indices, column_name] = np.nan
    return df

df = add_nan_values(df, 'HouseAge', 50)
df = add_nan_values(df, 'MedInc', 500)
df = add_nan_values(df, 'Latitude', 100)
df = add_nan_values(df, 'Longitude', 1000)

In [None]:
df.isna().sum() #chequeamos haya agregado correctamente la cantidad de nans.

MedInc          500
HouseAge         50
AveRooms          0
AveBedrms         0
Population        0
AveOccup          0
Latitude        100
Longitude      1000
MedHouseVal       0
dtype: int64

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# inicializamos el entorno de PyCaret para un problema de regresión
reg = setup(data=df_train, target='MedHouseVal', session_id=42, 
            imputation_type='iterative')


# para entrenar modelos, compararlos y obtener el mejor
best_model = compare_models()
print(best_model)

# hacer predicciones sobre los datos de prueba
predictions = predict_model(best_model, data=df_test)
print(predictions.head())

# metricas en test
y_test = test_df['MedHouseVal']
y_pred = predictions['prediction_label']
print("MSE:", round(mean_squared_error(y_test, y_pred), 4))
print("MAE:", round(mean_absolute_error(y_test, y_pred), 4))
print("R2:", round(r2_score(y_test, y_pred), 4))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,000296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 11527, number of used features: 7
[LightGBM] [Info] Start training from score 28,544374
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,000301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1587
[LightGBM] [Info] Number of data points in the train set: 11504, number of used features: 7
[LightGBM] [Info] Start training from score 35,643401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,000323 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1586
[LightGBM] [Info] Number of data points in the train set: 11276, number of used features: 7
[LightGBM] [Info] Start trai

Unnamed: 0,Description,Value
0,Session id,42
1,Target,MedHouseVal
2,Target type,Regression
3,Original data shape,"(16512, 9)"
4,Transformed data shape,"(16512, 9)"
5,Transformed train set shape,"(11558, 9)"
6,Transformed test set shape,"(4954, 9)"
7,Numeric features,8
8,Rows with missing values,7.9%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.3246,0.2345,0.4835,0.8233,0.1471,0.1839,0.937
xgboost,Extreme Gradient Boosting,0.3297,0.2437,0.4931,0.8163,0.1501,0.1852,0.812
et,Extra Trees Regressor,0.3483,0.275,0.5238,0.7927,0.1555,0.1946,1.733
rf,Random Forest Regressor,0.349,0.2791,0.5276,0.7897,0.1574,0.1964,3.457
gbr,Gradient Boosting Regressor,0.3746,0.2927,0.5403,0.7794,0.1653,0.2147,2.231
dt,Decision Tree Regressor,0.4854,0.5731,0.7564,0.5681,0.2212,0.2619,0.628
ada,AdaBoost Regressor,0.6924,0.6825,0.8237,0.4853,0.2772,0.5013,0.987
en,Elastic Net,0.6815,0.7716,0.878,0.4182,0.2785,0.4543,0.443
llar,Lasso Least Angle Regression,0.772,0.961,0.9801,0.2753,0.3095,0.5227,0.431
lasso,Lasso Regression,0.772,0.961,0.9801,0.2753,0.3095,0.5227,0.433


LGBMRegressor(n_jobs=-1, random_state=42)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.3185,0.2279,0.4774,0.8261,0.1457,0.1852


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup   Latitude  \
20046  1.6812      25.0  4.192201   1.022284      1392.0  3.877437  36.060001   
3024   2.5313      30.0  5.039383   1.193493      1565.0  2.679795  35.139999   
15663  3.4801      52.0  3.977155   1.185877      1310.0  1.360332  37.799999   
20484  5.7376      17.0  6.163636   1.020202      1705.0  3.444444  34.279999   
9814   3.7250      34.0  5.492990   1.028037      1063.0  2.483645  36.619999   

        Longitude  MedHouseVal  prediction_label  
20046 -119.010002      0.47700          0.561588  
3024  -119.459999      0.45800          0.954734  
15663 -122.440002      5.00001          5.017591  
20484 -118.720001      2.18600          2.480319  
9814  -121.930000      2.78000          2.693478  
MSE: 0.2279
MAE: 0.3185
R2: 0.8261


In [None]:
from pycaret.regression import evaluate_model

evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
### y si quiero hacer solo modelos lineales?

# inicializamos el entorno de PyCaret para un problema de regresión
reg = setup(data=df_train, target='MedHouseVal', session_id=42, 
            imputation_type='iterative')


# para entrenar modelos, compararlos y obtener el mejor
best_model = compare_models(include=['lr', 'lasso', 'ridge', 'en', 'br'])
print(best_model)

# hacer predicciones sobre los datos de prueba
predictions = predict_model(best_model, data=df_test)
print(predictions.head())

# metricas en test
y_test = test_df['MedHouseVal']
y_pred = predictions['prediction_label']
print("MSE:", round(mean_squared_error(y_test, y_pred), 4))
print("MAE:", round(mean_absolute_error(y_test, y_pred), 4))
print("R2:", round(r2_score(y_test, y_pred), 4))

Unnamed: 0,Description,Value
0,Session id,42
1,Target,MedHouseVal
2,Target type,Regression
3,Original data shape,"(16512, 9)"
4,Transformed data shape,"(16512, 9)"
5,Transformed train set shape,"(11558, 9)"
6,Transformed test set shape,"(4954, 9)"
7,Numeric features,8
8,Rows with missing values,7.9%
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
en,Elastic Net,0.6815,0.7716,0.878,0.4182,0.2785,0.4543,0.462
lasso,Lasso Regression,0.772,0.961,0.9801,0.2753,0.3095,0.5227,0.467
lr,Linear Regression,0.5463,2.3509,1.0801,-0.7542,0.2276,0.3323,0.943
ridge,Ridge Regression,0.5463,2.3515,1.0802,-0.7547,0.2276,0.3323,0.508
br,Bayesian Ridge,0.5463,2.3529,1.0804,-0.7557,0.2276,0.3323,0.462


ElasticNet(random_state=42)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Elastic Net,0.6784,0.7689,0.8768,0.4133,0.2795,0.46


       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup   Latitude  \
20046  1.6812      25.0  4.192201   1.022284      1392.0  3.877437  36.060001   
3024   2.5313      30.0  5.039383   1.193493      1565.0  2.679795  35.139999   
15663  3.4801      52.0  3.977155   1.185877      1310.0  1.360332  37.799999   
20484  5.7376      17.0  6.163636   1.020202      1705.0  3.444444  34.279999   
9814   3.7250      34.0  5.492990   1.028037      1063.0  2.483645  36.619999   

        Longitude  MedHouseVal  prediction_label  
20046 -119.010002      0.47700          1.475659  
3024  -119.459999      0.45800          1.746441  
15663 -122.440002      5.00001          2.230165  
20484 -118.720001      2.18600          2.413796  
9814  -121.930000      2.78000          2.092212  
MSE: 0.7689
MAE: 0.6784
R2: 0.4133


In [None]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…