# Sesión 4
Ejemplo de regresión lineal de Ridge y Lasso. Método de RandomForest.
Esta libreta también contiene ejemplos de cómo utilizar la validación cruzada.

In [2]:
import pandas as pd
from Funciones.modelo import estadisticas
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
import numpy as np

In [3]:
data_training = pd.read_excel("../data/datos_casas_limpios_training.xlsx")
data_testing = pd.read_excel("../data/datos_casas_limpios_test.xlsx")

In [4]:
data_training.shape

(1460, 81)

In [5]:
data_testing.shape

(1459, 81)

## Creación de variables Dummies
Antes de comenzar con el modelo, como en este caso se tienen variables cualitativas, se debe crear las respectivas variables dummies. Para ello se puede hacer uso de la
función get_dummies de pandas.

In [6]:
#Eliminando la variable ID y la variable target.
y = data_training["SalePrice"]
data_training.drop(["SalePrice", "Id"], inplace=True, axis=1)
data_training.shape

(1460, 79)

In [7]:
#Creación de las dummies
X = pd.get_dummies(data_training)
X.shape

(1460, 300)

In [8]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162,486,0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350,655,0,...,0,0,0,1,0,0,0,0,1,0


### Regresión de Ridge
Existe una clase dentro de sklearn que me permite hacer una regresión de Ridge.
En este caso, voy a hacer de una tubería para simplificar el proceso, ya que interesa primero estandarizar los datos.

In [9]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X,y,test_size=0.2, random_state=5)


In [10]:
alpha_ridge = [0,1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20,100]
for i in range(len(alpha_ridge)):
    print("*************Alpha = %.2e******************"%alpha_ridge[i])
    ridgereg = make_pipeline(StandardScaler(with_mean=True), Ridge(alpha=alpha_ridge[i]))
    ridgereg.fit(X_train,y_train)
    print("**********Estadísticas training Alpha = %.2e******************"%alpha_ridge[i])
    y_predicted = ridgereg.predict(X_train)
    estadisticas(y_predicted,y_train)

    print("*******Estadísticas testeo Alpha = %.2e******************"%alpha_ridge[i])
    y_predicted = ridgereg.predict(X_holdout)
    estadisticas(y_predicted,y_holdout)

*************Alpha = 0.00e+00******************
**********Estadísticas training Alpha = 0.00e+00******************
R-Square Value 0.8456825032683649


mean_absolute_error : 23049.336758391415


mean_squared_error :  966036000.0724629


root_mean_squared_error :  31081.119672117075
*******Estadísticas testeo Alpha = 0.00e+00******************
R-Square Value -1.3730293934531605e+28


mean_absolute_error : 9.76572224596477e+17


mean_squared_error :  8.910969823502469e+37


root_mean_squared_error :  9.439793336457355e+18
*************Alpha = 1.00e-15******************
**********Estadísticas training Alpha = 1.00e-15******************
R-Square Value 0.9379115042455075


mean_absolute_error : 12678.36171212934


mean_squared_error :  388677391.4787732


root_mean_squared_error :  19714.902776295225
*******Estadísticas testeo Alpha = 1.00e-15******************
R-Square Value -100.94197550113189


mean_absolute_error : 100553.56320039109


mean_squared_error :  661604093670.7037


root_mean_

### Tuning de Ridge
En este parte, utilizando validación cruzada, voy a encontrar el mejor valor de alpha.

In [11]:
alpha_range = 10.**np.arange(-2, 3)
ridgereg = make_pipeline(StandardScaler(with_mean=False), RidgeCV(alphas=alpha_range,scoring='neg_mean_squared_error', normalize=True))
ridgereg.fit(X_train,y_train)
print("**********Estadísticas training RidgeCV******************")
y_predicted = ridgereg.predict(X_train)
estadisticas(y_predicted,y_train)

print("*******Estadísticas testeo RidgeCV******************")
y_predicted = ridgereg.predict(X_holdout)
estadisticas(y_predicted,y_holdout)

**********Estadísticas training RidgeCV******************
R-Square Value 0.8909995833218559


mean_absolute_error : 16130.657926397123


mean_squared_error :  682348591.4698609


root_mean_squared_error :  26121.802990411303
*******Estadísticas testeo RidgeCV******************
R-Square Value 0.8563678688881379


mean_absolute_error : 17428.719473243276


mean_squared_error :  932173478.6785655


root_mean_squared_error :  30531.516154271892


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), _RidgeGCV())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




In [12]:
ridgereg.get_params()["ridgecv"].alpha_

1.0

### Tuning de Lasso
En este parte, utilizando validación cruzada, voy a encontrar el mejor valor de alpha para Lasso.

In [13]:
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
print('alpha : ',lassoregcv.alpha_)

print("**********Estadísticas training LassoCV******************")
y_predicted = lassoregcv.predict(X_train)
estadisticas(y_predicted,y_train)

print("*******Estadísticas testeo LassoCV******************")
y_predicted = lassoregcv.predict(X_holdout)
estadisticas(y_predicted,y_holdout)

  model = cd_fast.enet_coordinate_descent_gram(


alpha :  34.30055603142735
**********Estadísticas training LassoCV******************
R-Square Value 0.9132607156201434


mean_absolute_error : 14834.980229766095


mean_squared_error :  542992681.362533


root_mean_squared_error :  23302.20335853528
*******Estadísticas testeo LassoCV******************
R-Square Value 0.9014640187261089


mean_absolute_error : 15269.201135443536


mean_squared_error :  639499168.6613162


root_mean_squared_error :  25288.320795602784


  model = cd_fast.enet_coordinate_descent_gram(
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [14]:
lassoregcv.alpha_

34.30055603142735

In [15]:
lassoregcv.coef_

array([-3.87852548e+01,  0.00000000e+00,  3.48951146e-01,  9.50272652e+03,
        3.42130840e+03,  2.23494226e+02,  1.19667834e+02,  5.32499436e+00,
        1.46011156e+01,  0.00000000e+00, -0.00000000e+00,  1.32947443e+01,
        0.00000000e+00,  0.00000000e+00, -2.04544824e-01,  4.57751660e+01,
        2.22576359e+03, -1.81838980e+03,  2.50936517e+03,  0.00000000e+00,
       -0.00000000e+00, -1.64336154e+03,  8.20756550e+01,  4.23839391e+03,
        4.57906196e+01,  4.30878872e+03,  1.34368701e+01,  1.26046916e+01,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  3.13417919e+01,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -7.42458181e+03,  2.26352953e+03, -0.00000000e+00,  0.00000000e+00,
       -1.89360585e+03, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  2.81534507e+03,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  

In [22]:
from sklearn.ensemble import RandomForestRegressor
forest_model = RandomForestRegressor()
forest_model.fit(X_train, y_train)
y_predicted = forest_model.predict(X_holdout)
estadisticas(y_predicted, y_holdout)

R-Square Value 0.88059249218806


mean_absolute_error : 16670.411609589042


mean_squared_error :  774955513.6148887


root_mean_squared_error :  27838.022803620388


In [26]:
27838.02 / y_holdout.mean() * 100

15.241553527715613

In [21]:
RandomForestRegressor?

### Haciendo las predicciones
Con el conjunto de datos dados para testeo, se deben hacer las predicciones.

In [17]:
data_testing.drop(['SalePrice', 'Id'], axis=1, inplace=True)
X_test = pd.get_dummies(data_testing)

In [18]:
data_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1459 non-null   object 
 2   LotFrontage    1459 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1459 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil