**Carregar os dados página 45**

In [1]:
import pandas as pd
import numpy as np

In [2]:
housing = pd.read_csv('CaliforniaHousing.csv')

In [3]:
from sklearn.model_selection import train_test_split

**Amostragem estratificada - Criar categorias de renda familiar**

In [4]:
housing['median_income_cat'] = pd.cut(housing['median_income'], 
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf], 
                               labels=[1, 2, 3, 4, 5])

In [5]:
strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, random_state=42,
                                                  stratify=housing['median_income_cat'])

In [6]:
#remover a coluna auxiliar median_income_cat
for set_ in (strat_train_set, strat_test_set, housing):
    set_.drop('median_income_cat', axis=1, inplace=True) # inplace=True modifica o dataframe

**1 - Copia**

In [7]:
housing_train = strat_train_set.copy()

**2 - Combinações de atributos página 60**

In [8]:
housing_train['rooms_per_household'] =  housing_train['total_rooms'] / housing_train['households']
housing_train['bedrooms_per_rooms'] = housing_train['total_bedrooms'] / housing_train['total_rooms']
housing_train['population_per_households'] = housing_train['population'] / housing_train['households']

**3 - separar preditores e valores-alvos**

In [9]:
housing_train_preds = housing_train.drop('median_house_value', axis=1)
housing_train_labels = strat_train_set['median_house_value'].copy()

**4 - Tratamento de valores ausentes com a classe Imputer**

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
imputer = SimpleImputer(strategy='median')

In [12]:
# a mediana só pode ser calculada para valores numéricos
housing_train_preds_si = housing_train_preds.drop('ocean_proximity', axis=1)

In [13]:
imputer.fit(housing_train_preds_si)

In [14]:
# substituir os valores ausentes pelas medianas
X = imputer.transform(housing_train_preds_si)

In [15]:
contaNull=np.sum(np.isnan(X))
contaNull

0

In [16]:
# criar dataframe com valores ausentes resolvidos
housing_train_preds_si = pd.DataFrame(X, 
                                      columns=housing_train_preds_si.columns, 
                                      index=housing_train_preds_si.index)

**5 - Manipulação de textos e atributos categóricos com a classe OneHotEncoder**

In [17]:
#housing_train_cat = housing_train_preds['ocean_proximity']   # shape (16512, )
housing_train_cat = housing_train_preds[['ocean_proximity']]  # shape (16512, 1)

In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
cat_encode = OneHotEncoder()

In [20]:
housing_train_cat_1hot = cat_encode.fit_transform(housing_train_cat)

**6 - Feature scaling**

In [21]:
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler

In [22]:
scaler = StandardScaler()

In [23]:
X = scaler.fit_transform(housing_train_preds_si)

In [24]:
# criar dataframe escalado
housing_train_preds_si_sc = pd.DataFrame(X,
                                 columns=housing_train_preds_si.columns,
                                 index=housing_train_preds_si.index)

In [25]:
housing_train_preds_si_sc_1hot = housing_train_preds_si_sc.copy()
housing_train_preds_si_sc_1hot[['<1H OCEAN','INLAND','ISLAND',
                                'NEAR BAY','NEAR OCEAN']]=housing_train_cat_1hot.toarray()

**<br><center>Regressão Linear**

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
lin_reg = LinearRegression()

In [28]:
lin_reg.fit(housing_train_preds_si_sc_1hot, housing_train_labels)

In [29]:
predicaoLin = lin_reg.predict(housing_train_preds_si_sc_1hot)

In [30]:
from sklearn.metrics import mean_squared_error

In [31]:
lin_mse = mean_squared_error(housing_train_labels, predicaoLin)

In [32]:
lin_rmse = np.sqrt(lin_mse)
lin_rmse   # margem de erro

67269.98114035041

**<br><center>Árvore de Decisão**

In [33]:
from sklearn.tree import DecisionTreeRegressor

In [34]:
tree_reg = DecisionTreeRegressor()

In [35]:
tree_reg.fit(housing_train_preds_si_sc_1hot,housing_train_labels)

In [36]:
predicaoTree = tree_reg.predict(housing_train_preds_si_sc_1hot)

In [37]:
tree_mse = mean_squared_error(housing_train_labels, predicaoTree)

In [38]:
tree_rmse = np.sqrt(tree_mse)
tree_rmse   # margem de erro

0.0

**<br><center><u>Validação Cruzada (cross-validation)</u>**<br>

In [39]:
from sklearn.model_selection import cross_val_score

**Árvore de Decisão**

In [40]:
tree_scores = cross_val_score(tree_reg,
                              housing_train_preds_si_sc_1hot,
                              housing_train_labels,
                              scoring='neg_mean_squared_error',
                              cv=10)

trees_rmse_scores = np.sqrt(-tree_scores)

# isto é para comparar depois
tree_rmse_cv=trees_rmse_scores.mean()

In [41]:
def exibir_scores(trees_rmse_scores):
    print('scores:', trees_rmse_scores)
    print('Mean:',trees_rmse_scores.mean())  # margem de erro
    print('Standard deviation:', trees_rmse_scores.std())

In [42]:
exibir_scores(trees_rmse_scores)

scores: [70376.88868044 66325.88146586 69813.02743356 71414.02896696
 69636.53582125 69736.80593604 70661.65554379 72520.58300885
 68593.08472515 70355.50682993]
Mean: 69943.3998411833
Standard deviation: 1572.9555856717868


**Regressão Linear**

In [43]:
lin_scores = cross_val_score(lin_reg,
                              housing_train_preds_si_sc_1hot,
                              housing_train_labels,
                              scoring='neg_mean_squared_error',
                              cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)

# isto é para comparar depois
lin_rmse_cv=lin_rmse_scores.mean()

In [44]:
exibir_scores(lin_rmse_scores)

scores: [68880.34546807 66064.16097839 65695.02939809 68284.78318352
 66289.22280614 68513.77150023 66687.79577218 69388.33071876
 66669.88720917 67516.12228663]
Mean: 67398.94493211733
Standard deviation: 1230.739591108956


<br>**<center>Random Forest**

In [45]:
from sklearn.ensemble import RandomForestRegressor

In [46]:
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_train_preds_si_sc_1hot, housing_train_labels)
predicaoForest = forest_reg.predict(housing_train_preds_si_sc_1hot)
forest_mse = mean_squared_error(housing_train_labels, predicaoForest)
forest_rmse = np.sqrt(forest_mse)

forest_rmse

18414.03797884497

**RandomForest**

In [47]:
import time
ti = time.time()

forest_scores = cross_val_score(forest_reg,
                              housing_train_preds_si_sc_1hot,
                              housing_train_labels,
                              scoring='neg_mean_squared_error',
                              cv=10)

forest_rmses = np.sqrt(-forest_scores)

tf = time.time()
print('tempo de execução:', (tf-ti)/60) # uns 4 minutos

# isto é para comparar depois
forest_rmse_cv=forest_rmses.mean()

tempo de execução: 5.558543578783671


**<br><center><u>Ajustando hiperparâmetros com GridSearchCV</u><center>**

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
param_grid = [{'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
              {'bootstrap':[False], 'n_estimators':[3,10], 'max_features':[2,3,4]}]

In [50]:
grid_search = GridSearchCV(forest_reg,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error')

In [51]:
grid_search.fit(housing_train_preds_si_sc_1hot, housing_train_labels)

**<center>Melhor Modelo**

In [52]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [53]:
cvR = grid_search.cv_results_
for ms, p in zip(cvR['mean_test_score'], cvR['params']):
    print(np.sqrt(-ms),p)

63686.447034075885 {'max_features': 2, 'n_estimators': 3}
54752.4099671934 {'max_features': 2, 'n_estimators': 10}
52428.42612666724 {'max_features': 2, 'n_estimators': 30}
60343.08602421445 {'max_features': 4, 'n_estimators': 3}
52837.762056436615 {'max_features': 4, 'n_estimators': 10}
49908.10205761277 {'max_features': 4, 'n_estimators': 30}
58461.97180221353 {'max_features': 6, 'n_estimators': 3}
51924.93732785389 {'max_features': 6, 'n_estimators': 10}
49361.940806240425 {'max_features': 6, 'n_estimators': 30}
57988.45172145904 {'max_features': 8, 'n_estimators': 3}
51663.53421983175 {'max_features': 8, 'n_estimators': 10}
49717.10138451645 {'max_features': 8, 'n_estimators': 30}
63075.91990161266 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53264.51359574238 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59767.786015205456 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52070.9378006246 {'bootstrap': False, 'max_features': 3, 'n_estimators': 

**<center><br>Treinar com o melhor modelo**

In [54]:
melhor_modelo = grid_search.best_estimator_
melhor_modelo.fit(housing_train_preds_si_sc_1hot, housing_train_labels)
melhor_predicao = melhor_modelo.predict(housing_train_preds_si_sc_1hot)
melhor_mse =  mean_squared_error(housing_train_labels, melhor_predicao)
melhor_rmse = np.sqrt(melhor_mse)
melhor_rmse

19172.261072786358

In [55]:
melhor_scores = cross_val_score(melhor_modelo,
                                housing_train_preds_si_sc_1hot,
                                housing_train_labels,
                                scoring='neg_mean_squared_error',
                                cv=10)
melhor_rmses = np.sqrt(-melhor_scores)

# isto é para comparar depois
melhor_train=np.mean(melhor_rmses)

In [56]:
exibir_scores(melhor_rmses)

scores: [49377.0108791  48431.05957536 47860.92547941 48869.33913421
 47994.35929775 49702.81086563 49880.80131113 50756.35419547
 49771.0570303  49996.79455141]
Mean: 49264.051231977806
Standard deviation: 896.5008962695773


In [57]:
melhor_train

49264.051231977806

Melhorou, mas não ficou bom. Que fazer?<br>
* Experimentar outros modelos como:<br>
* support vector machine - from sklearn import svm<br>
* redes neurais - from sklearn.neural_network import MLPRegressor<br><br>
* outras técnicas de ensemble como XGBoost: import xgboost as xgb

Também podemos:<br>
* remover outliers
* tomar o logaritmo de algumas features
* usar melhor o dados geográficos

**<center>Parou aqui em 20 de setembro**

**<center><br>Avaliar no conjunto de teste**

In [58]:
#housing_train_preds_si_sc_1hot

In [59]:
# 1 - fazer cópia do strat_test_set
housing_test = strat_test_set.copy()
housing_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
3905,-121.95,37.11,21.0,2387.0,357.0,913.0,341.0,7.736,397700.0,<1H OCEAN
16821,-118.01,33.89,36.0,1589.0,265.0,804.0,272.0,4.6354,202900.0,<1H OCEAN
2900,-118.18,33.74,30.0,5915.0,1750.0,2136.0,1503.0,4.0968,310000.0,NEAR OCEAN
7193,-122.48,37.74,52.0,2166.0,423.0,1072.0,370.0,4.131,314300.0,NEAR OCEAN
13928,-122.39,37.78,5.0,1405.0,515.0,725.0,392.0,3.6037,187500.0,NEAR BAY


In [60]:
# 2 - combinar features
housing_test['rooms_per_household'] =  housing_test['total_rooms'] / housing_test['households']
housing_test['bedrooms_per_rooms'] = housing_test['total_bedrooms'] / housing_test['total_rooms']
housing_test['population_per_households'] = housing_test['population'] / housing_test['households']
housing_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_rooms,population_per_households
3905,-121.95,37.11,21.0,2387.0,357.0,913.0,341.0,7.7360,397700.0,<1H OCEAN,7.000000,0.149560,2.677419
16821,-118.01,33.89,36.0,1589.0,265.0,804.0,272.0,4.6354,202900.0,<1H OCEAN,5.841912,0.166772,2.955882
2900,-118.18,33.74,30.0,5915.0,1750.0,2136.0,1503.0,4.0968,310000.0,NEAR OCEAN,3.935462,0.295858,1.421158
7193,-122.48,37.74,52.0,2166.0,423.0,1072.0,370.0,4.1310,314300.0,NEAR OCEAN,5.854054,0.195291,2.897297
13928,-122.39,37.78,5.0,1405.0,515.0,725.0,392.0,3.6037,187500.0,NEAR BAY,3.584184,0.366548,1.849490
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12369,-124.16,40.79,46.0,3042.0,597.0,1206.0,541.0,2.1135,90600.0,NEAR OCEAN,5.622921,0.196252,2.229205
8707,-119.01,35.39,29.0,1820.0,459.0,1134.0,419.0,1.8289,59400.0,INLAND,4.343675,0.252198,2.706444
16634,-123.01,38.67,33.0,914.0,147.0,394.0,132.0,4.6875,246200.0,<1H OCEAN,6.924242,0.160832,2.984848
9779,-122.03,37.60,24.0,2077.0,383.0,1488.0,389.0,4.5721,214700.0,NEAR BAY,5.339332,0.184401,3.825193


In [61]:
# 3 separar preditores e valores-alvos
housing_test_preds = housing_test.drop('median_house_value', axis=1)
housing_test_labels = housing_test['median_house_value']

In [62]:
housing_test_labels

3905     397700.0
16821    202900.0
2900     310000.0
7193     314300.0
13928    187500.0
           ...   
12369     90600.0
8707      59400.0
16634    246200.0
9779     214700.0
3455     163400.0
Name: median_house_value, Length: 4128, dtype: float64

In [63]:
housing_test_preds

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_rooms,population_per_households
3905,-121.95,37.11,21.0,2387.0,357.0,913.0,341.0,7.7360,<1H OCEAN,7.000000,0.149560,2.677419
16821,-118.01,33.89,36.0,1589.0,265.0,804.0,272.0,4.6354,<1H OCEAN,5.841912,0.166772,2.955882
2900,-118.18,33.74,30.0,5915.0,1750.0,2136.0,1503.0,4.0968,NEAR OCEAN,3.935462,0.295858,1.421158
7193,-122.48,37.74,52.0,2166.0,423.0,1072.0,370.0,4.1310,NEAR OCEAN,5.854054,0.195291,2.897297
13928,-122.39,37.78,5.0,1405.0,515.0,725.0,392.0,3.6037,NEAR BAY,3.584184,0.366548,1.849490
...,...,...,...,...,...,...,...,...,...,...,...,...
12369,-124.16,40.79,46.0,3042.0,597.0,1206.0,541.0,2.1135,NEAR OCEAN,5.622921,0.196252,2.229205
8707,-119.01,35.39,29.0,1820.0,459.0,1134.0,419.0,1.8289,INLAND,4.343675,0.252198,2.706444
16634,-123.01,38.67,33.0,914.0,147.0,394.0,132.0,4.6875,<1H OCEAN,6.924242,0.160832,2.984848
9779,-122.03,37.60,24.0,2077.0,383.0,1488.0,389.0,4.5721,NEAR BAY,5.339332,0.184401,3.825193


In [64]:
# 4 - remover valores nulos
housing_test_preds_si = housing_test_preds.drop('ocean_proximity', axis=1)
housing_test_preds_si.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4128 entries, 3905 to 3455
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   longitude                  4128 non-null   float64
 1   latitude                   4128 non-null   float64
 2   housing_median_age         4128 non-null   float64
 3   total_rooms                4128 non-null   float64
 4   total_bedrooms             4089 non-null   float64
 5   population                 4128 non-null   float64
 6   households                 4128 non-null   float64
 7   median_income              4128 non-null   float64
 8   rooms_per_household        4128 non-null   float64
 9   bedrooms_per_rooms         4089 non-null   float64
 10  population_per_households  4128 non-null   float64
dtypes: float64(11)
memory usage: 387.0 KB


In [65]:
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(housing_test_preds_si)
housing_test_preds_si.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4128 entries, 3905 to 3455
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   longitude                  4128 non-null   float64
 1   latitude                   4128 non-null   float64
 2   housing_median_age         4128 non-null   float64
 3   total_rooms                4128 non-null   float64
 4   total_bedrooms             4089 non-null   float64
 5   population                 4128 non-null   float64
 6   households                 4128 non-null   float64
 7   median_income              4128 non-null   float64
 8   rooms_per_household        4128 non-null   float64
 9   bedrooms_per_rooms         4089 non-null   float64
 10  population_per_households  4128 non-null   float64
dtypes: float64(11)
memory usage: 387.0 KB


In [66]:
X = imputer.fit_transform(housing_test_preds_si)
X

array([[-121.95      ,   37.11      ,   21.        , ...,    7.        ,
           0.14956012,    2.67741935],
       [-118.01      ,   33.89      ,   36.        , ...,    5.84191176,
           0.16677155,    2.95588235],
       [-118.18      ,   33.74      ,   30.        , ...,    3.93546241,
           0.29585799,    1.42115768],
       ...,
       [-123.01      ,   38.67      ,   33.        , ...,    6.92424242,
           0.16083151,    2.98484848],
       [-122.03      ,   37.6       ,   24.        , ...,    5.33933162,
           0.18440058,    3.8251928 ],
       [-117.12      ,   32.74      ,   52.        , ...,    4.64386792,
           0.19756221,    2.06839623]])

In [67]:
contaNull = np.sum(np.isnan(X))
contaNull

0

In [68]:
# montar um daframe com o X
housing_test_preds_si = pd.DataFrame(X,
                                     columns=housing_test_preds_si.columns,
                                     index=housing_test_preds_si.index)
housing_test_preds_si.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4128 entries, 3905 to 3455
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   longitude                  4128 non-null   float64
 1   latitude                   4128 non-null   float64
 2   housing_median_age         4128 non-null   float64
 3   total_rooms                4128 non-null   float64
 4   total_bedrooms             4128 non-null   float64
 5   population                 4128 non-null   float64
 6   households                 4128 non-null   float64
 7   median_income              4128 non-null   float64
 8   rooms_per_household        4128 non-null   float64
 9   bedrooms_per_rooms         4128 non-null   float64
 10  population_per_households  4128 non-null   float64
dtypes: float64(11)
memory usage: 387.0 KB


In [69]:
# 5 - OneHotEncoder
housing_test_cat = housing_test_preds[['ocean_proximity']]
type(housing_test_cat), housing_test_cat.shape

(pandas.core.frame.DataFrame, (4128, 1))

In [70]:
housing_test_cat_1hot = cat_encode.fit_transform(housing_test_cat)
housing_test_cat_1hot

<4128x5 sparse matrix of type '<class 'numpy.float64'>'
	with 4128 stored elements in Compressed Sparse Row format>

In [71]:
housing_test_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [72]:
# 6 - scaler
X = scaler.fit_transform(housing_test_preds_si)
X

array([[-1.18804209,  0.69962912, -0.62700415, ...,  0.79958178,
        -1.0742425 , -0.03227171],
       [ 0.76723335, -0.8005624 ,  0.56519904, ...,  0.22764638,
        -0.78692537, -0.0192847 ],
       [ 0.68286868, -0.8704471 ,  0.08831777, ..., -0.71387591,
         1.36796405, -0.09086148],
       ...,
       [-1.71408066,  1.42642999,  0.32675841, ...,  0.76216802,
        -0.88608482, -0.01793377],
       [-1.22774311,  0.92791914, -0.38856351, ..., -0.0205587 ,
        -0.49263731,  0.02125837],
       [ 1.20890725, -1.33634509,  1.83688246, ..., -0.36402157,
        -0.27292506, -0.06067545]])

In [73]:
# criar dataframe escalado
housing_test_preds_si_sc = pd.DataFrame(X,                                        
                                     columns=housing_test_preds_si.columns,
                                     index=housing_test_preds_si.index)

In [74]:
housing_test_preds_si_sc_1hot = housing_test_preds_si_sc.copy()

housing_test_preds_si_sc_1hot[['<1H OCEAN','INLAND',
                               'ISLAND','NEAR BAY',
                               'NEAR OCEAN']]=housing_test_cat_1hot.toarray()

housing_test_preds_si_sc_1hot

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_rooms,population_per_households,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
3905,-1.188042,0.699629,-0.627004,-0.108087,-0.428991,-0.402769,-0.412280,2.001178,0.799582,-1.074243,-0.032272,1.0,0.0,0.0,0.0,0.0
16821,0.767233,-0.800562,0.565199,-0.476372,-0.653675,-0.488459,-0.593776,0.395516,0.227646,-0.786925,-0.019285,1.0,0.0,0.0,0.0,0.0
2900,0.682869,-0.870447,0.088318,1.520117,2.973013,0.558698,2.644224,0.116599,-0.713876,1.367964,-0.090861,0.0,0.0,0.0,0.0,1.0
7193,-1.451061,0.993145,1.836882,-0.210081,-0.267805,-0.277770,-0.335999,0.134310,0.233643,-0.310842,-0.022017,0.0,0.0,0.0,0.0,1.0
13928,-1.406398,1.011781,-1.898688,-0.561289,-0.043122,-0.550566,-0.278130,-0.138755,-0.887359,2.548020,-0.070885,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12369,-2.284783,2.414134,1.360001,0.194201,0.157140,-0.172426,0.113797,-0.910463,0.119495,-0.294789,-0.053176,0.0,0.0,0.0,0.0,1.0
8707,0.270971,-0.101715,0.008838,-0.369763,-0.179885,-0.229029,-0.207110,-1.057845,-0.512275,0.639128,-0.030918,0.0,1.0,0.0,0.0,0.0
16634,-1.714081,1.426430,0.326758,-0.787890,-0.941856,-0.810783,-0.962030,0.422496,0.762168,-0.886085,-0.017934,1.0,0.0,0.0,0.0,0.0
9779,-1.227743,0.927919,-0.388564,-0.251155,-0.365494,0.049270,-0.286022,0.362736,-0.020559,-0.492637,0.021258,0.0,0.0,0.0,1.0,0.0


In [75]:
type(housing_test_preds_si_sc_1hot), housing_test_preds_si_sc_1hot.shape

(pandas.core.frame.DataFrame, (4128, 16))

**<center>Testar com o melhor_modelo**

In [76]:
pred = melhor_modelo.predict(housing_test_preds_si_sc_1hot)
pred_mse = mean_squared_error(housing_test_labels, pred)
pred_rmse = np.sqrt(pred_mse)
pred_rmse

59194.128292751564

**grid_search.best_estimator_**

In [77]:
melhor_scores = cross_val_score(melhor_modelo,
                                housing_test_preds_si_sc_1hot,
                                housing_test_labels,
                                scoring='neg_mean_squared_error',
                                cv=10)
melhor_rmse = np.sqrt(-melhor_scores)
# isto é para comparar depois
melhor_rmse_cv=melhor_rmses.mean()

In [78]:
exibir_scores(melhor_rmses)

scores: [49377.0108791  48431.05957536 47860.92547941 48869.33913421
 47994.35929775 49702.81086563 49880.80131113 50756.35419547
 49771.0570303  49996.79455141]
Mean: 49264.051231977806
Standard deviation: 896.5008962695773


In [79]:
d = {'Modelo':['LinearRgression','DecisionTree','RandomForest'], 
     'Resultado train':    [lin_rmse,    tree_rmse,    forest_rmse],
     'Resultado train CV': [lin_rmse_cv, tree_rmse_cv, forest_rmse_cv],
     'Resultado test': ['-', '-',                      pred_rmse],
     'Resultado test CV': ['-', '-',                   melhor_rmse_cv]}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Modelo,Resultado train,Resultado train CV,Resultado test,Resultado test CV
0,LinearRgression,67269.98114,67398.944932,-,-
1,DecisionTree,0.0,69943.399841,-,-
2,RandomForest,18414.037979,49677.691523,59194.128293,49264.051232
