In [6]:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Positional column indices read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these follow the column order of the numeric
# housing features -- confirm against housing.csv.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to a 2-D array.

    Two ratios are always appended, in this order: rooms / households and
    population / households.  When ``add_bedrooms_per_room`` is true, a third
    column (bedrooms / rooms) follows them.  The source columns are located
    through the module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix``
    and ``households_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from ``X``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` indexing (e.g. a 2-D NumPy array).
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        # np.c_ receives the same (X, col, col, ...) tuple as a literal
        # np.c_[X, col, col] subscript would.
        return np.c_[tuple([X] + derived)]




# Remote location of the dataset archive and the local directory it is
# unpacked into (relative to the current working directory).
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = "".join([DOWNLOAD_ROOT, "datasets/housing/housing.tgz"])
HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        Location of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        created (with parents) if it does not exist.

    The archive is downloaded on every call, replacing any ``housing.tgz``
    already present in ``housing_path``.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, reject members that would land
            # outside housing_path (absolute paths, "..", escaping links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no extraction filters.
            housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))



# Fetch the archive when the CSV is not on disk yet, so this cell also runs
# from a clean checkout instead of failing inside read_csv.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing = load_housing_data()
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# From here on `housing` holds the training features only (target removed).
housing = train_set.drop("median_house_value", axis=1)
housing_num = housing.drop("ocean_proximity", axis=1)
housing_labels = train_set["median_house_value"].copy()


# Standalone use of the transformer outside any pipeline.  `housing` still
# contains the ocean_proximity column at this point, and the result is not
# referenced again in the visible cells.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Numeric columns: fill missing values with the column median, append the
# ratio features, then standardise.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])
# Numeric-only result; not referenced again in the visible cells.
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Full preprocessing: the numeric pipeline on the numeric columns and one-hot
# encoding on the single categorical column.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrap disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest (same seed as the train/test split) so the selected
# parameters and every score printed below are reproducible across runs.
forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

print('Best Parameters: ', grid_search.best_params_)
cvres = grid_search.cv_results_
# Scores are negated MSE, so flip the sign before taking the root to get RMSE.
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)


Best Parameters:  {'max_features': 8, 'n_estimators': 30}
63467.324961700964 {'max_features': 2, 'n_estimators': 3}
54539.981728016224 {'max_features': 2, 'n_estimators': 10}
52683.33906858547 {'max_features': 2, 'n_estimators': 30}
59358.19799425676 {'max_features': 4, 'n_estimators': 3}
52271.45838211252 {'max_features': 4, 'n_estimators': 10}
50391.70821355501 {'max_features': 4, 'n_estimators': 30}
58524.37129023218 {'max_features': 6, 'n_estimators': 3}
51889.73782864724 {'max_features': 6, 'n_estimators': 10}
49787.88754867301 {'max_features': 6, 'n_estimators': 30}
58266.43007066911 {'max_features': 8, 'n_estimators': 3}
51903.51994488712 {'max_features': 8, 'n_estimators': 10}
49766.992373872185 {'max_features': 8, 'n_estimators': 30}
62635.17490518263 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54368.75469014809 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60175.36801146478 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52422.945310584

In [9]:
# Score the model selected by the grid search on the held-out test set.
final_model = grid_search.best_estimator_
y_test = test_set["median_house_value"].copy()
X_test = test_set.drop(columns="median_house_value")
# transform (not fit_transform): reuse the preprocessing already fitted.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
# Recorded output for this run is about 49,977.7; the exact value depends on
# the fitted forest.
final_rmse = np.sqrt(final_mse)
print(final_rmse)

49977.69072845478
