In [1]:
import os
from zlib import crc32
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
%matplotlib inline

In [2]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join('datasets', 'housing')
HOUSING_URL = DOWNLOAD_ROOT + 'datasets/housing/housing.tgz'

## Functions

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, 'housing.tgz')
    urllib.request.urlretrieve(housing_url, tgz_path)
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)


def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)


class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    rooms_ix = 3
    bedrooms_ix = 4
    population_ix = 5
    household_ix = 6
    
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        attrs = []
        rooms_per_household = X[:, self.rooms_ix] / X[:, self.household_ix]
        attrs.append(rooms_per_household)
        
        population_per_household = X[:, self.population_ix] / X[:, self.household_ix]
        attrs.append(population_per_household)
        
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, self.bedrooms_ix] / X[:, self.rooms_ix]
            attrs.append(bedrooms_per_room)
        
        
        return np.c_[X, np.stack(attrs, axis=1)]
        

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

In [4]:
fetch_housing_data()

In [5]:
housing = load_housing_data()

In [6]:
housing['income_cat'] = np.ceil(housing['median_income'] / 1.5)
housing['income_cat'].where(housing['income_cat'] < 5, 5.0, inplace=True)

In [7]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [8]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop('income_cat', axis=1, inplace=True)

In [9]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [16]:
obj_columns = ['ocean_proximity']
num_columns = list(housing.drop(obj_columns, axis=1))

In [17]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_columns)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributeAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(obj_columns)),
    ('encoder', OneHotEncoder(sparse=False)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

In [18]:
housing_prepared = full_pipeline.fit_transform(housing)

In [19]:
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

In [14]:
housing_prepared.shape

(16512, 16)

In [82]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

In [67]:
def describe_model(model_reg):
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    
    housing_predictions = model_reg.predict(housing_prepared)
    model_mse = mean_squared_error(housing_labels, housing_predictions)
    model_rmse = np.sqrt(model_mse)
    
    scores = cross_val_score(model_reg, housing_prepared, housing_labels,
                             scoring='neg_mean_squared_error', cv=10)
    scores = np.sqrt(-scores)
    
    print("Predictions:", model_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))
    print("RMSE:", model_rmse)
    print('\nScores:', scores)
    print('Mean:', scores.mean())
    print('Std Deviation:', scores.std())

In [68]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [72]:
describe_model(lin_reg)

Predictions: [210644.60459286 317768.80697211 210956.43331178  59218.98886849
 189747.55849879]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
RMSE: 68628.19819848923

Scores: [66768.61385378 66969.88548843 70347.95244419 74751.17270878
 68031.13388938 71215.16959565 64960.68503917 68270.58754008
 71552.91566558 67665.10082067]
Mean: 69053.32170457012
Std Deviation: 2737.847700689222


In [69]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [74]:
describe_model(tree_reg)

Predictions: [286600. 340600. 196900.  46300. 254500.]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
RMSE: 0.0

Scores: [68915.71326557 66802.11952058 70384.00745758 68969.11784643
 72295.45020492 74906.76299092 70564.67800788 72124.62953629
 76812.5559304  70467.31103723]
Mean: 71224.23457977925
Std Deviation: 2800.5788416610667


In [80]:
forest_reg = RandomForestRegressor(n_estimators=10)
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [81]:
describe_model(forest_reg)

Predictions: [289250. 305800. 214110.  47980. 236090.]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]
RMSE: 22389.857112279242

Scores: [51367.42934946 49246.48859302 51456.39169976 53763.38689799
 52978.13502984 56106.60109498 51075.05366641 50040.67230767
 54776.79143633 53368.93329893]
Mean: 52417.98833743909
Std Deviation: 2043.0328436830605


## Para salvar um modelo a ser carregado depois
```py
joblib.dump(forest_reg, 'forest_reg.pkl')
```