In [1]:
# basic imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# sklearn imports
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# info_apoio imports
from info_apoio import fetch_housing_data
from info_apoio import load_housing_data
from info_apoio import CombinedAttributesAdder
from info_apoio import data_preparation
from info_apoio import data_trasformation


# Carregando os dados, lendo e criando um repositório 

In [2]:
# Dataset locations: local storage folder, remote repository root,
# and the full URL of the compressed housing archive.
HOUSING_PATH = os.path.join("datasets", "housing")
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"


In [3]:
# Fetch the dataset archive "housing.tgz" from HOUSING_URL into HOUSING_PATH.
# NOTE(review): this step is expected to create the local directory as well;
# the download/extraction details live in info_apoio.fetch_housing_data — confirm there.
fetch_housing_data(housing_url=HOUSING_URL,
                   housing_path=HOUSING_PATH,
                   data="housing.tgz")

In [4]:
# Read "housing.csv" from HOUSING_PATH through the info_apoio helper.
# The result is used as a pandas DataFrame in the rest of the notebook.
housing = load_housing_data(housing_path=HOUSING_PATH,
                            data="housing.csv")

# Preview the first five rows.
housing.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# Bucket median_income into five ordered categories (1-5).
# Bin edges are right-inclusive: (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf).
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

# Preview the frame with the new column.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3


In [6]:
# Split the data into a train and a test set, passing 'income_cat' as the split column.
# NOTE(review): presumably a stratified split, given the variable names — confirm in info_apoio.
strat_train, strat_test = data_preparation(housing, 'income_cat')


# Preparando os dados que vão alimentar o modelo

In [7]:
# Separate the predictors from the target column for both splits.
train_features = strat_train.drop(columns=['median_house_value'])
test_features = strat_test.drop(columns=['median_house_value'])

# Build the preprocessing pipeline from the training features.
# It is bound to a new name so the imported factory `data_trasformation` is not
# overwritten; rebinding it made this cell fail when executed a second time.
# NOTE(review): assumes the returned object follows the scikit-learn
# transformer API (fit_transform / transform) — confirm in info_apoio.
preparation_pipeline = data_trasformation(train_features, 'ocean_proximity')

# train data: the pipeline is fitted on the training set only
X_train = preparation_pipeline.fit_transform(train_features)
y_train = strat_train['median_house_value']

# test data: reuse what was learned from the training set. Re-fitting on the
# test set would leak test information into the preprocessing and could
# produce a feature layout that differs from the one the model was trained on.
X_test = preparation_pipeline.transform(test_features)
y_test = strat_test['median_house_value']

# Melhorando os parâmetros

In [8]:
# optimizing parameters
# Two sub-grids are searched:
#   1) default bootstrap setting: 3 n_estimators x 4 max_features = 12 combinations
#   2) bootstrap disabled:        2 n_estimators x 4 max_features =  8 combinations
# With cv=5 this trains 20 x 5 = 100 models, plus one final refit.
parameter_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 4, 6, 8]}
    ]

# Fixed seed so the search (and therefore best_params_) is reproducible
# across kernel restarts; random forests are stochastic otherwise.
forest_model = RandomForestRegressor(random_state=42)

# Scores are negated MSE because GridSearchCV maximizes the scoring function.
grid_search = GridSearchCV(forest_model,
                           param_grid=parameter_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train, y_train)

# Show the winning hyperparameter combination.
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [10]:
# Build the final model from the best hyperparameters found by the grid search.
# best_params_ is a dict and must be unpacked into keyword arguments: passed
# positionally, the whole dict would be taken as the first parameter
# (n_estimators) and the model could not be fitted.
# The seed keeps the final model reproducible between runs.
modelo = RandomForestRegressor(**grid_search.best_params_, random_state=42)