In [27]:
import bayes_opt

In [28]:
import os
import tarfile
import urllib.request
import numpy as np

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

In [29]:
fetch_housing_data()

In [30]:
import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()

In [31]:
# housing["total_bedrooms"].fillna(housing["total_bedrooms"].median(),inplace=True)
housing = housing.reset_index()

housing["income_category"] = pd.cut(housing["median_income"], bins=[0., 1.5,3.0,4.5,6.,np.inf], labels=[1,2,3,4,5])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

train_index, test_index = split.split(housing, housing["income_category"]).__iter__().__next__()
train_set = housing.loc[train_index]
test_set = housing.loc[test_index]
train_set = train_set.drop(columns="income_category")
test_set = test_set.drop(columns="income_category")


In [32]:
train_set.head()

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12655,12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
15502,15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2908,2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
14053,14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
20496,20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [33]:
from sklearn.base import BaseEstimator, TransformerMixin

class AttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, index_dic):
        self.index_dic = index_dic
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rooms_per_households = X[:,self.index_dic["total_rooms"]] / X[:,self.index_dic["households"]]
        bedrooms_per_rooms = X[:,self.index_dic["total_bedrooms"]] / X[:,self.index_dic["total_rooms"]]
        population_per_households = X[:, self.index_dic["population"]] / X[:,self.index_dic["households"]]
        return np.c_[
            X,
            rooms_per_households,
            bedrooms_per_rooms,
            population_per_households
        ]

class columnDropperTransformer():
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


housing_num = train_set.drop("ocean_proximity", axis=1)
housing_num_attr = list(housing_num)
housing_num_index_dic = dict()
for attr in housing_num_attr:
    housing_num_index_dic[attr] = housing_num.columns.get_loc(attr)

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")), # 中央値で欠損値を埋める
    ("custom", AttributeAdder(index_dic=housing_num_index_dic)),
    ("scaler", StandardScaler()) # 標準化
])

full_pipeline = ColumnTransformer([
    ("drop", "drop", "median_house_value"),
    ("num", num_pipeline, list(housing_num)),
    ("onehot", OneHotEncoder(), ["ocean_proximity"]) # テキストラベルをワンホットエンコーディング
])


In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

pbounds = {'n_estimators': (1,200), 'max_features': (1,8)}

housing_labels = train_set["median_house_value"].copy()

def f(n_estimators, max_features):
    housing_prepared = full_pipeline.fit_transform(train_set)
    regressor = RandomForestRegressor(n_estimators=int(n_estimators), max_features=int(max_features),random_state=42, n_jobs=8)
    scores = cross_val_score(regressor, housing_prepared, housing_labels,
        scoring="neg_mean_squared_error", cv=5)
    return -np.sqrt(-scores).mean()


optimizer = bayes_opt.BayesianOptimization(
    f=f,pbounds=pbounds, random_state=42,
    verbose=5
)

optimizer.maximize(init_points=10, n_iter=5)

import optuna

study = optuna.create_study()

def objective(trial: optuna.Trial):
    n_estimators = trial.suggest_int("n_estimators", 1, 200)
    max_features = trial.suggest_int("max_features", 1,8)
    housing_prepared = full_pipeline.fit_transform(train_set)
    regressor = RandomForestRegressor(n_estimators=int(n_estimators), max_features=int(max_features),random_state=42, n_jobs=8)
    scores = cross_val_score(regressor, housing_prepared, housing_labels,
        scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-scores).mean()

study.optimize(objective, n_trials=15)

print(optimizer.max)
print(study.best_trial)

|   iter    |  target   | max_fe... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m-1.416e+0[0m | [0m3.622    [0m | [0m190.2    [0m |
| [95m2        [0m | [95m-4.715e+0[0m | [95m6.124    [0m | [95m120.1    [0m |
| [0m3        [0m | [0m-2.328e+0[0m | [0m2.092    [0m | [0m32.04    [0m |
| [0m4        [0m | [0m-3.062e+0[0m | [0m1.407    [0m | [0m173.4    [0m |
| [0m5        [0m | [0m-7.042e+0[0m | [0m5.208    [0m | [0m141.9    [0m |
| [0m6        [0m | [0m-3.066e+0[0m | [0m1.144    [0m | [0m194.0    [0m |
| [0m7        [0m | [0m-5.589e+0[0m | [0m6.827    [0m | [0m43.26    [0m |
| [0m8        [0m | [0m-2.286e+0[0m | [0m2.273    [0m | [0m37.5     [0m |
| [0m9        [0m | [0m-1.453e+0[0m | [0m3.13     [0m | [0m105.4    [0m |
| [0m10       [0m | [0m-1.056e+0[0m | [0m4.024    [0m | [0m58.95    [0m |
| [0m11       [0m | [0m-7.042e+0[0m | [0m5.272    [0m | [0m142.0    

[32m[I 2022-12-09 04:34:06,005][0m A new study created in memory with name: no-name-57128fb4-9776-4781-81de-969cf5562f66[0m


| [95m15       [0m | [95m-3.397e+0[0m | [95m7.932    [0m | [95m123.0    [0m |


[32m[I 2022-12-09 04:34:09,634][0m Trial 0 finished with value: 22287.186782614805 and parameters: {'n_estimators': 92, 'max_features': 2}. Best is trial 0 with value: 22287.186782614805.[0m
[32m[I 2022-12-09 04:34:15,355][0m Trial 1 finished with value: 10375.263962674544 and parameters: {'n_estimators': 101, 'max_features': 4}. Best is trial 1 with value: 10375.263962674544.[0m
[32m[I 2022-12-09 04:34:22,623][0m Trial 2 finished with value: 3466.668775708668 and parameters: {'n_estimators': 94, 'max_features': 7}. Best is trial 2 with value: 3466.668775708668.[0m
[32m[I 2022-12-09 04:34:28,197][0m Trial 3 finished with value: 30730.416683160314 and parameters: {'n_estimators': 184, 'max_features': 1}. Best is trial 2 with value: 3466.668775708668.[0m
[32m[I 2022-12-09 04:34:32,133][0m Trial 4 finished with value: 14500.956098643772 and parameters: {'n_estimators': 86, 'max_features': 3}. Best is trial 2 with value: 3466.668775708668.[0m
[32m[I 2022-12-09 04:34:40,800]

{'target': -3397.2358360523285, 'params': {'max_features': 7.93159321933638, 'n_estimators': 123.03186665497277}}
FrozenTrial(number=5, values=[2575.9028843844185], datetime_start=datetime.datetime(2022, 12, 9, 4, 34, 32, 135238), datetime_complete=datetime.datetime(2022, 12, 9, 4, 34, 40, 799746), params={'n_estimators': 108, 'max_features': 8}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=1, step=1), 'max_features': IntDistribution(high=8, log=False, low=1, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=5, state=TrialState.COMPLETE, value=None)


In [36]:
regressor = RandomForestRegressor(n_estimators=int(optimizer.max["params"]["n_estimators"]), max_features=int(optimizer.max["params"]["max_features"]),random_state=42, n_jobs=8)
housing_prepared = full_pipeline.fit_transform(train_set)
housing_labels = train_set["median_house_value"].copy()
regressor.fit(housing_prepared, housing_labels)
housing_prepared = full_pipeline.fit_transform(train_set)
housing_labels = train_set["median_house_value"].copy()
regressor2 = RandomForestRegressor(n_estimators=study.best_trial.params["n_estimators"], max_features=study.best_trial.params["max_features"], random_state=42, n_jobs=8)
regressor2.fit(housing_prepared, housing_labels)

RandomForestRegressor(max_features=8, n_estimators=108, n_jobs=8,
                      random_state=42)

In [37]:
N=10
housing_prepared = full_pipeline.transform(test_set)
housing_labels = test_set["median_house_value"].copy()
some_data = housing_prepared
some_label = housing_labels
predict = regressor.predict(some_data)
some_data = housing_prepared
some_label = housing_labels
predict2 = regressor2.predict(some_data)
diff = predict - some_label
diff2 = predict2 - some_label
table = np.stack([some_label, predict,predict2, diff, diff2],axis=-1)
test_set[:N]

Unnamed: 0,index,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5241,5241,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,500001.0,<1H OCEAN
17352,17352,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,162500.0,<1H OCEAN
3505,3505,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,204600.0,<1H OCEAN
7777,7777,-118.1,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,159700.0,<1H OCEAN
14155,14155,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,184000.0,NEAR OCEAN
7057,7057,-118.04,33.93,35.0,1805.0,387.0,1505.0,366.0,4.1667,151900.0,<1H OCEAN
33,33,-122.27,37.83,49.0,1655.0,366.0,754.0,329.0,1.375,104900.0,NEAR BAY
17049,17049,-122.26,37.46,26.0,5067.0,750.0,1996.0,728.0,7.0001,500001.0,NEAR OCEAN
18164,18164,-122.03,37.36,16.0,2697.0,803.0,1369.0,723.0,4.4699,367400.0,<1H OCEAN
10444,10444,-117.61,33.43,24.0,2303.0,399.0,851.0,379.0,3.9875,346500.0,NEAR OCEAN


In [39]:
np.set_printoptions(suppress=True)
table[:10]

array([[500001.        , 498872.52030827, 499478.75      ,
         -1128.47969173,   -522.25      ],
       [162500.        , 164487.80487805, 162234.25925926,
          1987.80487805,   -265.74074074],
       [204600.        , 205108.94308943, 205200.92592593,
           508.94308943,    600.92592593],
       [159700.        , 160517.07317073, 160067.59259259,
           817.07317073,    367.59259259],
       [184000.        , 185662.60162602, 185683.33333333,
          1662.60162602,   1683.33333333],
       [151900.        , 152452.03252033, 152503.7037037 ,
           552.03252033,    603.7037037 ],
       [104900.        , 105243.08943089, 105432.40740741,
           343.08943089,    532.40740741],
       [500001.        , 494987.89429201, 495722.19444444,
         -5013.10570799,  -4278.80555556],
       [367400.        , 366832.53658537, 366110.19444444,
          -567.46341463,  -1289.80555556],
       [346500.        , 348007.32520325, 345337.03703704,
          1507.32520325