In [1]:
import os
import tarfile
import urllib

In [2]:
HOUSING_PATH="./datasets/housing/"

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Load the housing dataset into a DataFrame.

    Reads the file ``housing.csv`` found inside *housing_path* (which
    defaults to ``HOUSING_PATH``) with ``pandas.read_csv`` and returns
    the resulting DataFrame.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
housing = load_housing_data()

In [5]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
import matplotlib.pyplot as plt

In [7]:
import numpy as np

In [8]:
def split_train_test(data, test_ratio):
    """Randomly split *data* into a train part and a test part.

    The row positions are shuffled with ``np.random.permutation`` (global
    NumPy random state), and the first ``int(len(data) * test_ratio)``
    shuffled positions become the test set.

    Returns:
        A ``(train, test)`` tuple obtained with ``data.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    boundary = int(n_rows * test_ratio)
    train_rows = order[boundary:]
    test_rows = order[:boundary]
    return data.iloc[train_rows], data.iloc[test_rows]

In [9]:
train_set,test_set = split_train_test(housing,0.2)

In [10]:
from zlib import crc32

In [11]:
def test_set_check(identifier, test_ratio):
    """Return True when the instance with *identifier* belongs to the test set.

    The identifier is converted to ``np.int64`` and hashed with ``crc32``.
    The instance is assigned to the test set when its unsigned 32-bit hash
    is below ``test_ratio * 2**32``, so the decision depends only on the
    identifier and stays stable across runs and dataset refreshes.

    Args:
        identifier: Integer-like unique identifier of the instance.
        test_ratio: Fraction (0..1) of the hash space mapped to the test set.
    """
    # Mask with the full 32 bits (eight hex digits). A shorter mask caps the
    # hash at 2**28 - 1, which is below 0.2 * 2**32 and would put *every*
    # instance into the test set.
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32


# The ``test_`` prefix is coincidental: tell pytest-style collectors that this
# helper is not a test case.
test_set_check.__test__ = False

In [12]:
def split_train_test_by_id(data, test_ratio, id_column):
    """Split *data* into train/test parts based on a hash of each row's id.

    Every value of ``data[id_column]`` is passed to ``test_set_check``
    together with *test_ratio*; rows for which it returns True form the
    test set, all other rows form the train set.

    Returns:
        A ``(train, test)`` tuple selected with ``data.loc``.
    """
    def goes_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    test_mask = data[id_column].apply(goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [13]:
# reset_index() moves the current row index into a new "index" column, which
# then serves as the identifier for the hash-based split.
housing_with_id = housing.reset_index()
# Hash-based split on the "index" column with a 20% test ratio.
train_set,test_set = split_train_test_by_id(housing_with_id, 0.2,"index")

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_set, test_set = train_test_split(housing, test_size=0.2,random_state=42)

In [16]:
# Bucket median_income into five categories labelled 1-5, with bin edges at
# 0, 1.5, 3.0, 4.5, 6.0 and +inf; the column is used as the stratification key.
housing["income_cat"]= pd.cut(housing["median_income"], bins=[0.,1.5,3.0,4.5,6.0,np.inf],labels=[1,2,3,4,5])

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit

In [18]:
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)

In [19]:
# StratifiedShuffleSplit.split() yields *positional* row indices, so the rows
# are selected with .iloc. Label-based .loc only gives the same rows while the
# DataFrame still has its default 0..n-1 index.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [20]:
from pandas.plotting import scatter_matrix

In [21]:
from pylab import *

 ### 210121 기록
 
 # 2.5 머신러닝 알고리즘을 위한 데이터 준비

## 2.5.1 데이터 정제

대부분의 머신러닝 알고리즘은 누락된 특성을 다루지 못하므로 이를 처리할 수 있는 함수를 몇 개 만들겠습니다. 앞서 total_bedrooms 특성에 값이 없는 경우를 보았는데 이를 고쳐보겠습니다. 방법은 3가지입니다.

- 해당 구역을 제거합니다

- 전체 특성을 삭제합니다.

- 어떤 값으로 채웁니다(0, 평균, 중간값 등).

데이터프레임의 dropna(),drop(),fillna() 메서드를 이용해 이런 작업을 간단하게 처리할 수 있습니다.

In [22]:
# Re-create the splitter with the same settings as the earlier cell (one
# split, 20% test, seed 42) and redo the stratified split on income_cat.
# NOTE(review): presumably re-created because the wildcard `from pylab import *`
# above may rebind the name `split` — confirm.
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing, housing["income_cat"]):
    # NOTE(review): these results are named start_* while the earlier split
    # used strat_*; later cells reference both spellings.
    start_train_set = housing.loc[train_index]
    start_test_set=housing.loc[test_index]

In [23]:
housing = start_train_set.drop("median_house_value",axis=1)

In [24]:
housing_labels = start_train_set["median_house_value"].copy()

In [25]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,INLAND,4
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,INLAND,2
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,INLAND,3
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,<1H OCEAN,3


In [26]:
housing.dropna(subset=["total_bedrooms"])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,INLAND,4
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,INLAND,2
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,INLAND,3
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,<1H OCEAN,3


In [27]:
housing.drop("total_bedrooms",axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.20,32.77,31.0,1952.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,4459.0,1463.0,3.0347,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,573.0,210.0,4.9312,INLAND,4
12053,-117.56,33.88,40.0,1196.0,1052.0,258.0,2.0682,INLAND,2
13908,-116.40,34.09,9.0,4855.0,2098.0,765.0,3.2723,INLAND,3
11159,-118.01,33.82,31.0,1960.0,1356.0,356.0,4.0625,<1H OCEAN,3


In [28]:
median = housing["total_bedrooms"].median()

In [29]:
median

433.0

In [30]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on an intermediate object
# and is deprecated / has no effect on `housing` under pandas Copy-on-Write.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [31]:
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,INLAND,4
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,INLAND,2
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,INLAND,3
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,<1H OCEAN,3


In [32]:
from sklearn.impute import SimpleImputer

In [33]:
imputer = SimpleImputer(strategy="median")

In [34]:
imputer

SimpleImputer(strategy='median')

In [35]:
housing_num = housing.drop("ocean_proximity",axis=1)

In [36]:
housing_num

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,5
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,3
...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,4
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,2
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,3
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,3


In [37]:
imputer.fit(housing_num)

SimpleImputer(strategy='median')

In [38]:
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409,    3.    ])

In [39]:
housing_num.median().values

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409])

In [40]:
X = imputer.transform(housing_num)

In [41]:
housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)

In [42]:
housing_tr

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,2.0
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,5.0
14650,-117.20,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,2.0
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,2.0
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,3.0
...,...,...,...,...,...,...,...,...,...
6563,-118.13,34.20,46.0,1271.0,236.0,573.0,210.0,4.9312,4.0
12053,-117.56,33.88,40.0,1196.0,294.0,1052.0,258.0,2.0682,2.0
13908,-116.40,34.09,9.0,4855.0,872.0,2098.0,765.0,3.2723,3.0
11159,-118.01,33.82,31.0,1960.0,380.0,1356.0,356.0,4.0625,3.0


## 2.5.2 텍스트와 범주형 특성 다루기 

In [43]:
housing_cat=housing[["ocean_proximity"]]

In [44]:
housing_cat.head(10)

Unnamed: 0,ocean_proximity
17606,<1H OCEAN
18632,<1H OCEAN
14650,NEAR OCEAN
3230,INLAND
3555,<1H OCEAN
19480,INLAND
8879,<1H OCEAN
13685,INLAND
4937,<1H OCEAN
4861,<1H OCEAN


In [45]:
from sklearn.preprocessing import OrdinalEncoder

In [46]:
ordinal_encoder = OrdinalEncoder()

In [47]:
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [48]:
housing_cat_encoded[:10]

array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [49]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [50]:
from sklearn.preprocessing import OneHotEncoder

In [51]:
cat_encoder = OneHotEncoder()

In [52]:
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

In [53]:
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

In [54]:
housing_cat_1hot.toarray()

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [55]:
cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

## 2.5.3 나만의 변환기 

조합 특성을 추가하는 간단한 변환기 

변환기가 add_bedrooms_per_room 하이퍼파라미터 하나를 가지며 기본값을 True로 지정합니다. 합리적인 기본값을 주는 것이 좋습니다. 
데이터 준비 단계를 자동화할수록 더 많은 조합을 자동으로 시도해볼 수 있고 최상의 조합을 찾을 가능성을 매우 높여줍니다. 그리고 시간도 많이 절약됩니다.

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions (in the raw feature array) of the attributes that the
# ratio features below are derived from.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D feature array.

    ``transform`` always adds rooms-per-household and
    population-per-household columns; when ``add_bedrooms_per_room`` is
    true it additionally adds a bedrooms-per-room column. The source
    columns are located through the module-level ``*_ix`` indices.
    """

    def __init__(self, add_bedrooms_per_room=True):  # explicit keyword, not *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing else to do: this transformer has no fitted state.
        return self

    def transform(self, X):
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Column-wise concatenation of the input and the derived columns.
        return np.c_[(X, *extra_columns)]
# Apply the adder with the bedrooms_per_room column disabled to the raw values.
# fit() is not called first; transform() does not use any fitted state.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [57]:
housing_extra_attribs

array([[-121.89, 37.29, 38.0, ..., 2, 4.625368731563422,
        2.094395280235988],
       [-121.93, 37.05, 14.0, ..., 5, 6.008849557522124,
        2.7079646017699117],
       [-117.2, 32.77, 31.0, ..., 2, 4.225108225108225,
        2.0259740259740258],
       ...,
       [-116.4, 34.09, 9.0, ..., 3, 6.34640522875817, 2.742483660130719],
       [-118.01, 33.82, 31.0, ..., 3, 5.50561797752809,
        3.808988764044944],
       [-122.45, 37.77, 52.0, ..., 3, 4.843505477308295,
        1.9859154929577465]], dtype=object)

## 2.5.4 특성 스케일링(feature scaling) 

모든 특성의 범위를 같도록 만들어주는 방법으로 min-max 스케일링과 표준화(standardization)가 널리 사용됩니다.

min-max 스케일링이 가장 간단합니다. 많은 사람들이 이를 정규화(normalization)라고 부릅니다. 0~1 범위에 들도록 값을 이동하고 스케일을 조정하면 됩니다. 데이터에서 최솟값을 뺀 후 최댓값과 최솟값의 차이로 나누면 이렇게 할 수 있습니다. 

표준화는 먼저 평균을 뺀 후(그래서 표준화를 하면 항상 평균이 0이 됩니다) 표준편차로 나누어 결과 분포의 분산이 1이 되도록 합니다. 표준화는 범위의 상한과 하한이 없어 어떤 알고리즘에서는 문제가 될 수 있습니다(예를 들어 신경망은 종종 입력값의 범위로 0에서 1 사이를 기대합니다). 그러나 표준화는 이상치에 영향을 덜 받습니다.

**주의**: 모든 변환기에서 스케일링은 (테스트 세트가 포함된) 전체 데이터가 아니고 훈련 데이터에 대해서만 fit() 메서드를 적용해야 합니다. 그런 다음 훈련 세트와 테스트 세트(그리고 새로운 데이터)에 대해 transform() 메서드를 사용합니다.

## 2.5.5 변환 파이프라인

In [58]:
from sklearn.pipeline import Pipeline

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
# Numeric preprocessing chain: impute -> add ratio features -> scale.
num_pipeline=Pipeline([
    # Replace missing values using the "median" strategy.
    ('imputer', SimpleImputer(strategy="median")),
    # Append the ratio columns (default add_bedrooms_per_room=True).
    ('attribs_adder',CombinedAttributesAdder()),
    # Standardize the resulting columns.
    ('std_scaler',StandardScaler())
])

In [61]:
housing_num_tr = num_pipeline.fit_transform(housing_num)

In [62]:
housing_num_tr

array([[-1.15604281,  0.77194962,  0.74333089, ..., -0.31205452,
        -0.08649871,  0.15531753],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.21768338,
        -0.03353391, -0.83628902],
       [ 1.18684903, -1.34218285,  0.18664186, ..., -0.46531516,
        -0.09240499,  0.4222004 ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.3469342 ,
        -0.03055414, -0.52177644],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.02499488,
         0.06150916, -0.30340741],
       [-1.43579109,  0.99645926,  1.85670895, ..., -0.22852947,
        -0.09586294,  0.10180567]])

In [63]:
from sklearn.compose import ColumnTransformer

In [64]:
# Column names of the numeric frame.
# NOTE(review): housing_num still contains income_cat, so that column is fed
# through the numeric pipeline as a feature — confirm this is intended.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Route the numeric columns through num_pipeline and one-hot encode the
# categorical column; the outputs are combined into a single feature matrix.
full_pipeline = ColumnTransformer([
    ("num",num_pipeline,num_attribs),
    ("cat",OneHotEncoder(),cat_attribs),
])

In [65]:
housing_prepared = full_pipeline.fit_transform(housing)

In [66]:
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

# 2.6 모델 선택과 훈련

## 2.6.1 훈련 세트에서 훈련하고 평가하기 

In [67]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the prepared training features and labels.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

LinearRegression()

In [68]:
some_data = housing.iloc[:5]

In [69]:
some_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,income_cat
17606,-121.89,37.29,38.0,1568.0,351.0,710.0,339.0,2.7042,<1H OCEAN,2
18632,-121.93,37.05,14.0,679.0,108.0,306.0,113.0,6.4214,<1H OCEAN,5
14650,-117.2,32.77,31.0,1952.0,471.0,936.0,462.0,2.8621,NEAR OCEAN,2
3230,-119.61,36.31,25.0,1847.0,371.0,1460.0,353.0,1.8839,INLAND,2
3555,-118.59,34.23,17.0,6592.0,1525.0,4459.0,1463.0,3.0347,<1H OCEAN,3


In [70]:
some_labels= housing_labels.iloc[:5]

In [71]:
some_labels

17606    286600.0
18632    340600.0
14650    196900.0
3230      46300.0
3555     254500.0
Name: median_house_value, dtype: float64

In [72]:
some_data_prepared = full_pipeline.transform(some_data)

In [75]:
print("예측 : ", lin_reg.predict(some_data_prepared))

예측 :  [203682.37379543 326371.39370781 204218.64588245  58685.4770482
 194213.06443039]


In [76]:
print("레이블 : ", list(some_labels))

레이블 :  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [77]:
from sklearn.metrics import mean_squared_error

In [78]:
housing_predictions = lin_reg.predict(housing_prepared)

In [79]:
lin_mse = mean_squared_error(housing_labels,housing_predictions)

In [80]:
lin_rmse = np.sqrt(lin_mse)

In [81]:
lin_rmse

68376.64295459937

대부분 구역의 중간 주택 가격은 120,000에서 265,000 사이입니다. 그러므로 예측 오차가 68,376인 것은 매우 만족스럽지 못합니다. 이는 모델이 훈련 데이터에 과소적합된 사례입니다. 이런 상황은 특성들이 좋은 예측을 만들 만큼 충분한 정보를 제공하지 못했거나 모델이 충분히 강력하지 못하다는 사실을 말해줍니다.

In [82]:
from sklearn.tree import DecisionTreeRegressor

tree_reg= DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)

DecisionTreeRegressor()

In [83]:
housing_predictions= tree_reg.predict(housing_prepared)

In [84]:
tree_mse = mean_squared_error(housing_labels,housing_predictions)

In [85]:
tree_rmse = np.sqrt(tree_mse)

In [86]:
tree_rmse

0.0

## 2.6.2 교차 검증을 사용한 평가

In [87]:
from sklearn.model_selection import cross_val_score

In [88]:
scores = cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)

In [89]:
tree_rmse_scores = np.sqrt(-scores)

In [90]:
def display_scores(scores):
    """Print the individual scores, their mean and their standard deviation.

    *scores* must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned; the three lines go to stdout.
    """
    summary = (
        ("점수 : ", scores),
        ("평균 : ", scores.mean()),
        ("표준편차 : ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [91]:
display_scores(tree_rmse_scores)

점수 :  [68474.92235883 68101.97342892 71373.91117913 69696.09359342
 71828.65217793 74923.10416457 70686.72842084 69640.96717235
 76270.00045413 69482.671658  ]
평균 :  71047.90246081406
표준편차 :  2543.409527406959


In [92]:
lin_scores = cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)

In [93]:
lin_rmse_scores = np.sqrt(-lin_scores)

In [94]:
display_scores(lin_rmse_scores)

점수 :  [66877.52325028 66608.120256   70575.91118868 74179.94799352
 67683.32205678 71103.16843468 64782.65896552 67711.29940352
 71080.40484136 67687.6384546 ]
평균 :  68828.99948449331
표준편차 :  2662.761570610344


In [95]:
from sklearn.ensemble import RandomForestRegressor

In [96]:
# Fit a random forest with default hyperparameters on the prepared training data.
# NOTE(review): no random_state is passed, so scores will presumably differ
# between runs — confirm whether reproducibility matters here.
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor()

In [100]:
forest_scores = cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

In [101]:
display_scores(forest_rmse_scores)

점수 :  [49550.4023039  47778.10463727 49786.04651115 52207.81989116
 49663.46682213 53616.69420562 48858.57179745 48201.60008478
 52731.86787344 50353.31673644]
평균 :  50274.789086334946
표준편차 :  1860.7223969841566


**Tip**

실험한 모델을 모두 저장해두면 필요할 때 쉽게 모델을 복원할 수 있습니다. 교차 검증 점수와 실제 예측값은 물론 하이퍼파라미터와 훈련된 모델 파라미터 모두 저장해야 합니다. 이렇게 하면 여러 모델의 점수와 모델이 만든 오차를 쉽게 비교할 수 있습니다. 파이썬의 pickle 패키지나 큰 넘파이 배열을 저장하는 데 아주 효율적인 joblib(pip를 사용해 이 라이브러리를 설치할 수 있습니다)을 사용해서 사이킷런 모델을 간단하게 저장할 수 있습니다.

In [None]:
import joblib

In [None]:
# Persist a model to disk.
# NOTE(review): `my_model` is not defined anywhere in the visible notebook;
# it looks like a placeholder for a trained estimator — substitute a real one.
joblib.dump(my_model,"my_model.pkl")
# Load it back later.
my_model_load = joblib.load("my_model.pkl")

# 2.7 모델 세부 튜닝

## 2.7.1 그리드 탐색

In [102]:
from sklearn.model_selection import GridSearchCV

In [103]:
# Two hyperparameter grids to search.
param_grid =[
    # First grid: 3 n_estimators values x 4 max_features values = 12 combinations.
    {'n_estimators':[3,10,30],'max_features' : [2,4,6,8]},
    # Second grid, with bootstrap disabled: 2 x 3 = 6 combinations.
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},
]

In [104]:
forest_reg = RandomForestRegressor()

In [105]:
# Grid search over param_grid with 10-fold cross-validation, scored by negative
# MSE; training scores are recorded as well (return_train_score=True).
grid_search = GridSearchCV(forest_reg,param_grid, cv=10,scoring='neg_mean_squared_error',return_train_score=True)

In [106]:
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [107]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [108]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [109]:
cvres = grid_search.cv_results_

In [110]:
# Print one line per candidate: the RMSE derived from its mean test score,
# followed by its parameters.
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    # Scores are negative MSE, so negate before taking the square root.
    print(np.sqrt(-mean_score),params)

64177.43423563418 {'max_features': 2, 'n_estimators': 3}
55576.628397844426 {'max_features': 2, 'n_estimators': 10}
53049.1257422145 {'max_features': 2, 'n_estimators': 30}
60413.66696954773 {'max_features': 4, 'n_estimators': 3}
53288.51990330492 {'max_features': 4, 'n_estimators': 10}
50981.160506836466 {'max_features': 4, 'n_estimators': 30}
59444.37678341784 {'max_features': 6, 'n_estimators': 3}
52928.272368573605 {'max_features': 6, 'n_estimators': 10}
50847.236480117535 {'max_features': 6, 'n_estimators': 30}
59835.06378230203 {'max_features': 8, 'n_estimators': 3}
52779.71721721805 {'max_features': 8, 'n_estimators': 10}
50457.22828967345 {'max_features': 8, 'n_estimators': 30}
61839.569285578145 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54473.60867680245 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60186.70156004209 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53291.47206014224 {'bootstrap': False, 'max_features': 3, 'n_estimators'

## 2.7.2 랜덤 탐색

그리드 탐색 방법은 이전 예제와 같이 비교적 적은 수의 조합을 탐구할 때 괜찮습니다. 하지만 하이퍼파라미터 탐색 공간이 커지면 RandomizedSearchCV를 사용하는 편이 더 좋습니다. RandomizedSearchCV는 GridSearchCV와 거의 같은 방식으로 사용하지만 가능한 모든 조합을 시도하는 대신 각 반복마다 하이퍼파라미터에 임의의 수를 대입하여 지정한 횟수만큼 평가합니다.

- 랜덤 탐색을 1,000회 반복하도록 실행하면 하이퍼파라미터마다 각기 다른 1,000개의 값을 탐색합니다.(그리드 탐색에서는 하이퍼파라미터마다 몇 개의 값만 탐색합니다).

- 단순히 반복 횟수를 조절하는 것만으로도 하이퍼 파라미터 탐색에 투입할 컴퓨팅 자원을 제어할 수 있습니다. 

## 2.7.3 앙상블 방법 

모델을 세밀하게 튜닝하는 또 다른 방법은 최상의 모델을 연결해보는 것입니다. 모델의 그룹이 최상의 단일 모델보다 더 나은 성능을 발휘할 때가 많습니다.

## 2.7.4 최상의 모델과 오차 분석

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importances

In [None]:
extra_attribs=["rooms_per_hhold","pop_per_hhold","bedrooms_per_room"]

In [None]:
cat_encoder = full_pipeline.named_transformers_["cat"]

In [None]:
cat_one_hot_attribs = list(cat_encoder.categories_[0])

In [None]:
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

In [None]:
# Pair each importance value with its attribute name and list the pairs in
# descending order of importance.
sorted(zip(feature_importances,attributes),reverse=True)

## 2.7.5 테스트 세트로 시스템 평가하기

In [None]:
final_model = grid_search.best_estimator_

In [None]:
final_model

In [None]:
# Separate the predictors from the target on the held-out stratified test set.
X_test = strat_test_set.drop("median_house_value", axis=1)
# .copy() must be *called*: without the parentheses y_test would be the bound
# method object rather than the label Series, and the metric computation on
# y_test would fail.
y_test = strat_test_set["median_house_value"].copy()

In [None]:
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
final_predictions = final_model.predict(X_test_prepared)

In [None]:
final_predictions

In [None]:
final_mse = mean_squared_error(y_test,final_predictions)