In [None]:
#몇 가지의 측정 데이터가 주어졌을 때, 주택 가격에 대한 예측
#housing에서 데이터 탐색과 시각화를 했고, 여기서는 그 후로 예측하는것.
#다변량 회귀문제, 데이터의 연속적인 흐름이 없고, 데이터가 메모리에 들어갈 만큼 충분히 작음 => 미니배치 사용X
#성능 측정 지표는 RMSE(평균제곱근 오차) 사용

In [1]:
#데이터 추출. 그냥 다운받는것보다 코드로 하면 자동화되서 편하다고 함.
#디렉터리에 압축을 풀어서 housing.csv파일 생성

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing/tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH) :
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    

In [2]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()

In [3]:
#테스트 세트 만들기_1

import numpy as np

def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2) 

#20퍼센트의 테스트 세트 생성
# print(len(train_set), "train +", len(test_set), "test")

In [28]:
#테스트 세트 만들기_2
#위와 같이 테스트 세트를 만들어서 반복하면, 테스트 세트의 의미가 없어짐(전체 데이터 셋을 보는거랑 마찬가지이므로)

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
    
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
    
housing = strat_train_set.copy()

In [32]:
#예측변수와 레이블 분리

housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [33]:
#total_bedrooms 특성에는 값이 조금 누락되어있음. 이것을 전처리해줘야함.
#사이킷런의 imputer는 누락된 값을 특성의 중간값으로 대체함

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

#"ocean_proximity는 텍스트 특성이기 떄문에 잠시 빼둠"
housing_num = housing.drop("ocean_proximity", axis=1)
imputer.fit(housing_num)

X = imputer.transform(housing_num)

#위의 결과값은 넘파이 배열인데, 이것을 다시 판다스 프레임으로 되돌림.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index = list(housing.index.values))



In [34]:
#위에서 텍스트 특성이기 때문에 잠시 빼둔 ocean_proximity를 처리해주어야 함.
#ocean_proximity는 텍스트, 범주형 특성인데 이를 one_hot encoding을 통해 숫자로 바꿈

housing_cat = housing["ocean_proximity"]

housing_cat_encoded, housing_categories = housing_cat.factorize() #factorize() : 각 카테고리를 다른 정수값으로 매핑해줌
#이 코드의 문제는, 값이 1과 2면 컴퓨터가 연관이 있다고 생각하는데, 사실 1과 4가 더 연관있고...
#숫자의 의미를 없애기 위해(??) one_hot_encoding사용

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot = housing_cat_1hot.toarray()


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [35]:
# [PR #9151](https://github.com/scikit-learn/scikit-learn/pull/9151)에서 가져온 CategoricalEncoder 클래스의 정의.
# 이 클래스는 사이킷런 0.20에 포함될 예정입니다. <<0.21로 업그레이드햇는데 없어서.. 그냥 정의를 가져옴...>

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [36]:
#변환기
#잘 모르겠지만 아마 앞에서 데이터 조합을 보았을 때,
#한 가정당 방의 갯수, 가정당 인구, 방당 침대의 갯수가 가치 있는 조합특성이라고 생각해서 그것을 만들어주는 변환기

from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [37]:
#숫자 특성을 처리하는 파이프라인

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")), #누락값을 중간값으로 채워주는 imputer
    ('attribs_addr', CombinedAttributesAdder()), #위에서 정의한 클래스
    ('std_scaler', StandardScaler()), #사이킷런에서 제공하는 표준화 변환기
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

#변환기에서 스케일링은, 훈련 데이터에서만 fit(), 테스트 세트는 transform()



In [38]:
#데이터프레임을 넘파이 배열로 바꿈

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [39]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")), #누락값을 중간값으로 채워주는 imputer
    ('attribs_addr', CombinedAttributesAdder()), #위에서 정의한 클래스
    ('std_scaler', StandardScaler()), #사이킷런에서 제공하는 표준화 변환기
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', CategoricalEncoder()), #책에서는 CategoricalEncoder 쓰라되어있는데... 그거 버전이 안맞아서 없는듯.
])



In [40]:
#위의 두개 파이프라인을 합침

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [41]:
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
housing_prepared.shape

(16512, 16)

# 훈련 세트에서 훈련하고 평가하기!!

In [43]:
#선형 회귀모델 훈련

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [44]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("예측 : ", lin_reg.predict(some_data_prepared))
print("레이블: ", list(some_labels))

예측 :  [210644.60483551 317768.80716735 210956.43330317  59218.98834873
 189747.55852797]
레이블:  [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [48]:
#이 모델의 RMSE측정 (평균제곱근 오차)

from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68628.19819848923

In [49]:
#결과 오차가 너무 큼... 언더핏 -> 더 복잡한 모델 시도

from sklearn.tree import DecisionTreeRegressor

#모델 훈련
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

#평가
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [50]:
#교차 검증을 사용하여 평가

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

#cv는 아마 서브셋 개수인듯?


In [51]:
#위의 평가 결과

def display_scores(scores):
    print("Scores : ", scores)
    print("Mean : ", scores.mean())
    print("Standard deviation : ", scores.std())
    
display_scores(tree_rmse_scores)

Scores :  [68663.61403369 66900.33560394 70519.40767728 68362.97463413
 72624.23143195 74800.13695652 69590.98502338 71455.12465127
 76015.53586255 70363.72691564]
Mean :  70929.60727903388
Standard deviation :  2724.5796311519375


In [52]:
#교차 검증을 사용하여 선형 회귀 모델 평가

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores :  [66782.73844538 66960.11809747 70347.95244045 74739.57053437
 68031.13391318 71193.84180187 64969.63055372 68281.61137951
 71552.91572318 67665.10084824]
Mean :  69052.46137373625
Standard deviation :  2731.6740029821394


In [53]:
#랜덤포레스트 방법으로 모델링 // 위의 두개보다는 나음

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)



Scores :  [51620.43791176 49717.50045868 52425.03896761 55329.91069553
 52930.57005434 55969.29932975 52187.39692101 50821.75226105
 55499.18029685 52744.26287854]
Mean :  52924.534977514086
Standard deviation :  1972.7660472154664


In [64]:
forest_reg.fit(housing_prepared, housing_labels)




22253.212668735003

In [66]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

#이 결과는 상당히 과적합되어있음. 위의 검증 세트에 대한 점수보다 훨씬 낮으므로

22253.212668735003

In [55]:
#하이퍼파라미터 탐색 - 그리드 방식 // 코세라에서 봤다시피, 랜덤탐색이 가장 좋음

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap':[False], 'n_estimators':[3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [56]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [57]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)

In [59]:
#테스트 세트로 시스템 평가하기

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [60]:
final_rmse #위의 하이퍼파라미터 추정치로 rmse를 계산하면 49,000정도가 나옴. 잘 개선된 것을 볼 수 있음

47507.89995547604