In [1]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset (downloaded and cached on first use).
housing = fetch_california_housing()

# Feature matrix and regression target, read from the returned Bunch.
X, y = housing.data, housing.target

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to C:\Users\daumsoft\scikit_learn_data


In [2]:
# Hold out 20% of the samples as a test set (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Standardize the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Train a LinearSVR model on the scaled training data
from sklearn.svm import LinearSVR

lin_svr = LinearSVR(
    random_state=42,
    max_iter=10000,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
lin_svr.fit(X_train_scaled, y_train)

LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=10000,
          random_state=42, tol=0.0001, verbose=0)

In [12]:
# Check the LinearSVR performance on the training set
from sklearn.metrics import mean_squared_error
import numpy as np

y_pred = lin_svr.predict(X_train_scaled)
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)

# Report the MSE and its square root (RMSE).
print(f'mse: {mse}')
print(f'rmse: {np.sqrt(mse)}')

mse: 0.9526279964217719
rmse: 0.976026637147661


In [13]:
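# The target (median house value) is expressed in units of $100,000, not $10,000.
# RMSE weights large errors more heavily; an RMSE of about 0.98 therefore means a
# typical error of roughly $98,000 -> not a good model.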

In [18]:
# Train an SVM regressor with the RBF kernel, tuning gamma and C by randomized search
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distributions = {
    # reciprocal(a, b): log-uniform over [0.001, 0.1]
    "gamma": reciprocal(0.001, 0.1),
    # uniform(loc, scale) spans [loc, loc + scale], so C is drawn from [1, 11]
    "C": uniform(1, 10),
}
rnd_search_cv = RandomizedSearchCV(
    SVR(),
    param_distributions,
    n_iter=10,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
# Kept as the cell's last expression so the fitted search object is displayed.
rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BDBE5CC288>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BDBDF58288>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [19]:
# Display the SVR that the search refit using the best hyperparameters it found
rnd_search_cv.best_estimator_

SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma=0.07969454818643928, kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
# Measure the RMSE of the tuned SVR on the training set.
# The search object was refit, so its predict() delegates to best_estimator_.
y_pred = rnd_search_cv.predict(X_train_scaled)
mse = mean_squared_error(y_true=y_train, y_pred=y_pred)
np.sqrt(mse)

0.572752477078536

In [21]:
# The error dropped considerably -> with the model selected, evaluate it on the test set.
# The search object was refit, so its predict() delegates to best_estimator_.
y_pred = rnd_search_cv.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
np.sqrt(mse)

0.5929168385528742