<a href="https://colab.research.google.com/github/jackiekuen2/notes-handson-ml-tf/blob/master/ch5_ExerciseQ10_SVMCaliforniaHouse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the California housing dataset. On first use scikit-learn downloads the
# data and stores it locally (see the download message in the cell output).
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

Downloading Cal. housing from https://ndownloader.figshare.com/files/5976036 to /root/scikit_learn_data


In [0]:
# Pull the feature matrix (X) and the regression target (y) out of the dataset object.
X, y = housing['data'], housing['target']

In [7]:
# Inspect which fields the loaded dataset object exposes.
housing.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [9]:
# Names of the input features shipped with the dataset.
housing['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [0]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Standardise the features. The scaler's statistics are learned from the training
# split only and then reused for the test split, so no test information leaks in.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 1. Linear SVR (default hyperparameters, epsilon = 0)

In [16]:
from sklearn.svm import LinearSVR

# Linear baseline. Only the seed is set; every other hyperparameter keeps its
# library default (the recorded repr shows epsilon=0.0 and C=1.0).
linsvm_reg = LinearSVR(random_state=42).fit(X_train_scaled, y_train)
linsvm_reg



LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=42, tol=0.0001, verbose=0)

In [17]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the data it was trained on.
y_pred = linsvm_reg.predict(X_train_scaled)
mean_squared_error(y_true=y_train, y_pred=y_pred)

0.949968822217229

In [18]:
import numpy as np
# RMSE: the square root of the training MSE, which expresses the error in the
# same units as the target.
np.sqrt(mean_squared_error(y_train, y_pred))

0.9746634404845752

## 2. Non-Linear RBF SVR

In [20]:
from sklearn.svm import SVR

# Kernelised model: RBF kernel, with all remaining hyperparameters left at their defaults.
rbfsvm_reg = SVR(kernel='rbf').fit(X_train_scaled, y_train)
rbfsvm_reg

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [21]:
# Training-set MSE of the RBF model (y_pred is overwritten with the new predictions).
y_pred = rbfsvm_reg.predict(X_train_scaled)
mean_squared_error(y_true=y_train, y_pred=y_pred)

0.3361301529185807

In [22]:
# RMSE of the RBF model on the training set (square root of the MSE computed above).
np.sqrt(mean_squared_error(y_train, y_pred))

0.5797673265358964

## 2B. Randomized Search CV for RBF SVR

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# Fresh, unfitted estimator for the search. NOTE: this rebinds `rbfsvm_reg`, so the
# model fitted in section 2 is no longer reachable under that name.
rbfsvm_reg = SVR(kernel='rbf')

# Sampling distributions for the two tuned hyperparameters.
param_distributions = {
    # Log-uniform between 0.001 and 1, so each order of magnitude is equally likely.
    'gamma': reciprocal(0.001, 1),
    # scipy's uniform(loc, scale) covers [loc, loc + scale]: C is drawn from [1, 11],
    # not [1, 10].
    'C': uniform(1, 10)
}

# 10 sampled candidates x 3 folds = 30 fits. No `scoring` is passed, so candidates are
# ranked by the estimator's own score method (R^2 for SVR), not by MSE.
# verbose=2 prints progress for every individual fit.
random_search_cv = RandomizedSearchCV(rbfsvm_reg, param_distributions=param_distributions,
                                      n_iter=10, verbose=2, cv=3, random_state=42)
random_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=4.745401188473625, gamma=0.7114476009343418 ...................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... C=4.745401188473625, gamma=0.7114476009343418, total=  19.7s
[CV] C=4.745401188473625, gamma=0.7114476009343418 ...................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.7s remaining:    0.0s


[CV] .... C=4.745401188473625, gamma=0.7114476009343418, total=  19.6s
[CV] C=4.745401188473625, gamma=0.7114476009343418 ...................
[CV] .... C=4.745401188473625, gamma=0.7114476009343418, total=  18.7s
[CV] C=8.31993941811405, gamma=0.06251373574521747 ...................
[CV] .... C=8.31993941811405, gamma=0.06251373574521747, total=  11.0s
[CV] C=8.31993941811405, gamma=0.06251373574521747 ...................
[CV] .... C=8.31993941811405, gamma=0.06251373574521747, total=  11.4s
[CV] C=8.31993941811405, gamma=0.06251373574521747 ...................
[CV] .... C=8.31993941811405, gamma=0.06251373574521747, total=  11.5s
[CV] C=2.560186404424365, gamma=0.0029375384576328287 ................
[CV] . C=2.560186404424365, gamma=0.0029375384576328287, total=   8.8s
[CV] C=2.560186404424365, gamma=0.0029375384576328287 ................
[CV] . C=2.560186404424365, gamma=0.0029375384576328287, total=   8.8s
[CV] C=2.560186404424365, gamma=0.0029375384576328287 ................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  5.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                 epsilon=0.1, gamma='scale', kernel='rbf',
                                 max_iter=-1, shrinking=True, tol=0.001,
                                 verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff7a1d4eac8>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff7a1d4e898>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)

In [24]:
# Winning model. Because the search ran with refit=True (see its repr above), this
# estimator has already been retrained on the full training set with the best parameters.
random_search_cv.best_estimator_

SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma=0.7114476009343418, kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
# Hyperparameter values of the best-scoring candidate.
random_search_cv.best_params_

{'C': 4.745401188473625, 'gamma': 0.7114476009343418}

In [26]:
# Mean cross-validated score of the best candidate. With scoring=None this is the
# estimator's default score (R^2 for SVR), so it is not comparable to the MSE values
# reported elsewhere in this notebook.
random_search_cv.best_score_

0.7601605215852856

In [27]:
# best_estimator_ was already refit on the whole training set by the search
# (refit=True), so no extra fit happens here: just predict and score on the training set.
final_reg = random_search_cv.best_estimator_
y_pred = final_reg.predict(X_train_scaled)
mean_squared_error(y_train, y_pred)

0.2168284470916062

Final training set MSE: 0.2168

In [29]:
# Generalisation check: MSE on the held-out test split, which was not used to fit
# the scaler or to run the hyperparameter search.
y_pred = final_reg.predict(X_test_scaled)
mean_squared_error(y_test, y_pred)

0.30714383355045694

Final test set MSE: 0.3071