# Download resources

In [None]:
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

In [None]:
import os
import tarfile
import urllib.request
import pandas as pd

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
  os.makedirs(housing_path, exist_ok=True)
  tgz_path = os.path.join(housing_path, "housing.tgz")
  urllib.request.urlretrieve(housing_url, tgz_path)
  housing_tgz = tarfile.open(tgz_path)
  housing_tgz.extractall(path=housing_path)
  housing_tgz.close()


def load_housing_data(housing_path=HOUSING_PATH):
  csv_path = os.path.join(housing_path, "housing.csv")
  return pd.read_csv(csv_path)

fetch_housing()
housing_df = load_housing_data()

In [None]:
display(df.head(3))

check missing value.

In [None]:
housing_df.isna().sum()

drop missing value row

In [None]:
df = housing_df.copy()
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [None]:
display(df.head(5))

In [24]:
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

y = df['median_house_value']
X = df.drop(columns=['median_house_value'])
print(y.shape)
print(X.shape)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


NameError: ignored

In [None]:
def get_eval(y_true, y_pred):
  rmse = np.sqrt(mean_squared_error(y_true, y_pred))
  print('RMSE: {:.5f}'.format(rmse))  


In [None]:
# kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’
hyperparameter = {'kernel': 'linear', 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0]}

for c in hyperparameter['C']:
  print('#####################################################')
  print("kernel: {}, C: {}".format(hyperparameter['kernel'], c))
  svr_kernel_model = SVR(kernel=hyperparameter['kernel'], C=c).fit(x_train, y_train)
  y_train_pred = svr_kernel_model.predict(x_train)
  y_test_pred = svr_kernel_model.predict(x_test)
  rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
  print(f"Evaluate SVR model(kernel={hyperparameter['kernel']}) with Training Datasets")
  get_eval(y_train, y_train_pred)
  print(f"Evaluate SVR model(kernel={hyperparameter['kernel']}) with Validation Datasets")
  get_eval(y_test, y_test_pred)

In [None]:

hyperparameter = {'kernel': 'rbf', 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0], 'gamma': ['scale', 'auto']}

for c in hyperparameter['C']:
  for gamma in hyperparameter['gamma']:
    print('#####################################################')
    print("kernel: {}, C: {}, gamma: {}".format(hyperparameter['kernel'], c, gamma))
    svr_kernel_model = SVR(kernel=hyperparameter['kernel'], C=c, gamma=gamma).fit(x_train, y_train)
    y_train_pred = svr_kernel_model.predict(x_train)
    y_test_pred = svr_kernel_model.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    print(f"Evaluate SVR model(kernel={hyperparameter['kernel']}, C={c}, gamma={gamma}) with Training Datasets")
    get_eval(y_train, y_train_pred)
    print(f"Evaluate SVR model(kernel={hyperparameter['kernel']}, C={c}, gamma={gamma}) with Validation Datasets")
    get_eval(y_test, y_test_pred)


# Preprocessing for features
- Create imputer
  - `sklearn.imputer`
- Create scaler 
  - `sklearn.scalers`
- Create encoder
  - `sklearn.imputer`


## Check feature values
- Check relationship with label data
- Check missing values
- Check skewed data

In [None]:
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Numeric Feature / Caterogorical Feature

In [None]:
numeric_features = [
  'longitude', 'latitude', 'median_income', 'total_rooms',
  'total_bedrooms', 'population', 'households', 'median_house_value'
]
categorical_features = ['ocean_proximity']
label = ['median_house_value']

housing_numeric = df[numeric_features]
housing_categorical = df[categorical_features]
label_df = df[label]
print(housing_numeric.shape)
print(housing_categorical.shape)
print(label_df.shape)

NameError: ignored

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

imputter = SimpleImputer(strategy="median")
numerical_pipeline = Pipeline([
  ('imputer', SimpleImputer(strategy="median")),
  ('scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
  ('numeric', numerical_pipeline, numeric_features),
  ('categorical', OrdinalEncoder(), categorical_features),
])

housing_prepared = full_pipeline.fit_transform(housing_df)
housing_label = housing_df['median_house_value']

In [None]:
display(housing_prepared[0])
display(housing_label)

array([-1.32783522,  1.05254828,  2.34476576, -0.8048191 , -0.97247648,
       -0.9744286 , -0.97703285,  2.12963148,  3.        ])

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

# Build SVR(Epsilon-Support Vector Regression) model with GridSearch and RandomizedSearch

- GridSearch
- RandomizedSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

x_train, x_test, y_train, y_test = train_test_split(housing_prepared, housing_label, test_size=0.3, random_state=0)
print(x_train.shape)
print(y_train.shape)
tuned_parameters = [{'kernel': ['linear'], 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0],},
           {'kernel': ['rbf'], 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0], 'gamma': ['scale', 'auto']},]
grid_search_cv = GridSearchCV(
  SVR(), 
  tuned_parameters, 
  scoring='neg_mean_squared_error',)
grid_search_cv.fit(housing_prepared, housing_label)

print(grid_search_cv.best_estimator_)
y_train_pred = grid_search_cv.predict(x_train)
y_test_pred = grid_search_cv.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Evaluate GridSearchCV with Training Datasets")
get_eval(y_train, y_train_pred)
print(f"Evaluate GridSearchCV model with Validation Datasets")
get_eval(y_test, y_test_pred)


(14448, 9)
(14448,)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

x_train, x_test, y_train, y_test = train_test_split(housing_prepared, housing_label, test_size=0.3, random_state=0)
print(x_train.shape)
print(y_train.shape)
tuned_parameters = [{'kernel': ['linear'], 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0],},
           {'kernel': ['rbf'], 'C': [1.0, 10.0, 100.0, 1000.0, 10000.0], 'gamma': ['scale', 'auto']},]
randomized_search_cv = RandomizedSearchCV(
  SVR(), 
  tuned_parameters, 
  scoring='neg_mean_squared_error',)
randomized_search_cv.fit(housing_prepared, housing_label)
print(randomized_search_cv.best_estimator_)
y_train_pred = randomized_search_cv.predict(x_train)
y_test_pred = randomized_search_cv.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Evaluate RandomizedSearchCV with Training Datasets")
get_eval(y_train, y_train_pred)
print(f"Evaluate RandomizedSearchCV model with Validation Datasets")
get_eval(y_test, y_test_pred)
