## setup

In [63]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

## get the data

In [64]:
import pandas as pd

HOUSING_PATH = os.path.join("datasets", "housing")
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
housing = load_housing_data()
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print("train_size: ", train_set.shape)

train_size:  (16512, 10)


## prepare the data

In [65]:
housing = train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = train_set["median_house_value"].copy()

try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Remove the text attribute because median can only be calculated on numerical attributes
housing_num = housing.drop('ocean_proximity', axis=1)

############# customize: add attributes #############
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

# get the right column indices: safer than hard-coding indices 3, 4, 5, 6
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(housing.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

def add_extra_features(X, add_bedrooms_per_room=True):
    rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
    population_per_household = X[:, population_ix] / X[:, household_ix]
    if add_bedrooms_per_room:
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]
    else:
        return np.c_[X, rooms_per_household, population_per_household]
    
def select_features(X, columns):
    A = np.array(['rooms_per_household', 'population_per_household']).transpose()
    new_c = np.r_[columns, A]
    X_tr = pd.DataFrame(X, columns=new_c)
    X_tr["median_house_value"] = housing_labels.values
    array = pd.Series(list(range(0, 16512)))
    X_tr.to_csv(os.path.join(HOUSING_PATH, "result_lyz.csv"))
    corr_matrix = X_tr.corr()
    print(corr_matrix["median_house_value"].sort_values(ascending=False))
    corr_matrix = np.abs(corr_matrix)
    corr_matrix = corr_matrix["median_house_value"].sort_values(ascending=False)
    corr_matrix = corr_matrix[0:5]
    print(corr_matrix.index)
    return X_tr[np.array(corr_matrix.index)].values
        
############## pipeline #############
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),  # imcompletion
        ('attribs_adder', FunctionTransformer(add_extra_features, validate=False, 
                                             kw_args={"add_bedrooms_per_room": False})), # kw_args
        ('attribs_selector', FunctionTransformer(select_features,
                                                kw_args={"columns": housing_num.columns})),
        ('std_scaler', StandardScaler()),
    ])

try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.2

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs), # text2numeral
    ])

housing_prepared = full_pipeline.fit_transform(housing)




now:::::::::::::::::::::
median_house_value          1.000000
median_income               0.690647
rooms_per_household         0.158485
total_rooms                 0.133989
housing_median_age          0.103706
households                  0.063714
total_bedrooms              0.047980
population_per_household   -0.022030
population                 -0.026032
longitude                  -0.046349
latitude                   -0.142983
Name: median_house_value, dtype: float64
                          longitude  latitude  housing_median_age  \
longitude                  1.000000  0.924485            0.101818   
latitude                   0.924485  1.000000            0.005296   
housing_median_age         0.101818  0.005296            1.000000   
total_rooms                0.038676  0.029224            0.360922   
total_bedrooms             0.063064  0.059998            0.320624   
population                 0.094276  0.102499            0.292283   
households                 0.049306  0.06406

In [68]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import sklearn.linear_model
import time
from sklearn.model_selection import GridSearchCV

def cal_time():
    start = time.time()
    linear_reg = sklearn.linear_model.LinearRegression()
    linear_reg.fit(housing_prepared, housing_labels)
    housing_predictions = linear_reg.predict(housing_prepared)
    mse_linear = mean_squared_error(housing_labels, housing_predictions)
    rmse_linear = np.sqrt(mse_linear)
    end = time.time()
    
    print("time:", end-start)
    print("rmse_linear", rmse_linear)
    start = time.time()
    svm_reg_linear = SVR(kernel="linear")
    svm_reg_linear.fit(housing_prepared, housing_labels)
    housing_predictions = svm_reg_linear.predict(housing_prepared)
    svm_mse_linear = mean_squared_error(housing_labels, housing_predictions)
    svm_rmse_linear = np.sqrt(svm_mse_linear)
    end = time.time()
    print("time:", end-start)
    print("svm_rmse_linear", svm_rmse_linear)

    start = time.time()
    svm_reg_rbf = SVR(kernel="rbf")
    svm_reg_rbf.fit(housing_prepared, housing_labels)
    housing_predictions = svm_reg_rbf.predict(housing_prepared)
    svm_mse_rbf = mean_squared_error(housing_labels, housing_predictions)
    svm_rmse_rbf = np.sqrt(svm_mse_rbf)
    end = time.time()
    print("time:", end-start)
    print("svm_rmse_rbf", svm_rmse_rbf)

???
beginaaa




117832.70735021617 {'C': 1, 'kernel': 'rbf'}
109524.24661586537 {'C': 10, 'kernel': 'rbf'}
102670.28770169668 {'C': 1, 'kernel': 'linear'}
25795.891158353093 {'C': 10, 'kernel': 'linear'}
time: 363.6384525299072


In [84]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = [
    {'kernel': ['rbf', 'linear'], 'C': [1, 10]}
  ]

start = time.time()
grid_search = GridSearchCV(SVR(), param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
end = time.time()
print("time:", end - start)


param_rand = {'kernel':['rbf', 'linear'], 'C': [10, 100]}


start = time.time()
rand_search = RandomizedSearchCV(SVR(), param_rand, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
rand_search.fit(housing_prepared, housing_labels)
cvres = rand_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
end = time.time()
print("time:", end - start)



117832.70735021617 {'C': 1, 'kernel': 'rbf'}
102670.28770169668 {'C': 1, 'kernel': 'linear'}
109524.24661586537 {'C': 10, 'kernel': 'rbf'}
25795.891158353093 {'C': 10, 'kernel': 'linear'}
time: 338.01396918296814




109524.24661586537 {'kernel': 'rbf', 'C': 10}
25795.891158353093 {'kernel': 'linear', 'C': 10}
55919.136582840256 {'kernel': 'rbf', 'C': 100}
0.061299590669716346 {'kernel': 'linear', 'C': 100}
time: 1853.5447618961334


In [87]:
grid_search.refit_time_
rand_search.refit_time_

243.37605476379395

## fine-tune