In [36]:
# Get the Data

### Download the Data

import os
import tarfile
import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

In [37]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [38]:
### Take a Quick Look at the Data Structure

In [39]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [40]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [41]:
housing["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [42]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [43]:
### Create a Test Set

In [44]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [45]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   longitude           20640 non-null  float64 
 1   latitude            20640 non-null  float64 
 2   housing_median_age  20640 non-null  float64 
 3   total_rooms         20640 non-null  float64 
 4   total_bedrooms      20433 non-null  float64 
 5   population          20640 non-null  float64 
 6   households          20640 non-null  float64 
 7   median_income       20640 non-null  float64 
 8   median_house_value  20640 non-null  float64 
 9   ocean_proximity     20640 non-null  object  
 10  income_cat          20640 non-null  category
dtypes: category(1), float64(9), object(1)
memory usage: 1.6+ MB


In [46]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [47]:
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: income_cat, dtype: float64

In [None]:
strat_train_set["rooms_per_household"] = strat_train_set["total_rooms"]/strat_train_set["households"]
strat_train_set["bedrooms_per_room"] = strat_train_set["total_bedrooms"]/strat_train_set["total_rooms"]
strat_train_set["population_per_household"]=strat_train_set["population"]/strat_train_set["households"]

In [None]:
strat_test_set["rooms_per_household"] = strat_test_set["total_rooms"]/strat_test_set["households"]
strat_test_set["bedrooms_per_room"] = housing["total_bedrooms"]/strat_test_set["total_rooms"]
strat_test_set["population_per_household"]=strat_test_set["population"]/strat_test_set["households"]

In [48]:
strat_train_set 

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND,2
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN,5
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,82700.0,INLAND,2
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN,2
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN,3
...,...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,268500.0,<1H OCEAN,4
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,90400.0,INLAND,2
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,140400.0,<1H OCEAN,3
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,258100.0,<1H OCEAN,3


In [55]:
X_train = strat_train_set.drop('median_house_value', axis =1)
Y_train = strat_train_set["median_house_value"].copy()
X_test = strat_test_set.drop('median_house_value', axis =1)
Y_test = strat_test_set["median_house_value"].copy()

In [53]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]

        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=True)
housing_extra_attribs = attr_adder.transform(housing.values)

In [59]:
cat_cols = ['ocean_proximity', 'income_cat']
num_cols = list(set(X_train.columns) - set(cat_cols))

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(X_train[num_cols])
housing_num_test = num_pipeline.transform(X_test[num_cols])

In [63]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import OneHotEncoder
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", OneHotEncoder(), cat_cols),
    ])

housing_prepared_train = full_pipeline.fit_transform(X_train)
housing_prepared_test = full_pipeline.transform(X_test)

In [66]:

from sklearn.svm import SVR

In [70]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared_train, Y_train)
housing_predictions = svm_reg.predict(housing_prepared_test)
svm_mse = mean_squared_error(Y_test, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

102392.83305106734

In [72]:
from sklearn.model_selection import GridSearchCV


param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared_train, Y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=  27.0s
[CV] END ..............................C=10.0, kernel=linear; total time=  26.2s
[CV] END ..............................C=10.0, kernel=linear; total time=  26.1s
[CV] END ..............................C=10.0, kernel=linear; total time=  25.7s
[CV] END ..............................C=10.0, kernel=linear; total time=  25.3s
[CV] END ..............................C=30.0, kernel=linear; total time=  25.3s
[CV] END ..............................C=30.0, kernel=linear; total time=  24.9s
[CV] END ..............................C=30.0, kernel=linear; total time=  25.7s
[CV] END ..............................C=30.0, kernel=linear; total time=  26.0s
[CV] END ..............................C=30.0, kernel=linear; total time=  25.6s
[CV] END .............................C=100.0, kernel=linear; total time=  24.8s
[CV] END .............................C=100.0, 

[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  46.0s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  46.6s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  46.4s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  46.1s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  46.3s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  45.5s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  46.2s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  45.2s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  45.7s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  44.7s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  45.8s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  45.1s
[CV] END ...................

[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  23.3s
[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  24.2s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  20.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  21.6s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  22.6s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  20.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  20.3s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  21.1s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  21.4s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  21.6s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  21.9s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=  20.3s
[CV] END ...................

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,
                                10000.0, 30000.0],
                          'kernel': ['linear']},
                         {'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
                          'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=2)

In [73]:
svm_mse = grid_search.best_score_
svm_rmse = np.sqrt(-svm_mse)
svm_rmse

70113.443785535

In [74]:
# Best Parameters for SVM Regressor
grid_search.best_params_

{'C': 3000.0, 'kernel': 'linear'}