In [1]:
import os

import numpy as np
import pandas as pd


In [2]:
# Folder that holds the fitted search objects persisted by this notebook.
MODELS_DIR = os.path.join(*("models", "02"))
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(name=MODELS_DIR, exist_ok=True)


In [2]:
# Load the raw housing table and attach `income_cat`, a five-level category
# obtained by binning `median_income` at 1.5, 3, 4.5 and 6.
housing = pd.read_csv(os.path.join("datasets", "housing", "housing.csv")).assign(
    income_cat=lambda frame: pd.cut(
        frame["median_income"],
        bins=[0, 1.5, 3, 4.5, 6, np.inf],
        labels=[1, 2, 3, 4, 5],
    )
)


In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified split (20% held out) keyed on the income category, with a
# fixed seed so the same rows land in each set on every run.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# `split()` yields positional indices, so rows are selected with `.iloc`;
# `.loc` only returns the same rows while the frame keeps its default
# RangeIndex. The helper column is removed with a non-mutating `drop`, which
# avoids modifying a selection of `housing` in place and keeps the cell
# safe to re-run.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")


In [4]:
# Split the training set into the target column and the remaining predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing_train = strat_train_set.drop(columns="median_house_value")


# 1

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

# Source columns that the combined-attribute transformer divides by one another.
col_names = ["total_rooms", "total_bedrooms", "population", "households"]
# Positions are looked up in `housing`; the transformer applies them to
# whatever array it is given, so that array must keep the same column order.
rooms_ix, bedrooms_ix, population_ix, households_ix = map(housing.columns.get_loc, col_names)


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D array.

    Always appends rooms-per-household and population-per-household; also
    appends bedrooms-per-room when ``add_bedrooms_per_room`` is true. Column
    positions come from the module-level ``rooms_ix``, ``bedrooms_ix``,
    ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Stored under the same name as the argument, with no extra logic.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        columns = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as the comma-separated form.
        return np.c_[tuple(columns)]


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: fill gaps with the column median, append the ratio
# features, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder()),
        ("std_scaler", StandardScaler()),
    ]
)

# Every column except the categorical one goes through the numeric branch.
cat_attribs = ["ocean_proximity"]
num_attribs = [name for name in housing_train.columns if name not in cat_attribs]

# Numeric output comes first in the result, followed by the one-hot columns.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

housing_prepared = full_pipeline.fit_transform(housing_train)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Two sub-grids: the linear kernel is searched over C only, the RBF kernel
# over every (C, gamma) pair.
param_grid = [
    dict(kernel=["linear"], C=[1, 10, 100, 1000]),
    dict(kernel=["rbf"], C=[1, 10, 100, 1000], gamma=[1, 10, 100, 1000]),
]

# 5-fold cross-validation, scored by negated mean squared error.
grid_search = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)


In [23]:
import joblib

# joblib.dump(grid_search, os.path.join(MODELS_DIR, "svr_grid_search.pkl"))
# grid_search = joblib.load(os.path.join(MODELS_DIR, "svr_grid_search.pkl"))


['models/svr.pkl']

In [58]:
# The search score is a negated MSE, so flip the sign before taking the root.
best_neg_mse = grid_search.best_score_
print("Best estimator:", grid_search.best_estimator_)
print("Best score:", best_neg_mse)
print("Best RMSE:", np.sqrt(-best_neg_mse))


Best estimator: SVR(C=1000, kernel='linear')
Best score: -4955666869.470196
Best RMSE: 70396.49756536326


# 2

In [50]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# Kernel is drawn from the two listed names; C and gamma are each drawn as
# integers from randint(1, 10000).
param_distribs = dict(
    kernel=["linear", "rbf"],
    C=randint(1, 10000),
    gamma=randint(1, 10000),
)

# 25 sampled candidates, each evaluated with 5-fold cross-validation; the
# fixed seed keeps the sampled candidates the same between runs.
rand_search = RandomizedSearchCV(
    estimator=SVR(),
    param_distributions=param_distribs,
    n_iter=25,
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=42,
    return_train_score=True,
)
rand_search.fit(housing_prepared, housing_labels)


In [5]:
import joblib

# Reuse the cached search when it exists; otherwise persist the search that
# was just fitted so later runs can load it. The previous unconditional load
# failed on a fresh checkout where the file had never been written.
# NOTE: joblib files are pickles - only load files you created yourself.
rand_search_path = os.path.join(MODELS_DIR, "svr_rand_search.pkl")
if os.path.exists(rand_search_path):
    rand_search = joblib.load(rand_search_path)
else:
    joblib.dump(rand_search, rand_search_path)


In [53]:
# Report the winning candidate; the score is a negated MSE, hence the sign
# flip before the square root.
for label, value in (
    ("Best estimator:", rand_search.best_estimator_),
    ("Best score:", rand_search.best_score_),
    ("Best RMSE:", np.sqrt(-rand_search.best_score_)),
):
    print(label, value)


Best estimator: SVR(C=8434, gamma=7514, kernel='linear')
Best score: -4941686797.977544
Best RMSE: 70297.13221730701


# 3

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class TopAttributesSelector(BaseEstimator, TransformerMixin):
    """Keep only the columns with the largest importance scores.

    Parameters
    ----------
    attribute_importances : indexable sequence of numbers
        One score per input column; the score at position ``i`` belongs to
        column ``i``.
    max_attributes : int
        Upper bound on how many columns are kept.
    """

    def __init__(self, attribute_importances, max_attributes):
        self.attribute_importances = attribute_importances
        self.max_attributes = max_attributes

    def fit(self, X, y=None):
        """Choose the column positions to keep; ``X`` and ``y`` are not read."""
        scores = self.attribute_importances
        # Positions ordered from highest to lowest score.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        keep = min(len(scores), self.max_attributes)
        # Stored in ascending order so the kept columns retain their original order.
        self.top_attributes = sorted(ranked[:keep])
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``."""
        return X[:, self.top_attributes]


In [None]:
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Search over the number of trees (1-100) and the number of features tried at
# each split (1 up to the width of the prepared matrix).
forest_param_distribs = {
    "n_estimators": randint(1, 101),
    "max_features": randint(1, housing_prepared.shape[1] + 1),
}

forest_rand_search = RandomizedSearchCV(
    # The forest gets its own seed: the search's `random_state` fixes which
    # candidates are drawn, but an unseeded forest still builds different
    # trees on every run, so the feature importances and the tuned
    # `max_features` read from it later would not be reproducible.
    RandomForestRegressor(random_state=42),
    forest_param_distribs,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=10,
    random_state=42,
    return_train_score=True,
)
forest_rand_search.fit(housing_prepared, housing_labels)


In [9]:
import joblib

# Reuse the cached search when it exists; otherwise persist the search that
# was just fitted so later runs can load it. The previous unconditional load
# failed on a fresh checkout where the file had never been written.
# NOTE: joblib files are pickles - only load files you created yourself.
forest_rand_search_path = os.path.join(MODELS_DIR, "forest_rand_search.pkl")
if os.path.exists(forest_rand_search_path):
    forest_rand_search = joblib.load(forest_rand_search_path)
else:
    joblib.dump(forest_rand_search, forest_rand_search_path, compress=3)


In [12]:
from sklearn.pipeline import Pipeline

best_forest = forest_rand_search.best_estimator_
feature_importances = best_forest.feature_importances_
# The forest's tuned `max_features` value is reused below as the number of
# columns to keep.
max_features = best_forest.max_features
best_svr = rand_search.best_estimator_

# Full cleaning pipeline followed by selection of the highest-importance columns.
prep_pipeline = Pipeline(
    steps=[
        ("clean_data", full_pipeline),
        (
            "top_attribs",
            TopAttributesSelector(
                attribute_importances=feature_importances,
                max_attributes=max_features,
            ),
        ),
    ]
)

housing_top_attribs = prep_pipeline.fit_transform(housing_train)


In [21]:
# Importance scores listed from largest to smallest.
list(reversed(sorted(feature_importances)))


[0.33026749660870547,
 0.15501932747139774,
 0.1061239468630947,
 0.07630982326818835,
 0.0744682940690267,
 0.06846010128950838,
 0.06206363002055509,
 0.042986582069573204,
 0.017557775651997683,
 0.01734489413797329,
 0.017019590453738884,
 0.01628086409887824,
 0.00920500483735856,
 0.004264938466165726,
 0.002546950790624872,
 8.077990321320064e-05]

In [13]:
# Display the training matrix produced by `prep_pipeline` (cleaned data
# restricted to the selected columns).
housing_top_attribs


array([[-0.94135046,  1.34743822, -0.8936472 ,  0.00622264, -0.12112176,
         1.        ],
       [ 1.17178212, -1.19243966,  1.292168  , -0.04081077, -0.81086696,
         0.        ],
       [ 0.26758118, -0.1259716 , -0.52543365, -0.07537122, -0.33827252,
         1.        ],
       ...,
       [-1.5707942 ,  1.31001828, -0.36547546, -0.03743619,  0.32286937,
         0.        ],
       [-1.56080303,  1.2492109 ,  0.16826095, -0.05915604, -0.45702273,
         0.        ],
       [-1.28105026,  2.02567448, -0.390569  ,  0.00657083, -0.12169672,
         1.        ]])

# 4

In [24]:
from sklearn.pipeline import Pipeline

from sklearn.base import clone

best_svr = rand_search.best_estimator_

# Put an unfitted copy with the same hyperparameters into the pipeline.
# Passing `best_svr` itself would make `Pipeline.fit` refit
# `rand_search.best_estimator_` in place on the reduced feature matrix,
# silently changing the object that the search cell produced.
prep_plus_pred = Pipeline(
    [("preparation", prep_pipeline), ("prediction", clone(best_svr))]
)
prep_plus_pred.fit(housing_train, housing_labels)


In [25]:
from sklearn.metrics import mean_squared_error

# Hold-out evaluation: split the test set into predictors and target, then
# score the fitted pipeline on it.
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

predictions = prep_plus_pred.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order avoids surprises if the
# metric is ever swapped for an asymmetric one.
predictions_mse = mean_squared_error(y_test, predictions)
predictions_rmse = np.sqrt(predictions_mse)


In [26]:
# Print the test-set error in both squared and root form.
for label, value in (
    ("Predictions MSE:", predictions_mse),
    ("Predictions RMSE:", predictions_rmse),
):
    print(label, value)


Predictions MSE: 5180161891.106912
Predictions RMSE: 71973.34153078424


# 5

In [27]:
from sklearn.model_selection import GridSearchCV

# Preparation options to search: the imputation strategy and how many top
# attributes to keep.
#
# `add_bedrooms_per_room` is deliberately NOT searched. `feature_importances`
# was computed on the prepared matrix that includes the bedrooms-per-room
# column, and the top-attribute selector picks columns by position. Turning
# that column off shifts every column after it, so the importances
# would silently point at the wrong features.
prep_params = {
    "preparation__clean_data__num__imputer__strategy": ["mean", "median", "most_frequent"],
    "preparation__top_attribs__max_attributes": list(range(1, 9)),
}

prep_grid_search = GridSearchCV(
    prep_plus_pred, prep_params, scoring="neg_mean_squared_error", cv=10, return_train_score=True
)
prep_grid_search.fit(housing_train, housing_labels)


In [28]:
import joblib

# joblib.dump(prep_grid_search, os.path.join(MODELS_DIR, "prep_grid_search.pkl"))
# prep_grid_search = joblib.load(os.path.join(MODELS_DIR, "prep_grid_search.pkl"))


['models/prep_grid_search.pkl']

In [29]:
# Display the pipeline configured with the best parameter combination found
# by the preparation grid search.
prep_grid_search.best_estimator_


In [31]:
# The search score is a negated MSE, so flip the sign before taking the root.
best_prep_score = prep_grid_search.best_score_
print("Best score:", best_prep_score)
print("Best RMSE:", np.sqrt(-best_prep_score))


Best score: -5871609613.887552
Best RMSE: 76626.4289516845
