# getting data and preocessing

In [8]:
# Download the data:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True,exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url,tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path='datasets')
    return pd.read_csv(Path("datasets/housing/housing.csv"))


In [9]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
housing_labels = housing['median_house_value']
housing = housing.drop(columns='median_house_value')

In [11]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

num_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

# housing = preprocessing.fit_transform(housing)

1)  Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best SVR predictor perform?

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

param_grid = [
    {'svr__kernel':['linear'],
     'svr__C':[10., 30., 100., 300., 1000.,3000., 10000., 30000.0]
    },

    {
        'svr__kernel':['rbf'],
        'svr__C': [1.0, 3.0, 10., 30., 100., 300.,1000.0],
        'svr__gamma':[0.01, 0.03, 0.1, 0.3, 1.0, 3.0]
    },
]

from sklearn.pipeline import Pipeline

svr_pipeline = Pipeline([
    ("preprocessing",preprocessing),
    ('svr',SVR())
])

grid_search = GridSearchCV(svr_pipeline, param_grid,cv=3,scoring="neg_root_mean_squared_error")

grid_search.fit(housing.iloc[:5000],housing_labels[:5000])




In [13]:
#score of the best model
-grid_search.best_score_

"""That's much worse than the RandomForestRegressor (but to be fair, we trained the model on much less data."""

"That's much worse than the RandomForestRegressor (but to be fair, we trained the model on much less data."

In [14]:
#let's check best params
grid_search.best_params_
"""The linear kernel seems better than the RBF kernel. Notice that the value of C is the maximum tested value. When this happens you definitely want to launch the grid search again with higher values for C (removing the smallest values), because it is likely that higher values of C will be better."""

'The linear kernel seems better than the RBF kernel. Notice that the value of C is the maximum tested value. When this happens you definitely want to launch the grid search again with higher values for C (removing the smallest values), because it is likely that higher values of C will be better.'

2) Try replacing the GridSearchCV with a RandomizedSearchCV.

In [15]:
# warning : long runtime
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, loguniform

"""
 see https://docs.scipy.org/doc/scipy/reference/stats.html
 for `expon()` and `loguniform()` documentation and more probability distribution functions.
"""

# note: gamma is ignored when kernel is linear
param_distribs = {
    'svr__kernel':['linear','rbf'],
    'svr__C':loguniform(20,200_000),
    'svr__gamma':expon(scale=1.0),
}

rnd_search = RandomizedSearchCV(svr_pipeline,
                                param_distributions=param_distribs,
                                n_iter=10,
                                cv=3,
                                scoring="neg_root_mean_squared_error",
                                random_state=42)

rnd_search.fit(housing.iloc[:5000],housing_labels.iloc[:5000])

In [16]:
#sccore
-rnd_search.best_score_

59134.15315796338

In [17]:
# best params
rnd_search.best_params_

{'svr__C': 157055.10989448498,
 'svr__gamma': 0.26497040005002437,
 'svr__kernel': 'rbf'}

3) Try adding a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [18]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
selector_pipeline = Pipeline([
    ('preprocessing',preprocessing),
    ('selector',SelectFromModel(RandomForestRegressor(random_state=42), threshold=0.005)),
    ('svr',SVR(C=rnd_search.best_params_['svr__C'],
                gamma=rnd_search.best_params_['svr__gamma'],
                kernel=rnd_search.best_params_['svr__kernel'])),
])

In [20]:
from sklearn.model_selection import cross_val_score
selector_rmses = -cross_val_score(
    selector_pipeline,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3
)

pd.Series(selector_rmses).describe()

count        3.000000
mean     59104.545084
std       1710.465953
min      57671.450918
25%      58157.761856
50%      58644.072793
75%      59821.092168
max      60998.111542
dtype: float64

4) Try creating a custom transformer that trains a k-Nearest Neighbors regressor (sklearn.neighbors.KNeighborsRegressor) in its fit() method, and outputs the model's predictions in its transform() method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.

In [23]:
"""Rather than restrict ourselves to k-Nearest Neighbors regressors, let's create a transformer that accepts any regressor. For this, we can extend the MetaEstimatorMixin and 
have a required estimator argument in the constructor. The fit() method must work on a clone of this estimator, and it must also save feature_names_in_. The MetaEstimatorMixin 
will ensure that estimator is listed as a required parameters, and it will update get_params() and set_params() to make the estimator's hyperparameters available for tuning. 
Lastly, we create a get_feature_names_out() method: the output column name is the ..."""

from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import MetaEstimatorMixin, clone
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.base import BaseEstimator, TransformerMixin 

class FeatureFromRegressor(MetaEstimatorMixin, TransformerMixin, BaseEstimator):
    def __init__(self,estimator):
        self.estimator = estimator

    def fit(self,X,y=None):
        estimator_ = clone(self.estimator)
        estimator_.fit(X,y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        if hasattr(self.estimator,"feature_names_in_"):
            self.feature_names_in_ = self.estimator.feature_names_in_
        return self
    
    def transform(self,X):
        check_is_fitted(self)
        preds = self.estimator_.predict(X)
        if preds.ndim == 1:
            preds = preds.reshape(-1,1)
        return preds
    
    def get_feature_names_out(self,names=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, "n_outputs_",1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace("_","")
        return [f"{estimator_short_name}_prediction_{i}" for i in range(n_outputs)]
    

In [24]:
from sklearn.utils.estimator_checks import check_estimator

check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

[{'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_cloneable',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_estimator_tags_renamed',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 {'estimator': FeatureFromRegressor(estimator=KNeighborsRegressor()),
  'check_name': 'check_valid_tag_types',
  'exception': None,
  'status': 'passed',
  'expected_to_fail': False,
  'expected_to_fail_reason': 'Check is not expected to fail'},
 

In [25]:
knn_reg = KNeighborsRegressor(n_neighbors=3,weights='distance')
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = housing[["latitude",'longitude']]
knn_transformer.fit_transform(geo_features,housing_labels)

array([[452600.        ],
       [358500.        ],
       [395700.33333333],
       ...,
       [ 92300.        ],
       [ 84700.        ],
       [ 89400.        ]])

In [26]:
knn_transformer.get_feature_names_out()

['kneighborsregressor_prediction_0']

In [33]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer

transformers = [(name, clone(transformer), columns) for name, transformer, columns in preprocessing.transformers]
transformers.append(("geo", knn_transformer, ["latitude", "longitude"]))

new_geo_preprocessing = ColumnTransformer(transformers)

In [34]:
new_geo_pipeline = Pipeline([
    ("preprocessing",new_geo_preprocessing),
    ("svr",SVR(C=rnd_search.best_params_['svr__C'],
               gamma=rnd_search.best_params_['svr__gamma'],
               kernel=rnd_search.best_params_['svr__kernel'])),
])

: 

In [None]:
new_pipe_rmses = -cross_val_score(
    new_geo_pipeline,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring="neg_root_mean_squared_error",
    cv=3
)

pd.Series(new_pipe_rmses).describe()

5) Automatically explore some preparation options using RandomSearchCV.

In [None]:
param_distribs = {
    "preprocessing__geo__estimator__n_neighbors": range(1, 30),
    "preprocessing__geo__estimator__weights": ["distance", "uniform"],
    "svr__C": loguniform(20, 200_000),
    "svr__gamma": expon(scale=1.0),
}

new_geo_rnd_search = RandomizedSearchCV(new_geo_pipeline,
                                        param_distributions=param_distribs,
                                        n_iter=50,
                                        cv=3,
                                        scoring='neg_root_mean_squared_error',
                                        random_state=42)
new_geo_rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [None]:
-new_geo_rnd_search.best_score_

6) Try to implement the StandardScalerClone class again from scratch, then add support for the inverse_transform() method: executing scaler.inverse_transform(scaler.fit_transform(X)) should return an array very close to X. Then add support for feature names: set feature_names_in_ in the fit() method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the get_feature_names_out() method: it should have one optional input_features=None argument. If passed, the method should check that its length matches n_features_in_, and it should match feature_names_in_ if it is defined, then input_features should be returned. If input_features is None, then the method should return feature_names_in_ if it is defined or np.array(["x0", "x1", ...]) with length n_features_in_ otherwise.

In [None]:
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self,with_mean-True):
        self.with_mean = with_mean

    def fit(self,X,y=None):
        X_orig = X
        X = check_array(X)
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        if hasattr(X_orig, "columns"):
            self.feature_names_in_ = np.array(X_orig.columns, dtype=object)
        return self
    

    def transform(self,X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError("Unexpected number of features")
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_
    
    def inverse_transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError("Unexpected number of features")
        X = X * self.scale_
        return X + self.mean_ if self.with_mean else X
    
    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            return getattr(self, "feature_names_in_",
                           [f"x{i}" for i in range(self.n_features_in_)])
        else:
            if len(input_features) != self.n_features_in_:
                raise ValueError("Invalid number of features")
            if hasattr(self, "feature_names_in_") and not np.all(
                self.feature_names_in_ == input_features
            ):
                raise ValueError("input_features ≠ feature_names_in_")
            return input_features


In [None]:
from sklearn.utils.estimator_checks import check_estimator
 
check_estimator(StandardScalerClone())

In [None]:
np.random.seed(42)
X = np.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))

In [None]:
scaler = StandardScalerClone(with_mean=False)
X_scaled_uncentered = scaler.fit_transform(X)

assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))

In [None]:
scaler = StandardScalerClone()
X_back = scaler.inverse_transform(scaler.fit_transform(X))

assert np.allclose(X, X_back)

In [None]:
assert np.all(scaler.get_feature_names_out() == ["x0", "x1", "x2"])
assert np.all(scaler.get_feature_names_out(["a", "b", "c"]) == ["a", "b", "c"])

In [None]:
df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(df)

assert np.all(scaler.feature_names_in_ == ["a", "b"])
assert np.all(scaler.get_feature_names_out() == ["a", "b"])