**Excercise 1: Train a support vector machine regressor**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
import tarfile
import urllib.request

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()

In [5]:
# Generate the income_cat feature in order to use it for Stratified sampling
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

In [8]:
# Split the data into a training set and a test set using stratified sampling
from sklearn.model_selection import train_test_split

strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42)

In [9]:
# Verify the proportions of the different values of the income_cat feature
# have been preserved in the splits
print(strat_train_set["income_cat"].value_counts() / len(strat_train_set))
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
print(housing["income_cat"].value_counts() / len(housing))

income_cat
3    0.350594
2    0.318859
4    0.176296
5    0.114462
1    0.039789
Name: count, dtype: float64
income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64
income_cat
3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: count, dtype: float64


In [10]:
# Drop the income_cat column as it is not needed anymore
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [11]:
# Separate the labels
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 13096 to 19888
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16344 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [12]:
housing_labels.info()

<class 'pandas.core.series.Series'>
Index: 16512 entries, 13096 to 19888
Series name: median_house_value
Non-Null Count  Dtype  
--------------  -----  
16512 non-null  float64
dtypes: float64(1)
memory usage: 258.0 KB


In [13]:
# Data preparation
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.cluster import KMeans

sklearn.set_config(display="diagram")

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler()
    )

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler()
)
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
],
remainder=default_num_pipeline)  # one column remaining: housing_median_age
# TODO transform also housing_median_age using rbf_kernel

preprocessing

In [14]:
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

(16512, 24)

In [15]:
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

In [16]:
# Try an SVM regressor with Grid Search.
# Train on just the first 5000 instances of the training set or else
# it could take hours
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("svm", SVR()),
])
param_grid = [
    {'svm__kernel': ['linear'],
     'svm__C':[0.1, 1., 10., 100.]},
     {'svm__kernel': ['rbf'],
      'svm__C': [0.1, 1., 10.],
      'svm__gamma': ['scale', 'auto', 0.1, 1.]}
]
grid_search  = GridSearchCV(full_pipeline, param_grid, cv=3,
                            scoring='neg_root_mean_squared_error')
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [17]:
grid_search.best_params_

{'svm__C': 100.0, 'svm__kernel': 'linear'}

In [18]:
grid_search.best_score_

-79576.86328472171

In [19]:
# Let's try again with more cross validations
grid_search  = GridSearchCV(full_pipeline, param_grid, cv=10,
                            scoring='neg_root_mean_squared_error')
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [20]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'svm__C': 100.0, 'svm__kernel': 'linear'}
-77750.4973277525


In [21]:
# Let's try with more training data
grid_search  = GridSearchCV(full_pipeline, param_grid, cv=3,
                            scoring='neg_root_mean_squared_error')
grid_search.fit(housing.iloc[:10000], housing_labels.iloc[:10000])

In [22]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'svm__C': 100.0, 'svm__kernel': 'linear'}
-102477.49078978998


In [23]:
# Let's try with the full data and 10 cross validations
grid_search  = GridSearchCV(full_pipeline, param_grid, cv=10,
                            scoring='neg_root_mean_squared_error')
grid_search.fit(housing, housing_labels)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'svm__C': 100.0, 'svm__kernel': 'linear'}
-78085.77288593627


**Excercise 2: Replace GridSearch with RandomizedSearch**

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distribs = {'svm__C': uniform(loc=0., scale=100000.),
                  'svm__kernel': ['linear']}
rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=100, cv=3,
    scoring='neg_root_mean_squared_error', random_state=42
)
rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])
print(rnd_search.best_params_)
print(rnd_search.best_score_)

{'svm__C': 17052.412368729154, 'svm__kernel': 'linear'}
-69843.42598445255


**Excercise 4: Create a custom transformer using k-nearest neighbors**

In [40]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_array, check_is_fitted

class KNeighborsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors=3, random_state=None):
        self.n_neighbors = n_neighbors
        self.random_state = random_state

    def fit(self, X, y):
        X = check_array(X)
        y = check_array(y, ensure_2d=False)
        self.kneihbors_ = KNeighborsRegressor(self.n_neighbors)
        self.kneihbors_.fit(X, y)
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return self.kneihbors_.predict(X).reshape(-1, 1)

    def get_feature_names_out(self, names=None):
        return ["nearest_districts_median_house_value"]

In [41]:
kneihbors_transformer = KNeighborsTransformer(n_neighbors=3, random_state=42)
nearest_district_prices = kneihbors_transformer.fit_transform(housing[["latitude", "longitude"]], housing_labels)


In [42]:
nearest_district_prices[:3].round(2)

array([[456667.  ],
       [356133.33],
       [105100.  ]])

In [43]:
# Add the new feature to the preprocessing pipeline

preprocessing = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(), ["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                           "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("neighbors", kneihbors_transformer, ["latitude", "longitude"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
],
remainder=default_num_pipeline)  # one column remaining: housing_median_age
# TODO transform also housing_median_age using rbf_kernel

preprocessing

In [44]:
housing_prepared = preprocessing.fit_transform(housing, housing_labels)
housing_prepared.shape

(16512, 25)

In [45]:
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity',
       'neighbors__nearest_districts_median_house_value',
       'cat__ocean_proximity_<1H OCEAN', 'cat__ocean_proximity_INLAND',
       'cat__ocean_proximity_ISLAND', 'cat__ocean_proximity_NEAR BAY',
       'cat__ocean_proximity_NEAR OCEAN', 'remainder__housing_median_age'],
      dtype=object)

**Excercise 3: Select only the most important attributes using SelectFromModel**

In [52]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

selector = make_pipeline(preprocessing, SelectFromModel(RandomForestRegressor(random_state=42),
                                                        threshold="median"))
selector.fit(housing, housing_labels)
housing_most_important = selector.transform(housing)

In [53]:
selector.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 9 similarity',
       'neighbors__nearest_districts_median_house_value',
       'remainder__housing_median_age'], dtype=object)

**Excercise 5: Explore preparation options with GridSearchCV**

In [59]:
selector.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',
                                                SimpleImputer(strategy='median')),
                                               ('standardscaler',
                                                StandardScaler())]),
                     transformers=[('bedrooms',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('functiontransformer',
                                                     FunctionTransformer(feature_names_out=<function ratio_name at 0x155d1bd00>,
                                                                         func=<function column_ratio at 0...
                                   ('geo', ClusterSimilarity(random_state=42),
                                    ['latitude', 'longitude']),
    

In [60]:
from sklearn.linear_model import LinearRegression

full_pipeline = Pipeline([
    ("selector", selector),
    ("linreg", LinearRegression())
])
param_grid = [
    {'selector__columntransformer__geo__n_clusters': [5, 8, 10],
     'selector__selectfrommodel__max_features':[5, 10]},
     {'selector__columntransformer__neighbors__n_neighbors': [3, 5],
      'selector__selectfrommodel__threshold': ['mean', 'median']}
]
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(housing, housing_labels)

In [61]:
print(grid_search.best_params_)
print(grid_search.best_score_)

{'selector__columntransformer__neighbors__n_neighbors': 5, 'selector__selectfrommodel__threshold': 'median'}
-49484.79487504853


In [65]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_selector__columntransformer__geo__n_clusters,param_selector__selectfrommodel__max_features,param_selector__columntransformer__neighbors__n_neighbors,param_selector__selectfrommodel__threshold,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,13.04678,0.110196,0.028884,0.00381,,,5.0,median,{'selector__columntransformer__neighbors__n_ne...,-47511.08766,-50055.27302,-50888.023945,-49484.794875,1436.43277,1
7,12.961674,0.060332,0.031705,0.002793,,,3.0,median,{'selector__columntransformer__neighbors__n_ne...,-49856.977705,-51909.076744,-52215.401223,-51327.151891,1047.065033,2
3,11.612555,0.1772,0.033461,0.000679,8.0,10.0,,,{'selector__columntransformer__geo__n_clusters...,-50104.830867,-52267.719348,-52590.539192,-51654.363136,1103.582304,3
1,9.814586,0.094963,0.036838,0.005229,5.0,10.0,,,{'selector__columntransformer__geo__n_clusters...,-50230.564674,-52344.157786,-52516.646091,-51697.12285,1039.40134,4
5,12.926062,0.016529,0.037674,0.009108,10.0,10.0,,,{'selector__columntransformer__geo__n_clusters...,-50216.512186,-52344.140973,-52589.261653,-51716.63827,1065.459131,5


**Excercise 6: Reimplement StandardScalerClone with support for inverse_transform() and others**

In [136]:
from pandas.core.frame import DataFrame

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        if type(X) == DataFrame:
            self.feature_names_in_ = X.columns.to_numpy()
            assert self.feature_names_in_.size == self.n_features_in_
        return self
    
    def transform(self, X):
        check_is_fitted(self)
        assert self.n_features_in_== X.shape[1]
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_
    
    def inverse_transform(self, X):
        check_is_fitted(self)
        assert self.n_features_in_== X.shape[1]
        X = X * self.scale_
        if self.with_mean:
            X = X + self.mean_
        return X
    
    def get_feature_names_out(self, input_features=None):
        feature_names_out = None
        if input_features:
            if hasattr(self, 'feature_names_in_'):
                assert np.all(self.feature_names_in_ == input_features)
                assert len(input_features) == self.n_features_in_
            feature_names_out = input_features
        elif hasattr(self, 'feature_names_in_'):
            feature_names_out = self.feature_names_in_
        else:
            feature_names_out = np.array([f"x{i}" for i in range(self.n_features_in_)])
        return feature_names_out



In [137]:
scaler = StandardScalerClone()
labels_scaled = scaler.fit_transform(housing_labels.to_frame())
labels_inv = scaler.inverse_transform(labels_scaled)
print(np.allclose(housing_labels.to_frame(), labels_inv))
print(scaler.feature_names_in_)

True
['median_house_value']


In [138]:
scaler.get_feature_names_out()

array(['median_house_value'], dtype=object)

In [139]:
scaler = StandardScalerClone()
scaler.fit(housing[["latitude", "longitude"]])
scaler.get_feature_names_out()

array(['latitude', 'longitude'], dtype=object)

In [140]:
scaler.get_feature_names_out(input_features=["lat", "long"])

AssertionError: 

In [141]:
X = np.random.rand(1000, 3)
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)
X_back = scaler.inverse_transform(X_scaled)
np.allclose(X, X_back)

True

In [142]:
scaler.get_feature_names_out()

array(['x0', 'x1', 'x2'], dtype='<U2')

In [143]:
scaler.get_feature_names_out(input_features=["a", "b", "c"])

['a', 'b', 'c']