# Actual implementation of the model

In [73]:
import os

import pandas as pd


# The dataset sits in the kagglehub cache under the user's home directory.
# Expanding "~" (instead of hardcoding /home/jadosh) keeps the notebook
# runnable for any user while resolving to the same file for the original one.
file_path = os.path.expanduser(
    "~/.cache/kagglehub/datasets/camnugent/california-housing-prices/versions/1/housing.csv"
)
housing = pd.read_csv(file_path)
print(housing.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


# Creating stratified splits

In [74]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Bucket median income into five categories that are used only to stratify
# the split. Keeping them in a standalone Series (rather than a temporary
# column on `housing`) means no DataFrame has to be mutated and no column has
# to be dropped in place afterwards.
income_cat = pd.cut(housing["median_income"],
                    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                    labels=[1, 2, 3, 4, 5])

# Stratified 80/20 split by income category
strat_train_set, strat_test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
    stratify=income_cat,
)

# NOTE: `housing` is rebound to the training features here, so this cell must
# be run right after the raw data has been (re)loaded.
housing = strat_train_set.drop(columns="median_house_value")
housing_labels = strat_train_set["median_house_value"].copy()


In [75]:
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Map samples to their RBF similarity with k-means cluster centers.

    ``fit`` clusters the input with ``KMeans``; ``transform`` then returns,
    for every sample, ``rbf_kernel`` evaluated against each learned center.

    Parameters
    ----------
    n_clusters : int, default=10
        Number of clusters passed to ``KMeans``.
    gamma : float, default=0.1
        ``gamma`` argument forwarded to ``rbf_kernel``.
    random_state : optional
        Forwarded to ``KMeans``.
    """

    def __init__(self, n_clusters=10, gamma=0.1, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on ``X`` (optionally weighted) and return ``self``."""
        self.kmeans_ = KMeans(n_clusters=self.n_clusters,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF kernel between ``X`` and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one ``"Cluster <i> similarity"`` name per configured cluster."""
        return [
            f"Cluster {cluster_idx} similarity"
            for cluster_idx in range(self.n_clusters)
        ]
    
# Cluster-similarity transformer: 10 clusters, RBF gamma of 1.0, fixed seed.
cluster_simil = ClusterSimilarity(
    n_clusters=10,
    gamma=1.0,
    random_state=42,
)

In [76]:
class IsInlandTransformer(BaseEstimator, TransformerMixin):
    """Encode a categorical column as a 0/1 "is <category_name>" flag.

    Missing values are first replaced by the most frequent value learned in
    ``fit``; the first input column is then compared with ``category_name``.

    Parameters
    ----------
    category_name : str, default='INLAND'
        Category value that maps to 1; every other value maps to 0.
    """

    def __init__(self, category_name='INLAND'):
        self.category_name = category_name

    def fit(self, X, y=None):
        """Learn the most frequent value of each column and return ``self``."""
        self.imputer_ = SimpleImputer(strategy="most_frequent")
        self.imputer_.fit(X)
        return self

    def transform(self, X):
        """Return a one-column DataFrame of 0/1 flags for the first column of ``X``.

        When ``X`` is a pandas object its index is reused for the result, so
        the flags stay row-aligned with ``X`` if the output is later
        concatenated with other DataFrames. Otherwise a default integer
        index is used.
        """
        X_imputed = self.imputer_.transform(X)
        # Only pandas inputs carry row labels; a plain `getattr(X, "index")`
        # would wrongly pick up e.g. `list.index`.
        row_index = X.index if isinstance(X, (pd.DataFrame, pd.Series)) else None
        # Access the single column
        ocean_proximity_series = pd.Series(X_imputed[:, 0], index=row_index)
        is_category = (ocean_proximity_series == self.category_name).astype(int)
        return is_category.to_frame(name=self.get_feature_names_out()[0])

    def get_feature_names_out(self, input_features=None):
        """Return the single output feature name, ``is_<category_name lowercased>``."""
        return [f'is_{self.category_name.lower()}']

In [77]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


# Column names grouped by kind.
num_attribs = [
    "longitude", "latitude", "housing_median_age", "total_rooms",
    "total_bedrooms", "population", "households", "median_income",
]
cat_attribs = ["ocean_proximity"]

# Numeric handling: fill gaps with the median, then standardize.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical handling: a single 0/1 flag produced by IsInlandTransformer.
cat_pipeline = make_pipeline(IsInlandTransformer())

In [78]:
from sklearn.compose import make_column_selector

def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, keeping a 2-D single-column result."""
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the fixed output feature name list ``["ratio"]``; both arguments are ignored."""
    output_names = ["ratio"]
    return output_names

# Builds the pipeline that turns two input features into one new ratio feature
def ratio_pipeline():
    """Return a new pipeline: median imputation -> column 0 / column 1 -> standardization."""
    impute = SimpleImputer(strategy="median")
    divide = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scale = StandardScaler()
    return make_pipeline(impute, divide, scale)

# Applied to every column not explicitly listed in the ColumnTransformer below.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Median imputation, natural log, then standardization.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Geographic similarity features (defined here so this cell is self-contained).
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1.0, random_state=42)

# Full feature-engineering step: three ratio features, log-scaled counts and
# income, cluster similarities for the coordinates, and the categorical flag.
preprocessing = ColumnTransformer(
    transformers=[
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

In [79]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# End-to-end model: feature engineering followed by a random forest.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(n_jobs=-1, random_state=42)),
])

# Search space: number of geographic clusters and the forest's max_features.
param_distribs = {
    "preprocessing__geo__n_clusters": randint(low=3, high=50),
    "random_forest__max_features": randint(low=2, high=20),
}

# 10 sampled configurations, 3-fold cross-validation, scored by negative RMSE.
rnd_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distribs,
    n_iter=10,
    cv=3,
    scoring="neg_root_mean_squared_error",
    random_state=42,
)

rnd_search.fit(housing, housing_labels)

In [80]:
# Best pipeline found by the randomized search and its forest's importances.
final_model = rnd_search.best_estimator_
feature_importances = final_model["random_forest"].feature_importances_

# Pair each importance with its feature name and show the largest first.
sorted(zip(feature_importances,
           final_model["preprocessing"].get_feature_names_out()),
       reverse=True)


[(np.float64(0.19954956605876098), 'log__median_income'),
 (np.float64(0.06791734012863442), 'cat__is_inland'),
 (np.float64(0.06298972221380393), 'bedrooms__ratio'),
 (np.float64(0.0574323301818771), 'rooms_per_house__ratio'),
 (np.float64(0.04917711225534545), 'people_per_house__ratio'),
 (np.float64(0.044703169219904436), 'geo__Cluster 43 similarity'),
 (np.float64(0.02379496790720735), 'geo__Cluster 6 similarity'),
 (np.float64(0.02244167149634949), 'geo__Cluster 10 similarity'),
 (np.float64(0.02230849377986096), 'geo__Cluster 2 similarity'),
 (np.float64(0.01831333753000982), 'geo__Cluster 24 similarity'),
 (np.float64(0.01728021403600396), 'geo__Cluster 11 similarity'),
 (np.float64(0.01671893493529711), 'geo__Cluster 38 similarity'),
 (np.float64(0.01633611808605413), 'geo__Cluster 26 similarity'),
 (np.float64(0.016024002711570895), 'geo__Cluster 13 similarity'),
 (np.float64(0.015234574737762357), 'geo__Cluster 22 similarity'),
 (np.float64(0.015037777966339271), 'geo__Cluste

In [82]:
from sklearn.metrics import root_mean_squared_error

# Evaluate the selected model on the held-out test split.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

final_predictions = final_model.predict(X_test)
final_rmse = root_mean_squared_error(y_true=y_test, y_pred=final_predictions)
print(final_rmse)


39785.84482518517


Now we launch, monitor, and maintain the system.

In [83]:
import joblib

# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline contains classes/functions defined in this
# notebook (ClusterSimilarity, IsInlandTransformer, column_ratio, ratio_name);
# presumably they must be defined or importable before `joblib.load` - verify.
joblib.dump(value=final_model, filename="my_california_housing_model.pkl")

['my_california_housing_model.pkl']