# California housing
(Adapted from: Geron, A. (2017). Hands-on machine learning with Scikit-Learn and TensorFlow. O'Reilly.)

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

Import the data from *datasets/housing.csv* and print out some basic info about it.

In [None]:
# Load the raw housing table from the CSV file shipped next to the notebook.
housing = pd.read_csv(filepath_or_buffer="datasets/housing.csv")

In [None]:
# Show every row of `housing` that has at least one missing value
# (the original author notes that total_bedrooms contains some).
#   isna()       -> boolean frame, True where a cell is NaN
#   any(axis=1)  -> collapses each row to True if any of its cells is NaN
rows_with_missing = housing.isna().any(axis=1)
housing[rows_with_missing]

In [None]:
# Quick look at the raw table: first rows, summary statistics, schema,
# and the distribution of the only categorical column.
print(housing.head())
print(housing.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
housing.info()
print()
print(housing["ocean_proximity"].value_counts())

Plot the histograms of the features in the data set.

Save the image to a file.

In [None]:
# One histogram per numeric column; the resulting figure is then written to disk.
HIST_IMAGE = "housing_hist.png"
housing.hist(figsize=(20, 15), bins=50)
plt.savefig(HIST_IMAGE, bbox_inches="tight")

Split the data set into a learning set and a test set in a 70:30 ratio. Make the split stratified.

In [None]:
# Stratification needs a discrete label, but the target is continuous:
# cut median_house_value into 50 equally spaced bin edges and use the
# bin index of each row as the stratification key.
bins = np.linspace(0, housing["median_house_value"].max(), 50)
# Discretized Y values, one bin index per row of `housing`.
mhv = np.digitize(housing["median_house_value"], bins)
# 70:30 learning/test split, as the exercise statement asks for.
# random_state is fixed so the split is reproducible.
L, T = train_test_split(housing, test_size=0.3, random_state=42, stratify=mhv)
print("Learning set size: {:d}\nTest set size: {:d}".format(len(L), len(T)))

In [None]:
# Both figures are longitude/latitude scatter plots of the learning set.
geo_scatter = dict(kind="scatter", x="longitude", y="latitude")

# Figure 1: plain scatter with low alpha; this is the figure saved to disk.
L.plot(alpha=0.1, **geo_scatter)
plt.savefig("housing_lat_lon.png", bbox_inches="tight")

# Figure 2: marker size follows population, marker colour follows the target.
marker_sizes = L["population"] / 100
L.plot(alpha=0.4,
       s=marker_sizes, label="population",
       c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
       **geo_scatter)
plt.legend();

In [None]:
# Pairwise correlations between the numeric columns of the learning set,
# then the target's column of that matrix, strongest positive first.
corr_matrix = L.corr(numeric_only=True)
target_correlations = corr_matrix["median_house_value"]
target_correlations.sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

print("Computing correlations...")
# Target plus three hand-picked numeric attributes to plot against each other.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
# Trailing semicolon keeps the array-of-axes repr out of the cell output.
scatter_matrix(L[attributes], figsize=(12, 8));

In [None]:
# Manual feature construction (exploration only).
# The ratio columns are added to a separate frame with assign(), so L itself
# is never modified: the cell can stay enabled, is safe to re-run, and the
# later cells that derive `housing` from L do not pick up the extra columns.
L_with_ratios = L.assign(
    rooms_per_household=L["total_rooms"] / L["households"],
    bedrooms_per_room=L["total_bedrooms"] / L["total_rooms"],
    population_per_household=L["population"] / L["households"],
)
# Correlation of every numeric column (including the new ratios) with the target.
corr_with_ratios = L_with_ratios.corr(numeric_only=True)
corr_with_ratios["median_house_value"].sort_values(ascending=False)

In [None]:
# Prepare the data for Machine Learning algorithms.
# NOTE: from here on `housing` no longer refers to the full raw table but to
# the learning-set predictors only.
housing_labels = L["median_house_value"].copy()   # target values of the learning set
housing = L.drop(columns="median_house_value")    # predictors: everything but the target

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Positional indices of the columns used by add_extra_features, looked up by
# name in the current `housing` frame.
_housing_columns = list(housing.columns)
rooms_ix = _housing_columns.index("total_rooms")
bedrooms_ix = _housing_columns.index("total_bedrooms")
population_ix = _housing_columns.index("population")
household_ix = _housing_columns.index("households")

def add_extra_features(X):
    """Append two per-household ratio columns to a feature array.

    The column positions are read from the module-level names ``rooms_ix``,
    ``population_ix`` and ``household_ix``.

    Parameters
    ----------
    X : array supporting ``X[:, i]`` column indexing.

    Returns
    -------
    The columns of ``X`` followed by rooms-per-household and
    population-per-household, in that order.
    """
    households = X[:, household_ix]
    per_household = [X[:, ix] / households for ix in (rooms_ix, population_ix)]
    return np.c_[X, per_household[0], per_household[1]]

# Wrap the helper so it can be used like any other scikit-learn transformer.
attr_adder = FunctionTransformer(add_extra_features, validate=False)

# Column names after the transformer has run: the original columns of
# `housing` followed by the two ratios that add_extra_features appends.
all_attributes = [*housing.columns, "rooms_per_household", "population_per_household"]

# Apply the transformer to the raw values and put the result back into a
# labelled DataFrame.
housing_extra_attribs = pd.DataFrame(
    attr_adder.fit_transform(housing.values),
    columns=all_attributes,
)

In [None]:
from sklearn.compose import ColumnTransformer

# Split the predictors into the categorical column and the numeric rest.
cat_attribs = ["ocean_proximity"]
housing_num = housing.drop(cat_attribs, axis=1)
num_attribs = list(housing_num.columns)

# Numeric branch: fill missing values with the column median, append the
# per-household ratios, then standardize every column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", FunctionTransformer(add_extra_features, validate=False)),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Full preprocessing: numeric branch on the numeric columns, one-hot
# encoding on the categorical column.
column_branches = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(column_branches)
print("Finishing data preparation...")

# Fit the whole preprocessing on the learning-set predictors and report the
# shape of the resulting matrix.
housing_prepared = full_pipeline.fit_transform(housing)
print("{}".format(housing_prepared.shape))

In [None]:
# Explicit imports instead of `from sklearn.linear_model import *`, so it is
# clear where each estimator comes from and the namespace is not flooded.
from sklearn.linear_model import LassoLars, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors to compare. The order matters: the fine-tuning cell
# picks the random forest by position (models[3]).
models = [LinearRegression(),
          KNeighborsRegressor(n_neighbors=5),
          DecisionTreeRegressor(random_state=42),
          RandomForestRegressor(n_estimators=10, random_state=42),
          LassoLars(alpha=.1),
          Ridge(alpha=.5),
         ]

In [None]:
# Fit every candidate on the prepared learning set.
print("Learning: fitting the models to data...")
for estimator in models:
    estimator.fit(housing_prepared, housing_labels)
print("_______________________________________\n")

# RMSE of each fitted model on the same data it was trained on.
# (mean_squared_error is already imported in the notebook's first cell.)
print("Evaluating the models on training set")
for reg in models:
    model_name = type(reg).__name__
    housing_predictions = reg.predict(housing_prepared)
    training_mse = mean_squared_error(housing_labels, housing_predictions)
    rmse = np.sqrt(training_mse)
    print("{:23s}: {:.3f}".format(model_name, rmse))

In [None]:
from sklearn.model_selection import cross_val_score

print("Evaluating the models using internal cross-validation")
for reg in models:
    # The scorer returns negated MSE per fold; flip the sign before the root.
    scores = cross_val_score(
        reg,
        housing_prepared,
        housing_labels,
        cv=10,
        scoring="neg_mean_squared_error",
    )
    rmse_scores = np.sqrt(-scores)
    fold_mean = rmse_scores.mean()
    fold_std = rmse_scores.std()
    print("{:23s}: {:.3f}+-{:.3f}".format(type(reg).__name__, fold_mean, fold_std))

In [None]:
from sklearn.model_selection import GridSearchCV

print("Fine tuning the models with internal cross-validation")
# Two sub-grids for the random forest: the first with the estimator's default
# bootstrap setting, the second with bootstrap explicitly switched off.
param_grid = [
              {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
             ]
# models[3] is the RandomForestRegressor entry of the model list.
grid_search = GridSearchCV(models[3], param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

# Mean CV score per parameter combination; scores are negated MSE, so flip
# the sign and take the root to report RMSE.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

print(grid_search.best_params_)

# Feature names must follow the column order of housing_prepared, i.e. the
# output order of full_pipeline: numeric columns, then the two ratios added
# by add_extra_features, then one column per ocean_proximity category.
# (The columns of housing_extra_attribs are NOT in that order and lack the
# one-hot columns, so they cannot be used as labels here.)
extra_attribs = ["rooms_per_household", "population_per_household"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
prepared_attribs = num_attribs + extra_attribs + cat_one_hot_attribs

feature_importances = grid_search.best_estimator_.feature_importances_
# Guard against silently truncated or shifted labels in zip() below.
assert len(prepared_attribs) == len(feature_importances), \
    "feature names do not match the columns of housing_prepared"
for i in sorted(zip(feature_importances, prepared_attribs), reverse=True):
    print(i)

In [None]:
print("Final testing...")
# Best random forest found by the grid search.
final_model = grid_search.best_estimator_

# Separate target and predictors of the held-out test set.
y_test = T["median_house_value"].copy()
X_test = T.drop(columns="median_house_value")

# transform() only: the preprocessing was already fitted on the learning set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print("RMSE:{:10.2f}".format(final_rmse))