In [12]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pickle

# Seed handed to train_test_split (as random_state) further down in the notebook.
RANDOM_STATE = 42

In [13]:
# Load the California housing data. Later cells read .DESCR, .data,
# .feature_names and .target from the returned object.
dataset = fetch_california_housing()

In [14]:
# Show the dataset's own description (feature list, provenance, target meaning).
print(dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [15]:
# Tabular view of the features for inspection only: the model cells below
# train on the raw arrays (dataset.data / dataset.target), not on this frame.
dataset_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)

In [16]:
# Check column dtypes and non-null counts before modelling.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [17]:
# Peek at the first rows to sanity-check values and column order.
dataset_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [18]:
# Per-column summary statistics; compare each max against its 75% quantile
# to spot heavy-tailed columns.
dataset_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [19]:
# Full feature matrix and target; these are also reused later for the final fit.
X_data = dataset.data
y_data = dataset.target

# Hold out 20% of the rows for evaluation, seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (16512, 8)
X_test shape: (4128, 8)


In [20]:
def create_model(k=8, C=10, epsilon=0.1, gamma=1.0, kernel='rbf'):
    """Build a new, unfitted scale -> select -> SVR regression pipeline.

    Called with no arguments this returns exactly the configuration used
    throughout the notebook; the keyword arguments only exist so that other
    settings can be tried without editing the function body.

    Parameters
    ----------
    k : int or 'all', default=8
        Forwarded to ``SelectKBest(score_func=f_regression, k=k)``. With the
        8-column feature matrix used in this notebook the default keeps
        every feature.
    C : float, default=10
        Forwarded to ``SVR``.
    epsilon : float, default=0.1
        Forwarded to ``SVR``.
    gamma : float or str, default=1.0
        Forwarded to ``SVR``.
    kernel : str, default='rbf'
        Forwarded to ``SVR``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fresh pipeline with the steps ``'scaler'``, ``'selector'`` and
        ``'svr'``, in that order. Each call builds new estimator objects.
    """
    return Pipeline([
        # Standardise the features before they reach the selector and the SVR.
        ('scaler', StandardScaler()),
        ('selector', SelectKBest(score_func=f_regression, k=k)),
        ('svr', SVR(C=C, epsilon=epsilon, gamma=gamma, kernel=kernel)),
    ])

In [21]:
# Unfitted pipeline used for the hold-out evaluation.
model = create_model()

In [22]:
# Fit on the training split only, so X_test / y_test stay unseen for scoring.
model.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('selector',
                 SelectKBest(k=8,
                             score_func=<function f_regression at 0x7f837762fee0>)),
                ('svr', SVR(C=10, gamma=1.0))])

In [23]:
# Score the train-only fit on the held-out split (mean squared error).
y_pred_test = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
print(test_mse)

0.3139917951791932


In [24]:
# Start again from a fresh, unfitted pipeline for the final model.
# NOTE: this rebinds `model`, so the train-only fit evaluated above is discarded.
model = create_model()

In [25]:
# Final fit on every row (train + test). The MSE printed earlier was measured
# on the train-only fit, so it is only an estimate for this model.
model.fit(X_data, y_data)

Pipeline(steps=[('scaler', StandardScaler()),
                ('selector',
                 SelectKBest(k=8,
                             score_func=<function f_regression at 0x7f837762fee0>)),
                ('svr', SVR(C=10, gamma=1.0))])

In [26]:
# Serialise the pipeline fitted on the full dataset to disk with pickle.
# REF: https://www.kaggle.com/prmohanty/python-how-to-save-and-load-ml-models
# Only load this file back from a trusted location: unpickling can execute
# arbitrary code.
model_name = 'model_california_housing.pkl'

with open(model_name, mode='wb') as model_file:
    pickle.dump(model, model_file)

print(model_name, 'saved.')

model_california_housing.pkl saved.
