# Using sklearn for predicting house prices in California

Based on:
- https://www.kaggle.com/code/amritvirsinghx/step-by-step-ml-california-housing-prices/notebook
- https://www.kaggle.com/code/ilialar/california-housing-analysis-and-preciction#10.-Plotting-training-and-validation-curves

## Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

## Import data

In [2]:
housing = pd.read_csv("../data/raw/housing.csv")

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Split datasets

In [5]:
train_df, test_df = train_test_split(housing,shuffle = True, test_size = 0.25, random_state=17)

print(train_df.shape)
print(test_df.shape)

(15480, 10)
(5160, 10)


# FEATURE ENGINEERING

### Encode ocean proximity

In [6]:
ocean_proximity_dummies = pd.get_dummies(pd.concat([train_df['ocean_proximity'],test_df['ocean_proximity']]),
                                         drop_first=True)

In [7]:
dummies_names=list(ocean_proximity_dummies.columns)

In [8]:
train_df=pd.concat([train_df,ocean_proximity_dummies[:train_df.shape[0]]], axis=1 )
test_df=pd.concat([test_df,ocean_proximity_dummies[train_df.shape[0]:]], axis=1 )

train_df=train_df.drop(['ocean_proximity'], axis=1)
test_df=test_df.drop(['ocean_proximity'], axis=1)

In [8]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
3400,-118.35,34.28,30.0,3214.0,513.0,1700.0,533.0,4.6944,248200.0,0,0,0,0
9159,-118.45,34.44,16.0,13406.0,2574.0,7030.0,2440.0,4.6861,187900.0,0,0,0,0
10592,-117.78,33.69,16.0,3400.0,501.0,1575.0,488.0,6.0961,295500.0,0,0,0,0
4281,-118.32,34.09,27.0,210.0,98.0,332.0,112.0,2.5556,175000.0,0,0,0,0
230,-122.2,37.79,40.0,1060.0,256.0,667.0,235.0,4.1739,169600.0,0,0,1,0


### Impute missing values

In [16]:
imputer=SimpleImputer(strategy="median")
imputer.fit(train_df)

SimpleImputer(strategy='median')

In [17]:
# use this imputer to transform
train_df_imputed=imputer.transform(train_df)
test_df_imputed=imputer.transform(test_df)

In [18]:
train_df=pd.DataFrame(train_df_imputed,columns=train_df.columns)
test_df=pd.DataFrame(test_df_imputed,columns=test_df.columns)

In [26]:
train_df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
INLAND                0
ISLAND                0
NEAR BAY              0
NEAR OCEAN            0
distance_to_SF        0
distance_to_LA        0
dtype: int64

### Derive distance to San Francisco and Los Angeles

In [20]:
# derive features with distance to LA and SF
sf_coord=[-122.4194, 37.7749]
la_coord=[-118.2437, 34.0522]

train_df['distance_to_SF']=round(np.sqrt((train_df['longitude']-sf_coord[0])**2+(train_df['latitude']-sf_coord[1])**2),2)
test_df['distance_to_SF']=round(np.sqrt((test_df['longitude']-sf_coord[0])**2+(test_df['latitude']-sf_coord[1])**2),2)

train_df['distance_to_LA']=round(np.sqrt((train_df['longitude']-la_coord[0])**2+(train_df['latitude']-la_coord[1])**2),2)
test_df['distance_to_LA']=round(np.sqrt((test_df['longitude']-la_coord[0])**2+(test_df['latitude']-la_coord[1])**2),2)

In [14]:
train_df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,distance_to_SF,distance_to_LA
0,-118.35,34.28,30.0,3214.0,513.0,1700.0,533.0,4.6944,248200.0,0.0,0.0,0.0,0.0,5.36,0.25
1,-118.45,34.44,16.0,13406.0,2574.0,7030.0,2440.0,4.6861,187900.0,0.0,0.0,0.0,0.0,5.18,0.44
2,-117.78,33.69,16.0,3400.0,501.0,1575.0,488.0,6.0961,295500.0,0.0,0.0,0.0,0.0,6.18,0.59
3,-118.32,34.09,27.0,210.0,98.0,332.0,112.0,2.5556,175000.0,0.0,0.0,0.0,0.0,5.51,0.09
4,-122.2,37.79,40.0,1060.0,256.0,667.0,235.0,4.1739,169600.0,0.0,0.0,1.0,0.0,0.22,5.44
5,-118.12,34.03,20.0,2595.0,428.0,1751.0,479.0,5.6112,308000.0,0.0,0.0,0.0,0.0,5.7,0.13
6,-117.95,33.63,29.0,1496.0,282.0,463.0,215.0,6.0516,500001.0,0.0,0.0,0.0,0.0,6.1,0.51
7,-119.45,35.16,34.0,3437.0,696.0,1783.0,608.0,2.3912,52900.0,1.0,0.0,0.0,0.0,3.96,1.64
8,-117.92,34.63,34.0,81.0,26.0,53.0,14.0,1.4091,137500.0,1.0,0.0,0.0,0.0,5.49,0.66
9,-117.11,32.75,34.0,2131.0,594.0,1373.0,562.0,2.113,102100.0,0.0,0.0,0.0,1.0,7.31,1.73


### Scale numeric features

In [22]:
numerical_features=list(train_df.columns)
numerical_features.remove('median_house_value')
print(numerical_features)

features_to_scale=numerical_features

scaler = StandardScaler()

X_train_scaled=pd.DataFrame(scaler.fit_transform(train_df[features_to_scale]),
                            columns=features_to_scale, index=train_df.index)
X_test_scaled=pd.DataFrame(scaler.transform(test_df[features_to_scale]),
                           columns=features_to_scale, index=test_df.index)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN', 'distance_to_SF', 'distance_to_LA']


In [24]:
X_train_scaled.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,distance_to_SF,distance_to_LA
0,0.609553,-0.6337,0.105068,0.267195,-0.053828,0.247422,0.092293,0.424206,-0.683491,-0.013923,-0.355746,-0.384079,0.593053,-0.993913
1,0.559735,-0.558979,-1.003622,4.990733,4.923272,5.005839,5.12421,0.419866,-0.683491,-0.013923,-0.355746,-0.384079,0.521083,-0.915359
2,0.893517,-0.909234,-1.003622,0.353398,-0.082807,0.135827,-0.026446,1.157171,-0.683491,-0.013923,-0.355746,-0.384079,0.920914,-0.853343
3,0.624499,-0.722431,-0.132509,-1.125025,-1.056009,-0.973875,-1.018581,-0.694198,-0.683491,-0.013923,-0.355746,-0.384079,0.653027,-1.060063
4,-1.308445,1.005496,0.896989,-0.731088,-0.674456,-0.6748,-0.694026,0.15203,-0.683491,-0.013923,2.810991,-0.384079,-1.462079,1.15184
5,0.724135,-0.750452,-0.686854,-0.019684,-0.259094,0.292953,-0.050194,0.903611,-0.683491,-0.013923,-0.355746,-0.384079,0.728995,-1.043526
6,0.808826,-0.937255,0.025876,-0.529021,-0.611669,-0.856923,-0.746799,1.133901,-0.683491,-0.013923,-0.355746,-0.384079,0.888928,-0.886419
7,0.061554,-0.222733,0.421836,0.370546,0.388098,0.321521,0.290192,-0.780164,1.463078,-0.013923,-0.355746,-0.384079,0.033289,-0.419232
8,0.823771,-0.470247,0.421836,-1.184811,-1.229882,-1.222955,-1.277169,-1.293716,1.463078,-0.013923,-0.355746,-0.384079,0.645031,-0.824403
9,1.227298,-1.348221,0.421836,-0.234727,0.141779,-0.044511,0.168814,-0.925638,-0.683491,-0.013923,-0.355746,2.603631,1.372723,-0.382022


# MODEL TRAINING AND EVALUATION

Let's prepare cross validation samples. As far as there are not a lot of data we can easily divide it on 10 folds, that are taken from shuffled train data. Within every split we will train our model on 90% of train data and compute CV metric on the other 10%.

We fix the random state for the reproducibility.

In [25]:
kf = KFold(n_splits=10, random_state=17, shuffle=True)

In [27]:
model=Ridge(alpha=1)
X=train_df[numerical_features]
y=train_df['median_house_value']
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
print(np.sqrt(-cv_scores.mean()))

68302.45387901775


In [29]:
features_for_trees=['INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN',
       'longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'distance_to_SF', 'distance_to_LA']  

In [32]:
X_trees=X[features_for_trees]

model_rf=RandomForestRegressor(n_estimators=100, random_state=17)
cv_scores = cross_val_score(model_rf, X_trees, y, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)

print(np.sqrt(-cv_scores.mean()))

47976.25441081968


We can see significant improvement, comparing to the linear model and higher n_estimator probably will help. But first, let's try to tune other hyperparametres:

In [33]:
param_grid={'n_estimators': [100],
            'max_depth':  [22, 23, 24, 25],
            'max_features': [5,6,7,8]}

gs=GridSearchCV(model_rf, param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=kf, verbose=1)

gs.fit(X_trees,y)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


GridSearchCV(cv=KFold(n_splits=10, random_state=17, shuffle=True),
             estimator=RandomForestRegressor(random_state=17), n_jobs=-1,
             param_grid={'max_depth': [22, 23, 24, 25],
                         'max_features': [5, 6, 7, 8], 'n_estimators': [100]},
             scoring='neg_mean_squared_error', verbose=1)

In [34]:
print(np.sqrt(-gs.best_score_))

45834.08928011807


In [35]:
gs.best_params_

{'max_depth': 24, 'max_features': 5, 'n_estimators': 100}

In [36]:
best_depth=gs.best_params_['max_depth']
best_features=gs.best_params_['max_features']

In [37]:
%%time
model_rf=RandomForestRegressor(n_estimators=100, max_depth=best_depth, max_features=best_features, random_state=17)
cv_scores = cross_val_score(model_rf, X_trees, y, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)

print(np.sqrt(-cv_scores.mean()))

45834.08928011807
CPU times: user 29.4 ms, sys: 3.86 ms, total: 33.3 ms
Wall time: 16 s
