In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/california-housing-prices/housing.csv


In [9]:
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('/kaggle/input/california-housing-prices/housing.csv')
print(data.shape)
data.head()

(20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


# Nan values

In [3]:
data.isnull().mean()

longitude             0.000000
latitude              0.000000
housing_median_age    0.000000
total_rooms           0.000000
total_bedrooms        0.010029
population            0.000000
households            0.000000
median_income         0.000000
median_house_value    0.000000
ocean_proximity       0.000000
dtype: float64

Using KNN imputer for imputing missing values, but before that we will have to convert categorical values

In [4]:
data_dummies = pd.get_dummies(data)
print(data_dummies.shape)
data_dummies.head()

(20640, 14)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [10]:
imputer = KNNImputer(n_neighbors=5)
data_imputed = imputer.fit_transform(data_dummies)

In [11]:
data_imputed = pd.DataFrame(data_imputed,columns = data_dummies.columns)
data_imputed.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# check for missing values
data_imputed.isnull().mean()

longitude                     0.0
latitude                      0.0
housing_median_age            0.0
total_rooms                   0.0
total_bedrooms                0.0
population                    0.0
households                    0.0
median_income                 0.0
median_house_value            0.0
ocean_proximity_<1H OCEAN     0.0
ocean_proximity_INLAND        0.0
ocean_proximity_ISLAND        0.0
ocean_proximity_NEAR BAY      0.0
ocean_proximity_NEAR OCEAN    0.0
dtype: float64

There are no missing values

## Splitting train and test

In [13]:
X_train,X_test,y_train,y_test = train_test_split(data_imputed.drop('median_house_value',axis=1),
                                                 data_imputed['median_house_value'],test_size = 0.3,random_state=1)
print(X_train.shape,X_test.shape)

(14448, 13) (6192, 13)


# Training the model 


In [18]:
model = RandomForestRegressor()
search = GridSearchCV(model,param_grid={'n_estimators':[100,400,1000,2000],'max_depth':[2,4,7,9]},verbose = 2)
search.fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] max_depth=2, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................... max_depth=2, n_estimators=100, total=   1.4s
[CV] max_depth=2, n_estimators=100 ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] .................... max_depth=2, n_estimators=100, total=   1.4s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   1.5s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   1.4s
[CV] max_depth=2, n_estimators=100 ...................................
[CV] .................... max_depth=2, n_estimators=100, total=   1.5s
[CV] max_depth=2, n_estimators=400 ...................................
[CV] .................... max_depth=2, n_estimators=400, total=   5.8s
[CV] max_depth=2, n_estimators=400 ...................................
[CV] .................... max_depth=2, n_estimators=400, total=   5.7s
[CV] max_depth=2, n_estimators=400 ...................................
[CV] .................... max_depth=2, n_estimators=400, total=   5.7s
[CV] max_depth=2, n_estimators=400 ...................................
[CV] .

[CV] ................... max_depth=7, n_estimators=2000, total= 1.4min
[CV] max_depth=9, n_estimators=100 ...................................
[CV] .................... max_depth=9, n_estimators=100, total=   5.5s
[CV] max_depth=9, n_estimators=100 ...................................
[CV] .................... max_depth=9, n_estimators=100, total=   5.2s
[CV] max_depth=9, n_estimators=100 ...................................
[CV] .................... max_depth=9, n_estimators=100, total=   5.4s
[CV] max_depth=9, n_estimators=100 ...................................
[CV] .................... max_depth=9, n_estimators=100, total=   5.2s
[CV] max_depth=9, n_estimators=100 ...................................
[CV] .................... max_depth=9, n_estimators=100, total=   5.1s
[CV] max_depth=9, n_estimators=400 ...................................
[CV] .................... max_depth=9, n_estimators=400, total=  20.5s
[CV] max_depth=9, n_estimators=400 ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 38.8min finished


GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [2, 4, 7, 9],
                         'n_estimators': [100, 400, 1000, 2000]},
             verbose=2)

In [19]:
search.best_params_

{'max_depth': 9, 'n_estimators': 1000}

In [20]:
best_model = search.best_estimator_
best_model

RandomForestRegressor(max_depth=9, n_estimators=1000)

In [23]:
print('Training score is ',best_model.score(X_train,y_train))
print('Test score is ', best_model.score(X_test,y_test))

Training score is  0.832124078318674
Test score is  0.7701043742494237
