In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell only runs on older releases — confirm the environment pins a compatible version.
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
# List the fields exposed by the loaded dataset object.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [4]:
# Quick sanity check: feature-matrix shape, target shape, then the feature names,
# each printed on its own line.
for summary_item in (boston.data.shape, boston.target.shape, boston.feature_names):
    print(summary_item)

(506, 13)
(506,)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [5]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names,
# then preview the first five rows.
bos = pd.DataFrame(data=boston.data, columns=boston.feature_names)
bos.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
# Regression target vector that the model will learn to predict (one value per sample).
target = boston.target

In [7]:
# Per-column flag: True if that column contains at least one missing value.
bos.isnull().any()

CRIM       False
ZN         False
INDUS      False
CHAS       False
NOX        False
RM         False
AGE        False
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT      False
dtype: bool

* All columns are free of `NaN` values, so no imputation is needed.

In [8]:
# Pull the features out of the DataFrame as a plain NumPy array for scikit-learn.
data = bos.values

In [27]:
# Hold out 25% of the samples as a final test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.25, random_state=42
)

# Search space for the forest, keyed by bare hyperparameter name. n_estimators and
# random_state are single-valued, so only max_features x max_depth (3 x 3) actually vary.
forest_options = [
    ('n_estimators', [500]),
    ('max_features', ['sqrt', 'log2', 10]),
    ('max_depth', [9, 11, 13]),
    ('random_state', [123]),
]

# make_pipeline names each step after its lowercased class name, and GridSearchCV
# addresses a step's parameters as "<step name>__<parameter>".
param_grid = {'randomforestregressor__' + name: values
              for name, values in forest_options}

# Standardise the features, then fit the random forest.
pipe = make_pipeline(StandardScaler(), RandomForestRegressor())

# Random splits of the training data with 20% held out for validation each time
# (number of splits left at the ShuffleSplit default).
cv = ShuffleSplit(test_size=0.2, random_state=0)

grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)

In [28]:
# Run the grid search on the training split only: every parameter combination is
# cross-validated with the ShuffleSplit above; the test split is never seen here.
grid.fit(x_train, y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.2, train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...imators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__n_estimators': [500], 'randomforestregressor__max_features': ['sqrt', 'log2', 10], 'randomforestregressor__max_depth': [9, 11, 13], 'randomforestregressor__random_state': [123]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
# Evaluate the fitted search on the held-out test set. No `scoring` was passed to
# GridSearchCV, so this is the regressor's default score, R^2 (coefficient of
# determination), not a classification accuracy.
grid.score(x_test, y_test)

0.8616433697755181

In [30]:
# Random-forest hyperparameter combination that achieved the best mean cross-validation
# score during the search.
grid.best_params_

{'randomforestregressor__max_depth': 13,
 'randomforestregressor__max_features': 10,
 'randomforestregressor__n_estimators': 500,
 'randomforestregressor__random_state': 123}

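The model's R² score on the held-out test set is approximately 0.86, i.e. it explains about 86% of the variance in the target. (For a regressor, `score` reports R², not classification accuracy.)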