In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
from sklearn.ensemble import RandomForestClassifier
# Load the Boston housing data set through scikit-learn's bundled loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases -- confirm the pinned version.
boston = datasets.load_boston()
# Features become a DataFrame with the loader's column names; the target is kept
# exactly as the loader returns it.
X = pd.DataFrame(boston.data,columns=boston.feature_names)
y = boston.target

In [2]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
X.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [4]:
# Frequency of each distinct value in the CHAS column.
X['CHAS'].value_counts()

0.0    471
1.0     35
Name: CHAS, dtype: int64

In [5]:
## Standardise every feature column of X with StandardScaler.
# NOTE(review): the scaler is fit on all rows of X, i.e. before any train/test
# split, so its statistics are computed from every row in the data set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
## Split the data first, then standardise with statistics from the training rows only.
# Fitting the scaler on the full data set lets the held-out rows influence the
# mean/std used to transform the training data (information leakage), so the
# scaler here is fit on the training split and merely applied to the test split.
# The split is taken from the unscaled X; with the same random_state and the
# same number of rows it selects the same rows as splitting a scaled copy would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=191
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

In [7]:
## Baseline model: random forest regressor with default hyper-parameters.
from sklearn.ensemble import RandomForestRegressor

# fit() returns the estimator itself, so construction and training are chained.
rand_reg = RandomForestRegressor(random_state=12).fit(X_train, y_train)

# Last expression of the cell: score of the fitted forest on the held-out split.
rand_reg.score(X_test, y_test)

0.8598058076623292

In [8]:
## Hyper-parameter tuning, stage 1: forest size, split criterion, features per split.
# NOTE(review): 'mse', 'mae' and 'auto' are spellings accepted by older
# scikit-learn releases; newer releases renamed or removed them -- confirm
# against the installed version before re-running.
grid_param = dict(
    n_estimators=[90, 100, 115, 130],
    criterion=["mse", "mae"],
    max_features=["auto", "log2"],
)

In [9]:
# 5-fold cross-validated search over grid_param; n_jobs=-1 asks for all cores.
grid_search = GridSearchCV(
    estimator=rand_reg,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [10]:
# Run the first search on the training split only; the test split is not used here.
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=12), n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],
                         'max_features': ['auto', 'log2'],
                         'n_estimators': [90, 100, 115, 130]},
             verbose=3)

In [11]:
# Parameter combination from grid_param that the first search selected.
grid_search.best_params_

{'criterion': 'mse', 'max_features': 'log2', 'n_estimators': 130}

In [12]:
# Cross-validation score achieved by the selected combination.
grid_search.best_score_

0.824295383386039

In [23]:
## Hyper-parameter tuning, stage 2: tree depth and leaf/split size limits.
# 18 * 9 * 8 = 1296 candidate combinations.
grid_param1 = dict(
    max_depth=range(2, 20),
    min_samples_leaf=range(1, 10),
    min_samples_split=range(2, 10),
)

In [24]:
# Second 5-fold search, this time over grid_param1, again on all cores.
grid_search1 = GridSearchCV(
    estimator=rand_reg,
    param_grid=grid_param1,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [25]:
# Run the second search on the training split; the recorded output reports
# 1296 candidates x 5 folds = 6480 fits, so expect this cell to be slow.
grid_search1.fit(X_train,y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=12), n_jobs=-1,
             param_grid={'max_depth': range(2, 20),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10)},
             verbose=3)

In [26]:
# Parameter combination from grid_param1 that the second search selected.
grid_search1.best_params_

{'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [27]:
## Rebuild the forest with the winners of both searches and refit it.
# NOTE(review): the two parameter sets were tuned in separate searches, and this
# rebinds rand_reg, the name both GridSearchCV objects were constructed with.
rand_reg = RandomForestRegressor(
    random_state=12,
    **grid_search.best_params_,
    **grid_search1.best_params_,
).fit(X_train, y_train)

# Last expression of the cell: score of the refitted forest on the held-out split.
rand_reg.score(X_test, y_test)

0.8517863944215387