# Decision Tree Regressor Implementation

In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Importing House pricing Dataset

from sklearn.datasets import fetch_california_housing
california_df = fetch_california_housing()

In [3]:
#checking the dataset
california_df

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
# taking dataset into the Pandas DataFrame

df = pd.DataFrame(california_df.data, columns = california_df.feature_names)
df['Target'] = california_df.target

In [5]:
# Taking sapmle data

df = df.sample(frac=0.25)

In [6]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
15013,3.6625,37.0,5.601607,1.040184,2076.0,2.383467,32.76,-117.05,1.548
10188,3.1410,27.0,4.450000,1.445000,416.0,2.080000,33.87,-117.96,1.675
2377,2.2448,20.0,5.084034,1.050420,1665.0,4.663866,36.70,-119.54,0.589
4312,3.6953,36.0,4.739130,1.347826,416.0,3.014493,34.09,-118.33,2.000
9479,1.8500,27.0,4.039755,1.033639,779.0,2.382263,39.40,-123.35,0.718
...,...,...,...,...,...,...,...,...,...
19042,3.9309,18.0,6.318881,1.156643,2230.0,3.118881,38.49,-121.81,1.785
11579,5.2077,7.0,5.476636,1.102804,232.0,2.168224,33.77,-118.02,1.813
699,2.3973,10.0,4.718220,1.186441,1140.0,2.415254,37.69,-122.12,1.673
8171,2.7092,35.0,4.389952,1.040670,774.0,1.851675,33.80,-118.12,2.563


In [7]:
# independent feature
X = df.iloc[:, :-1]

In [8]:
# Dependet feature
y = df.iloc[:,-1]

In [9]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
15013,3.6625,37.0,5.601607,1.040184,2076.0,2.383467,32.76,-117.05
10188,3.141,27.0,4.45,1.445,416.0,2.08,33.87,-117.96
2377,2.2448,20.0,5.084034,1.05042,1665.0,4.663866,36.7,-119.54
4312,3.6953,36.0,4.73913,1.347826,416.0,3.014493,34.09,-118.33
9479,1.85,27.0,4.039755,1.033639,779.0,2.382263,39.4,-123.35


In [10]:
y.head()

15013    1.548
10188    1.675
2377     0.589
4312     2.000
9479     0.718
Name: Target, dtype: float64

In [11]:
# import train test split function

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [12]:
# Importing Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()

In [13]:
#fitting the model

regressor.fit(X_train, y_train)

In [14]:
#predicting the y values

y_pred = regressor.predict(X_test)
y_pred

array([2.402, 0.959, 1.292, ..., 1.362, 1.682, 0.503])

In [15]:
# checking score of the model through r2_score

from sklearn.metrics import r2_score
score = r2_score(y_pred, y_test)
score

0.5142223815172828

In [16]:
# Hyperparameter tuning 

parameter = {
    'criterion': ['mse', 'friedmn_mse', 'mae', 'poisson'], 
    'splitter' : ['best', 'random'],
    'max_depth' : [1,2,3,4,5,6,7,8,9,10,11,12],
    'max_features' : ['auto', 'sqrt', 'log2']
}

In [17]:
# importing warnings 

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
regressorcv = GridSearchCV(regressor, param_grid = parameter, cv = 2, scoring = 'neg_mean_squared_log_error')

In [18]:
# traing model 

regressorcv.fit(X_train, y_train)

In [19]:
regressorcv.best_params_

{'criterion': 'poisson',
 'max_depth': 8,
 'max_features': 'log2',
 'splitter': 'best'}

In [20]:
#predicting the result

y_pred = regressorcv.predict(X_test)

In [21]:
# checking the r2 score on test set

r2_score(y_test, y_pred)

0.4962417013442729