In [3]:
import pandas as pd
from pandas_datareader import data
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import GridSearchCV

In [4]:
data = fetch_california_housing()

In [8]:
X, y = data.data, data.target

In [9]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [10]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset
data = fetch_california_housing()
X, y = data.data, data.target

# Create a Pandas DataFrame from the dataset
column_names = data.feature_names + ["MedHouseVal"]
df = pd.DataFrame(data=np.c_[X, y], columns=column_names)

# Display the first few rows of the DataFrame
print(df.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [13]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [16]:
x = df.iloc[:,0:8]
y = df.iloc[:,8]

In [18]:
y

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state=42)

In [24]:
rt = DecisionTreeRegressor(max_depth=5)

In [25]:
rt.fit(x_train,y_train)

In [26]:
y_pred = rt.predict(x_test)

In [27]:
y_pred

array([1.16857267, 1.27158128, 3.19958174, ..., 4.86065073, 1.27158128,
       1.64279735])

In [28]:
r2_score(y_test,y_pred)

0.5997321244428705

In [29]:
#HYPERPARAMETER TUNING

In [40]:
param_grid = {
    'max_depth':[2,4,8,10,None],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[0.25,0.5,1.0]
}

In [41]:
reg = GridSearchCV(DecisionTreeRegressor(),param_grid=param_grid)

In [42]:
reg.fit(x_train,y_train)

In [44]:
reg.best_score_

0.46149607986708385

In [46]:
reg.best_params_

{'max_depth': 10, 'max_features': 1.0, 'min_samples_split': 0.25}

In [49]:
#FEATURES IMPORTANCE
for importance, name in sorted(zip(rt.feature_importances_,x_train.columns),reverse=True):
    print(name,importance)

MedInc 0.7712117162048083
AveOccup 0.12840674614895392
HouseAge 0.04162087993608612
AveRooms 0.031260721268004166
Latitude 0.022049480286785723
Population 0.0024849982871799504
Longitude 0.0020969502013726514
AveBedrms 0.0008685076668091378
