In [180]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [181]:
from sklearn.datasets import fetch_california_housing
housing_data = fetch_california_housing()
print(housing_data)

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8)), 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)), 'frame': None, 'target_names': ['MedHouseVal'], 'feature_names': ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude'], 'DESCR': '.. _california_housing_dataset

In [182]:
X.shape

(20640, 8)

### Merging the Independent and Target Columns to check how the data looks like

In [183]:
dependent_col = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)

target_col = pd.DataFrame(housing_data.target, columns=['House Price'])

housing_data_df = pd.concat([dependent_col, target_col], axis=1)
housing_data_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,House Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### Creating Independent and Dependent columns for Train Test Split

In [184]:
X = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [185]:
Y = housing_data['target']
Y[:100]

array([4.526  , 3.585  , 3.521  , 3.413  , 3.422  , 2.697  , 2.992  ,
       2.414  , 2.267  , 2.611  , 2.815  , 2.418  , 2.135  , 1.913  ,
       1.592  , 1.4    , 1.525  , 1.555  , 1.587  , 1.629  , 1.475  ,
       1.598  , 1.139  , 0.997  , 1.326  , 1.075  , 0.938  , 1.055  ,
       1.089  , 1.32   , 1.223  , 1.152  , 1.104  , 1.049  , 1.097  ,
       0.972  , 1.045  , 1.039  , 1.914  , 1.76   , 1.554  , 1.5    ,
       1.188  , 1.888  , 1.844  , 1.823  , 1.425  , 1.375  , 1.875  ,
       1.125  , 1.719  , 0.938  , 0.975  , 1.042  , 0.875  , 0.831  ,
       0.875  , 0.853  , 0.803  , 0.6    , 0.757  , 0.75   , 0.861  ,
       0.761  , 0.735  , 0.784  , 0.844  , 0.813  , 0.85   , 1.292  ,
       0.825  , 0.952  , 0.75   , 0.675  , 1.375  , 1.775  , 1.021  ,
       1.083  , 1.125  , 1.313  , 1.625  , 1.125  , 1.125  , 1.375  ,
       1.188  , 0.982  , 1.188  , 1.625  , 1.375  , 5.00001, 1.625  ,
       1.375  , 1.625  , 1.875  , 1.792  , 1.3    , 1.838  , 1.25   ,
       1.7    , 1.93

In [186]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [187]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, Y_train)

In [188]:
y_pred = regressor.predict(X_test)
print("MAE:",mean_absolute_error(Y_test, y_pred))
print("MSE:",mean_squared_error(Y_test, y_pred))
print("r2:",r2_score(Y_test, y_pred))

MAE: 0.46424249224806197
MSE: 0.521410319105155
r2: 0.605952852995923


### Hyperparamter Tuning with GridSearchCv

In [189]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.32, random_state=40)

In [190]:
from sklearn.model_selection import GridSearchCV
from time import time

In [191]:
model = DecisionTreeRegressor()
params = {
    'criterion': ["squared_error", "friedman_mse", "absolute_error"],
    'splitter': ["best", "random"],
    'max_depth': [15, 20, 25, 30, 35, 40],
    'max_features': ['sqrt', 'log2'],  # Removed 'auto'
    'ccp_alpha': [0.0, 0.01, 0.05]  # Lower ccp_alpha to avoid over-pruning
}

In [192]:
start_time = time()
grid = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, Y_train)
end_time = time()
execution_time = end_time - start_time
print(f"GridSearchCV execution time: {execution_time:.2f} seconds")

GridSearchCv Start Time: 1738245041.318474
GridSearchCV execution time: 45.95 seconds


In [197]:
grid.best_params_

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'max_depth': 20,
 'max_features': 'log2',
 'splitter': 'best'}

In [194]:
grid.best_score_

np.float64(0.623551731398362)

In [195]:
y_pred = regressor.predict(X_test)
print("MAE:",mean_absolute_error(Y_test, y_pred))
print("MSE:",mean_squared_error(Y_test, y_pred))
print("r2:",r2_score(Y_test, y_pred))

MAE: 0.12276650113550344
MSE: 0.13681454317144587
r2: 0.8985103663428695


In [196]:
# New unseen data point
unseen_data = np.array([[8.3252, 41.0, 6.984127, 1.023810, 322.0, 2.555556, 37.88, -122.23]])

# Make the prediction
y_pred_new = regressor.predict(unseen_data)

# Print the predicted house price
print(f"Predicted House Price: {y_pred_new[0]}")

Predicted House Price: 4.043
