Cross-validation 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_california_housing

df = fetch_california_housing()

# print(df)

#independent features
x = pd.DataFrame(df.data, columns=df.feature_names)

#dependent features
y = df.target


#train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

#decison tree
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
regressor.fit(x_train, y_train)

#test model
y_pred = regressor.predict(x_test)


# evalution
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
display(score)

#--Hyperparameter Tunning
parameter ={
    'criterion' :['squared_error','friedman_mse'],
    'splitter':['best','random'],
    'max_depth':[3, 5, 7, 10, 12],
    'max_features':['sqrt','log2', None]
}

regressor = DecisionTreeRegressor()

#gridsearchcv for in train data to make validation help of with GridSearchCv
from sklearn.model_selection import GridSearchCV
regressorcv = GridSearchCV(regressor, param_grid=parameter, cv=5 , scoring='neg_mean_squared_error')

#train the model
regressorcv.fit(x_train, y_train)

#select best prams
print(regressorcv.best_params_)



0.6250243002128792

{'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'splitter': 'best'}


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_california_housing

# Load dataset
df = fetch_california_housing()

# Independent features
x = pd.DataFrame(df.data, columns=df.feature_names)

# Dependent feature
y = df.target

# Train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Decision Tree Model
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(x_train, y_train)

# Test model
y_pred = regressor.predict(x_test)

# Evaluation
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
print("R2 Score before tuning:", score)


#   OPTIMIZED HYPERPARAMETER TUNING

from sklearn.model_selection import RandomizedSearchCV

# Keep only fast and meaningful parameters
param_grid = {
    'criterion': ['squared_error', 'friedman_mse'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 7, 9, 12],
    'max_features': ['sqrt', 'log2', None]
}

# Randomized Search (much faster than GridSearch)
random_search = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_distributions=param_grid,
    n_iter=20,            # Try only 20 random combinations
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
    n_jobs=-1             # Use all CPU cores â†’ FAST
)

random_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Best estimator
best_model = random_search.best_estimator_

# Predict again
y_pred_best = best_model.predict(x_test)

# New score
score_best = r2_score(y_test, y_pred_best)
print("R2 Score after tuning:", score_best)


R2 Score before tuning: 0.622075845135081
Best Parameters: {'splitter': 'best', 'max_features': None, 'max_depth': 9, 'criterion': 'friedman_mse'}
R2 Score after tuning: 0.683330702280899
