# Data Loading

Read the cleaned/processed dataset (24 features).
No pipelines are needed since the data is already preprocessed; the only remaining step is one-hot encoding the `ocean_proximity` column.

In [21]:
import pandas as pd
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor

# Project root from which the training-data path is derived.
# NOTE(review): this is a machine-specific absolute path; consider making it
# configurable so the notebook runs on other checkouts.
PROJECT_ROOT = Path("/Users/jo/Documents/ca_housing_project")

# Name of the regression target column in the processed training CSV.
TARGET_COLUMN = "median_house_value"

train_path = PROJECT_ROOT / "data" / "train" / "housing_train_processed.csv"
housing = pd.read_csv(train_path)

# Split the frame exactly once into predictors (X) and target (y).
X = housing.drop(columns=TARGET_COLUMN)
y = housing[TARGET_COLUMN]

# Replace `ocean_proximity` with one indicator column per distinct value.
X = pd.get_dummies(X, columns=["ocean_proximity"])

# Model Fitting

Initialize and fit the decision tree regressor.
Display basic training results (the R² score on the training data).

In [23]:
# Baseline tree with default hyperparameters; the fixed seed keeps the fit
# reproducible across runs.
dtree_model = DecisionTreeRegressor(random_state=42).fit(X, y)

# `score` is evaluated on the same rows the tree was fitted on, so this is a
# training-set figure rather than an estimate of generalization.
train_score = dtree_model.score(X, y)
print("Training Results:", train_score)

Training Results: 0.9998263063701696


# Cross-Validation

Implement cross-validation evaluation
Display CV scores and statistics

In [27]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R² for the baseline tree: one score per held-out fold.
cv_scores = cross_val_score(
    estimator=dtree_model,
    X=X,
    y=y,
    scoring="r2",
    cv=5,
)
print("Cross-Validation R squared score:", cv_scores)
print("Mean Cross Validation:", cv_scores.mean())

Cross-Validation R squared score: [0.66884137 0.65288728 0.64395006 0.64043665 0.62178246]
Mean Cross Validation: 0.6455795656155263


# Hyperparameter Tuning

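Use GridSearchCV or RandomizedSearchCV (this notebook uses GridSearchCV).
Show the best parameters and the improved performance. Note that the search below uses 10-fold cross-validation while the baseline above used 5-fold, so the two mean R² scores are not strictly comparable.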

In [32]:
from sklearn.model_selection import GridSearchCV
# Candidate values for the three tree-size controls; the search tries every
# combination (4 x 3 x 3 = 36 settings).
param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by 10-fold cross-validated R².
grid_search = GridSearchCV(
    estimator=dtree_model,
    param_grid=param_grid,
    scoring="r2",
    cv=10,
)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best Cross Validation score:", grid_search.best_score_)

Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Cross Validation score: 0.7360504840931417


# Model Saving

Save the trained model to the /models directory
Use appropriate naming convention (e.g., linear_regression_model.pkl)

In [36]:
import joblib
# Destination for the tuned tree.
# NOTE(review): machine-specific absolute path; consider making it configurable.
model_path = Path("/Users/jo/Documents/ca_housing_project/models/decision_tree_model.pkl")

# Create the models directory if it is missing so the dump below cannot fail
# with FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the best estimator found by the grid search. Leaving the call as the
# last expression keeps the list of written files as the cell output.
joblib.dump(grid_search.best_estimator_, model_path)

['/Users/jo/Documents/ca_housing_project/models/decision_tree_model.pkl']