# Data Loading

- Read the cleaned/processed dataset (24 features).
- No preprocessing pipeline is needed because the data is already preprocessed; the only remaining step is one-hot encoding `ocean_proximity`.

In [10]:
import pandas as pd
from pathlib import Path
from sklearn.tree import DecisionTreeRegressor

# Location of the preprocessed training split.
# NOTE(review): absolute, machine-specific path — consider a configurable project root.
train_path = Path("/Users/jo/Documents/ca_housing_project/data/train/housing_train_processed.csv")
housing = pd.read_csv(train_path)

# Target vector: the column the models below are trained to predict.
y = housing["median_house_value"]

# Feature matrix: every remaining column, with the categorical
# `ocean_proximity` column expanded into indicator (dummy) columns.
X = pd.get_dummies(
    housing.drop(columns="median_house_value"),
    columns=["ocean_proximity"],
)

# Model Fitting

- Initialize and fit a `RandomForestRegressor` with default hyperparameters.
- Display the R² score on the training data.

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: default hyperparameters, a fixed seed so results
# are repeatable, and n_jobs=-1 to use all available cores.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)
rf.fit(X, y)

# Scored on the same data the model was fitted on, so this is an in-sample
# R² rather than an estimate of performance on unseen data.
train_r2 = rf.score(X, y)
print("Training r squared:", train_r2)

Training r squared: 0.9753395337088929


# Cross-Validation

- Evaluate the model with 10-fold cross-validation.
- Display the per-fold R² scores and their mean.

In [16]:
from sklearn.model_selection import cross_val_score
import numpy as np
# 10-fold cross-validated R² for the random forest, with the folds
# evaluated in parallel on all available cores.
cv_scores = cross_val_score(
    estimator=rf,
    X=X,
    y=y,
    scoring="r2",
    cv=10,
    n_jobs=-1,
)

# Per-fold scores first, then their average as the headline number.
print("Cross Validation Score:", cv_scores)
print("Mean Cross validation:", cv_scores.mean())

Cross Validation Score: [0.82964641 0.82139573 0.82451851 0.81241992 0.82011453 0.8214679
 0.82689758 0.81111375 0.80811891 0.82122236]
Mean Cross validation: 0.8196915586451492


# Hyperparameter Tuning

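- Use `GridSearchCV` (`RandomizedSearchCV` is the alternative) to tune the random forest's hyperparameters.
- Show the best parameters and the best cross-validated R² score, to compare against the untuned model.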

In [23]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; the search tries every combination
# (3 * 3 * 2 * 2 = 36 settings).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid, scored by R² with 5-fold
# cross-validation and run on all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.8176605012150958


# Model Saving

- Save the trained model to the `/models` directory.
- Use an appropriate naming convention (e.g., `linear_regression_model.pkl`).

In [22]:
import joblib
# Destination file for the tuned model.
# NOTE(review): absolute, machine-specific path — consider a configurable project root.
model_path = Path("/Users/jo/Documents/ca_housing_project/models/random_forest_model.pkl")

# Create the target directory first: joblib.dump does not create missing
# parent directories and would otherwise raise FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the best estimator found by the grid search. This reads
# `grid_search`, so the hyperparameter-tuning cell must be run before this
# one; otherwise the name is undefined or refers to an earlier search.
joblib.dump(grid_search.best_estimator_, model_path)

['/Users/jo/Documents/ca_housing_project/models/random_forest_model.pkl']