<a href="https://colab.research.google.com/github/kaivalyagnik/ml-practice-portfolio/blob/main/california_housing_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import numpy as np
import pandas as pd
import kagglehub

# Download latest version
path = kagglehub.dataset_download("camnugent/california-housing-prices")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/california-housing-prices


In [25]:
data = pd.read_csv(path + '/housing.csv')
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [27]:
from sklearn.model_selection import train_test_split

X = data.drop('median_house_value', axis = 1)
y = data['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

num_pipeline = make_pipeline(SimpleImputer(strategy = 'median'), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy = 'most_frequent'), OneHotEncoder(handle_unknown='ignore'))

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

preprocessor = ColumnTransformer([('num', num_pipeline, num_cols), ('cat', cat_pipeline, cat_cols)])

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [29]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train_prepared, y_train)

In [30]:
from sklearn.metrics import root_mean_squared_error, r2_score
lin_reg_pred = lin_reg.predict(X_test_prepared)
f"r2 score of linear regression is: {r2_score(y_test, lin_reg_pred)}."

'r2 score of linear regression is: 0.6254382675296266.'

In [31]:
f"rmse of linear regression is: {root_mean_squared_error(y_test, lin_reg_pred)}."

'rmse of linear regression is: 70059.19333925014.'

In [32]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train_prepared, y_train)

In [33]:
forest_reg_pred = forest_reg.predict(X_test_prepared)
f"rmse of random forest model is: {root_mean_squared_error(y_test, forest_reg_pred)}."

'rmse of random forest model is: 48941.70034309343.'

In [34]:
f"r2 score of random forest model is: {r2_score(y_test, forest_reg_pred)}."

'r2 score of random forest model is: 0.8172104989933294.'

In [39]:
from sklearn.model_selection import RandomizedSearchCV

def train_and_tune_rf(X_train, X_test, y_train, y_test, n_iter=10, cv=3, random_state=42):
    """
    Trains and tunes a Random Forest Regressor using RandomizedSearchCV.

    Parameters:
    - n_iter: number of random combinations to try
    - cv: number of cross-validation folds

    Returns:
    - best_model: the tuned RandomForestRegressor
    - results_df: a summary DataFrame of performance
    """

    # Parameter distribution for sampling
    param_dist = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt']
    }

    rf = RandomForestRegressor(random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_root_mean_squared_error',
        verbose=1,
        n_jobs=-1,
        random_state=random_state
    )

    print("🔄 Starting Randomized Search...")
    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n✅ Best Parameters:", random_search.best_params_)
    print(f"📊 Tuned Random Forest RMSE: {rmse:.2f}")
    print(f"📈 Tuned Random Forest R² Score: {r2:.4f}")

    results_df = pd.DataFrame({
        "Model": ["Tuned Random Forest (Randomized Search)"],
        "RMSE": [rmse],
        "R2 Score": [r2],
        "Best Params": [random_search.best_params_]
    })

    return best_model, results_df



In [40]:
best_rf_model, rf_results = train_and_tune_rf(X_train_prepared, X_test_prepared, y_train, y_test)
rf_results

🔄 Starting Randomized Search...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea


✅ Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}
📊 Tuned Random Forest RMSE: 50330.23
📈 Tuned Random Forest R² Score: 0.8067


Unnamed: 0,Model,RMSE,R2 Score,Best Params
0,Tuned Random Forest (Randomized Search),50330.233631,0.806691,"{'n_estimators': 100, 'min_samples_split': 5, ..."
