<a href="https://colab.research.google.com/github/hgabrali/Machine-Learning/blob/main/house_prices_regression_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# 2) Load data
# as_frame=True makes the returned Bunch carry a single DataFrame ("frame")
# holding the feature columns together with the target column.
cal = fetch_california_housing(as_frame=True)
df = cal["frame"]  # all numeric; target is 'MedHouseVal' (in 100k USD)
df.head(5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Linear Regression

In [None]:
# 3) Split features/target
y = df["MedHouseVal"]
X = df.drop("MedHouseVal", axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) (Optional) Scale features — LR doesn't require it, but helps with stability.
#    The scaler learns its statistics from the training rows only and is then
#    applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# 5) Fit Linear Regression
lin = LinearRegression().fit(X_train_s, y_train)

# 6) Predict
y_pred = lin.predict(X_test_s)

print(y_pred)

[0.71912284 1.76401657 2.70965883 ... 4.46877017 1.18751119 2.00940251]


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Metrics: MAE, MSE and R² come straight from scikit-learn; RMSE is derived from MSE
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

# Display nicely: a single-row table with one column per metric
metrics_df = pd.DataFrame([{"MAE": mae, "MSE": mse, "RMSE": rmse, "R²": r2}])
metrics_df


Unnamed: 0,MAE,MSE,RMSE,R²
0,0.5332,0.555892,0.745581,0.575788


Let's put the metrics calculation and display in a function so we can reuse it later with other algorithms.

In [None]:
# --------------------------------------------------
# Helper function: Evaluate regression model results
# --------------------------------------------------
def evaluate(y_true, y_pred):
    """
    Summarise regression quality as a one-row DataFrame.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Model predictions, aligned element-by-element with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns, in this order:
        ``MAE`` (mean absolute error), ``MSE`` (mean squared error),
        ``RMSE`` (square root of MSE) and ``R²`` (coefficient of determination).
    """
    # MSE is computed once and reused for RMSE
    squared_error = mean_squared_error(y_true, y_pred)

    scores = {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "R²": r2_score(y_true, y_pred),
    }

    # A list holding one dict becomes a one-row frame; key order gives column order
    return pd.DataFrame([scores])

In [None]:
# Same numbers as the hand-built table above, now produced by the helper
evaluate(y_true=y_test, y_pred=y_pred)

Unnamed: 0,MAE,MSE,RMSE,R²
0,0.5332,0.555892,0.745581,0.575788


# Polynomial Regression (needs scaling; we start again from the unscaled X_train and X_test to show once more how the features are scaled)

In [None]:
# Models & transforms
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Standardise first: learn the scaling on the training rows, apply it to both splits
scaler_poly = StandardScaler().fit(X_train)
X_train_scaled = scaler_poly.transform(X_train)
X_test_scaled = scaler_poly.transform(X_test)

# Expand the scaled features into polynomial terms up to degree 3
poly = PolynomialFeatures(degree=3).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Ordinary linear regression on the expanded feature matrix
poly_reg = LinearRegression().fit(X_train_poly, y_train)
y_pred_poly = poly_reg.predict(X_test_poly)


# Decision Tree (no scaling)

In [None]:
# Trees split on thresholds, so the raw (unscaled) features are used directly
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
res_dt = evaluate(y_true=y_test, y_pred=y_pred_dt)

# Random Forest (no scaling)

In [None]:
# 300 trees trained on the unscaled features, using every available core
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
res_rf = evaluate(y_true=y_test, y_pred=y_pred_rf)


# KNN (needs scaling - use the same scaled data as before)

In [None]:
# Fit and predict on the standardised arrays produced for the linear model
knn = KNeighborsRegressor(n_neighbors=7).fit(X_train_s, y_train)
y_pred_knn = knn.predict(X_test_s)
res_knn = evaluate(y_true=y_test, y_pred=y_pred_knn)

# Compare results

In [None]:
# Score every model's test-set predictions, tag each one-row result with the
# model name, stack the rows, and index the table by model for side-by-side reading
results = pd.concat(
    [
        evaluate(y_test, preds).assign(Model=label)
        for label, preds in [
            ("Linear Regression", y_pred),
            ("Polynomial Regression", y_pred_poly),
            ("Decision Tree", y_pred_dt),
            ("Random Forest", y_pred_rf),
            ("KNN Regression", y_pred_knn),
        ]
    ],
    ignore_index=True,
).set_index("Model")

# Display the comparison table
results

Unnamed: 0_level_0,MAE,MSE,RMSE,R²
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear Regression,0.5332,0.555892,0.745581,0.575788
Polynomial Regression,0.538679,25.407137,5.040549,-18.388708
Decision Tree,0.454679,0.495235,0.703729,0.622076
Random Forest,0.326607,0.253434,0.503422,0.8066
KNN Regression,0.444039,0.428334,0.654472,0.67313


# Hyperparameter tuning

## GridSearch

In [None]:
# Importing necessary libraries
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. It gets its own name so the fitted baseline
# tree stored in `dt` is not replaced by an unfitted one.
dt_base = DecisionTreeRegressor(random_state=42)

# Define the grid of hyperparameters to search
param_grid = {
    'max_depth': [3, 5, 10, 20, None],       # Control the depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 5, 10],          # Minimum number of samples required to be at a leaf node
    'max_features': [None, 3, 5, 0.5, 0.8]   # Number of features to consider when looking for the best split
}

# Set up the GridSearchCV: 5-fold CV over every combination in the grid.
# Scoring is *negative* MSE, so a higher (closer to zero) score is better.
grid_search = GridSearchCV(
    estimator=dt_base,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Best parameters found by Grid Search
print("Best Hyperparameters:", grid_search.best_params_)
# best_score_ is reported in the scoring metric (negative MSE), not R²:
# flip the sign and label it as what it actually is.
print("Best CV MSE:", -grid_search.best_score_)

# Evaluating the model with the best hyperparameters.
# The predictions get a dedicated name so `y_pred` (the linear-regression
# predictions used by the comparison table) is not overwritten.
best_tree_model = grid_search.best_estimator_
y_pred_dt_tuned = best_tree_model.predict(X_test)

# Evaluating performance
evaluate(y_test, y_pred_dt_tuned)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
Best Hyperparameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best CV R2: -0.37498931717177364


Unnamed: 0,MAE,MSE,RMSE,R²
0,0.404814,0.363823,0.603177,0.722359


## Random Forest tuning (fast grid search on a subset)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd

# ---------------------------------------
# 0) OPTIONAL: search on a smaller subset, so it doesn't take too long
# ---------------------------------------
# Keep half of the training rows for the search; the other half is discarded here.
X_sub, _, y_sub, _ = train_test_split(
    X_train, y_train, train_size=0.5, random_state=42, stratify=None  # for regression no stratify
)

# ---------------------------------------
# 1) Small/fast RF during search
# ---------------------------------------
rf_fast = RandomForestRegressor(
    n_estimators=10,      # few trees for speed during search
    max_samples=0.7,      # subsample rows per tree
    n_jobs=-1,
    random_state=42
)

# Minimal grid (keep it tiny): 2 x 2 x 2 x 2 = 16 combinations
param_grid = {
    "max_depth": [3, None],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 5],
    "max_features": [0.3, 0.6]  # fractions are valid for RF
}

# Exhaustive search over the grid; 3-fold CV is much faster than 5
grid = GridSearchCV(
    estimator=rf_fast,
    param_grid=param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=0
)

# Run the *fast* search on the subset
grid.fit(X_sub, y_sub)

print("Best params (fast search):", grid.best_params_)
print("Best CV R² (subset, fast RF):", round(grid.best_score_, 4))

# ---------------------------------------
# 2) Refit a final RF with more trees and full data
# ---------------------------------------
best_params = grid.best_params_
rf_final = RandomForestRegressor(
    n_estimators=300,     # more trees now that we’re done searching
    max_samples=None,     # use all rows per tree
    n_jobs=-1,
    random_state=42,
    **best_params
)

rf_final.fit(X_train, y_train)

# Dedicated name: `y_pred` holds the linear-regression predictions that the
# comparison table reads, so it must not be overwritten here.
y_pred_rf_tuned = rf_final.predict(X_test)

evaluate(y_test, y_pred_rf_tuned)


Best params (fast search): {'max_depth': None, 'max_features': 0.6, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best CV R² (subset, fast RF): 0.7599


Unnamed: 0,MAE,MSE,RMSE,R²
0,0.320645,0.243816,0.493777,0.813939
