<a href="https://colab.research.google.com/github/hemhalatha/ML_projects/blob/main/comparision_for_house_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-learn pandas numpy



In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# Load the California housing dataset and wrap it in pandas objects:
# X holds the feature columns, y holds the target.
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="Price")  # target expressed in units of $100,000

# Quick sanity check: overall shape plus the first few feature rows.
print(f"Dataset shape: {X.shape}")
print("Sample rows:", X.head(), sep="\n")

Dataset shape: (20640, 8)
Sample rows:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  


In [4]:
# Peek at the first five target values.
print(y.head(n=5))

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: Price, dtype: float64


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [6]:
# Registry of the regressors to compare, keyed by display name.
# Insertion order is the order in which models are trained and reported.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor(max_depth=10, random_state=42)),
        ("Random Forest", RandomForestRegressor(n_estimators=200, random_state=42)),
        (
            "Gradient Boosting",
            GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.1,
                random_state=42,
            ),
        ),
    ]
)

In [7]:
# Fit every registered model on the training split, score it on the test split,
# and collect the metrics per model name in `results`.
results = {}

for model_name, estimator in models.items():
    # fit() returns the estimator itself, so predict() can be chained onto it.
    predictions = estimator.fit(X_train, y_train).predict(X_test)

    scores = {
        "MSE": mean_squared_error(y_test, predictions),
        "R²": r2_score(y_test, predictions),
    }
    results[model_name] = scores

    print(f"\n{model_name}")
    print("Mean Squared Error:", scores["MSE"])
    print("R² Score:", scores["R²"])



Linear Regression
Mean Squared Error: 0.5558915986952444
R² Score: 0.5757877060324508

Decision Tree
Mean Squared Error: 0.4154681981618525
R² Score: 0.6829476865157171

Random Forest
Mean Squared Error: 0.2539759249192041
R² Score: 0.8061857564039718

Gradient Boosting
Mean Squared Error: 0.26149849837343114
R² Score: 0.8004451261281281


In [8]:
# Show each fitted model's prediction for the first test row.
# The target is expressed in units of $100,000, so scale it to display dollars.
dollars_per_unit = 100000

sample = X_test.iloc[[0]]   # list indexer keeps a one-row DataFrame instead of a Series
print("\nSample Input Features:", sample.to_dict())
for name, model in models.items():
    # Predict once per model and reuse the value for both output formats.
    predicted = model.predict(sample)[0]
    print(f"{name} Prediction: {predicted:.2f} (≈ ${predicted * dollars_per_unit:,.0f})")


Sample Input Features: {'MedInc': {20046: 1.6812}, 'HouseAge': {20046: 25.0}, 'AveRooms': {20046: 4.192200557103064}, 'AveBedrms': {20046: 1.0222841225626742}, 'Population': {20046: 1392.0}, 'AveOccup': {20046: 3.8774373259052926}, 'Latitude': {20046: 36.06}, 'Longitude': {20046: -119.01}}
Linear Regression Prediction: 0.72 (≈ $71,912)
Decision Tree Prediction: 0.55 (≈ $55,230)
Random Forest Prediction: 0.49 (≈ $49,443)
Gradient Boosting Prediction: 0.43 (≈ $43,168)


In [9]:
# Tabulate the collected metrics: one row per model, one column per metric.
summary = pd.DataFrame.from_dict(results, orient="index")

print("\nModel Comparison Results:")
print(summary)


Model Comparison Results:
                        MSE        R²
Linear Regression  0.555892  0.575788
Decision Tree      0.415468  0.682948
Random Forest      0.253976  0.806186
Gradient Boosting  0.261498  0.800445
