In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

# Load the listings, discard the columns this analysis does not use, and keep
# only rows that have a value in every remaining column.
df = (
    pd.read_csv("bengaluru_house_prices.csv")
    .drop(columns=['area_type', 'availability', 'society'])
    .dropna()
)

# Feature engineering: `bhk` is the integer parsed from the first
# space-separated token of `size`; `size` itself is then removed.
df['bhk'] = df['size'].map(lambda size_text: int(size_text.split(' ')[0]))
df = df.drop(columns=['size'])

def convert_sqft(x):
    """Convert a raw ``total_sqft`` value to a float.

    Accepted forms:
      * anything ``float()`` can parse directly (numbers, numeric strings);
      * a two-sided range written as ``"<low> - <high>"``, which is replaced
        by the midpoint of the two endpoints.

    Parameters
    ----------
    x : object
        Raw cell value.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when ``x`` cannot be interpreted
        (non-string non-numeric input, text with unit suffixes, ranges with
        more or fewer than two endpoints, or non-numeric endpoints).
    """
    # Fast path: plain numbers and numeric strings.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        pass

    # The range fallback is only meaningful for text.
    if not isinstance(x, str):
        return None

    endpoints = x.split('-')
    if len(endpoints) != 2:
        return None

    try:
        low, high = float(endpoints[0]), float(endpoints[1])
    except ValueError:
        # An endpoint carried non-numeric text, e.g. a unit suffix.
        return None
    return (low + high) / 2

# Make total_sqft numeric; values convert_sqft cannot parse come back as
# missing and are removed by the dropna that follows.
df['total_sqft'] = df['total_sqft'].map(convert_sqft)
df = df.dropna()

# Keep only rows whose bath count is below bhk + 2.
bath_within_limit = df['bath'] < df['bhk'] + 2
df = df[bath_within_limit]

# Trim whitespace from location names, then fold every location that occurs
# 10 times or fewer into a single 'other' bucket.
df['location'] = df['location'].map(lambda name: name.strip())
loc_counts = df['location'].value_counts()
df['location'] = df['location'].map(
    lambda name: name if loc_counts[name] > 10 else 'other'
)

# One-hot encoding for location (first category dropped), replacing the
# original text column.
dummies = pd.get_dummies(df['location'], drop_first=True)
df = pd.concat([df.drop(columns='location'), dummies], axis=1)

# Split data: `price` is the target, every other column is a feature.
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [2]:
def evaluate_model(model, name):
    """Fit ``model`` on the training split and print its test-split scores.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The estimator is fitted in place.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Label printed in front of the metrics.

    Returns
    -------
    None
        The R² and RMSE are printed, not returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split.
    test_mse = mean_squared_error(y_test, test_predictions)
    rmse = np.sqrt(test_mse)
    r2 = r2_score(y_test, test_predictions)

    report = f"{name} --> R²: {r2:.3f}, RMSE: {rmse:.3f}"
    print(report)

# Candidate regressors, evaluated one after another in the listed order.
candidate_models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge(alpha=1.0)),
    ("Lasso Regression", Lasso(alpha=0.1)),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
]

for label, estimator in candidate_models:
    evaluate_model(estimator, label)


Linear Regression --> R²: 0.544, RMSE: 71.242
Ridge Regression --> R²: 0.545, RMSE: 71.183
Lasso Regression --> R²: 0.535, RMSE: 71.968
Random Forest --> R²: 0.475, RMSE: 76.469
Gradient Boosting --> R²: 0.449, RMSE: 78.288


In [3]:
# Hyper-parameter candidates for the random forest.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10, 20],
)

# 3-fold cross-validated search on the training split, scored by R².
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=3,
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best Parameters:", grid.best_params_)

# Evaluate best model. The value printed as "Best R²" is computed on the
# held-out test split, not taken from the cross-validation scores.
y_pred = best_model.predict(X_test)
print("Best R²:", r2_score(y_test, y_pred))


Best Parameters: {'max_depth': 10, 'n_estimators': 100}
Best R²: 0.5026422885067361
