# Bangalore_house_price_prediction

## Importing the necessary libraries

In [216]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import pickle

## Reading the dataset

In [217]:
# Load the raw Bengaluru housing data into `df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run on another machine as-is; consider pointing it at a
# location relative to the project directory.
df = pd.read_csv(r"D:\study material\PWskills DATA SCIENCE\Projects\Machine_Learning_Projects\house_price_prediction\dataset\Bengaluru_House_Data.csv")

In [218]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [219]:
df.shape

(13320, 9)

In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [221]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


## Checking for null values

In [222]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

## Dropping the columns that are not useful for the prediction

In [223]:
df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [224]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

## Checking for duplicate values


In [225]:
df.duplicated().sum()

np.int64(882)

## Dropping the duplicate rows

In [226]:
df.drop_duplicates(inplace=True)

In [227]:
df.duplicated().sum()

np.int64(0)

In [228]:
df['location'].value_counts()

location
Whitefield          507
Sarjapur  Road      364
Electronic City     273
Thanisandra         224
Kanakpura Road      223
                   ... 
Udayagiri             1
pavitra paradise      1
kadubisnahalli        1
Rahmath Nagar         1
Chikkanahalli         1
Name: count, Length: 1305, dtype: int64

In [229]:
df.location.isnull().sum()

np.int64(1)

## Filling the missing value in the location column with "Whitefield" (the most common location)

In [230]:
df['location'] = df['location'].fillna("Whitefield")

In [231]:
df['size'].value_counts()

size
2 BHK         4768
3 BHK         3988
4 Bedroom      819
4 BHK          562
3 Bedroom      527
1 BHK          507
2 Bedroom      303
5 Bedroom      288
6 Bedroom      191
1 Bedroom      100
8 Bedroom       84
7 Bedroom       82
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            12
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
10 BHK           2
11 Bedroom       2
27 BHK           1
19 BHK           1
43 Bedroom       1
16 BHK           1
14 BHK           1
12 Bedroom       1
13 BHK           1
18 Bedroom       1
Name: count, dtype: int64

## Filling the missing values in the size column with "2 BHK" (the most common size)

In [232]:
df['size'] = df['size'].fillna("2 BHK")

In [233]:
df['size'].isnull().sum()

np.int64(0)

## Filling the missing values in the bath column with its median

In [234]:
df['bath'] = df['bath'].fillna(df['bath'].median())

In [235]:
df['bath'].isnull().sum()

np.int64(0)

In [236]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

### No more missing values present


## The size column mixes labels ("BHK", "Bedroom", "RK"), so we extract the leading number into a new BHK column

In [237]:
df['BHK'] = df['size'].str.split().str.get(0).astype('int')

In [238]:
df

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,51.00,2
...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,231.00,5
13316,Richards Town,4 BHK,3600,5.0,400.00,4
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00,2


## Dropping the size column as it is no longer needed

In [239]:
df.drop(columns=['size'],inplace=True)

In [240]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      shape=(2117,), dtype=object)

### Some entries in this column are ranges (e.g. "1133 - 1384"), so we replace each range with the mean of its two bounds

In [241]:
def convert_sqft(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``'1056'``) or a range written as
        ``'low - high'`` (``'1133 - 1384'``).

    Returns
    -------
    float
        The numeric value, the midpoint of the two bounds for a range, or
        ``np.nan`` when the entry cannot be parsed (for example a value that
        carries a unit suffix).
    """
    try:
        if '-' in str(x):
            # Range entry: average the first two '-'-separated parts.
            bounds = x.split('-')
            return (float(bounds[0]) + float(bounds[1])) / 2
        return float(x)
    except (ValueError, TypeError, AttributeError, OverflowError):
        # Only parsing failures are mapped to NaN; interrupts and other
        # unrelated errors are allowed to propagate.
        return np.nan

In [242]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft)

In [243]:
df = df.dropna(subset=['total_sqft']) 

In [244]:
df

Unnamed: 0,location,total_sqft,bath,price,BHK
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.00,4
2,Uttarahalli,1440.0,2.0,62.00,3
3,Lingadheeranahalli,1521.0,3.0,95.00,3
4,Kothanur,1200.0,2.0,51.00,2
...,...,...,...,...,...
13314,Green Glen Layout,1715.0,3.0,112.00,3
13315,Whitefield,3453.0,4.0,231.00,5
13316,Richards Town,3600.0,5.0,400.00,4
13317,Raja Rajeshwari Nagar,1141.0,2.0,60.00,2


In [245]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12392 entries, 0 to 13318
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    12392 non-null  object 
 1   total_sqft  12392 non-null  float64
 2   bath        12392 non-null  float64
 3   price       12392 non-null  float64
 4   BHK         12392 non-null  int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 580.9+ KB


## Creating a new column for price per square feet

In [246]:
df['price_per_sqft'] = (df['price']*100000)/ df ['total_sqft']

In [247]:
df['price_per_sqft']

0         3699.810606
1         4615.384615
2         4305.555556
3         6245.890861
4         4250.000000
             ...     
13314     6530.612245
13315     6689.834926
13316    11111.111111
13317     5258.545136
13318    10407.336319
Name: price_per_sqft, Length: 12392, dtype: float64

In [248]:
df.describe()

Unnamed: 0,total_sqft,bath,price,BHK,price_per_sqft
count,12392.0,12392.0,12392.0,12392.0,12392.0
mean,1576.376848,2.71409,115.298769,2.825775,8099.928
std,1273.831772,1.364432,153.189143,1.320146,110147.9
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4306.941
50%,1288.5,2.0,73.835,3.0,5500.0
75%,1700.0,3.0,124.0,3.0,7446.809
max,52272.0,40.0,3600.0,43.0,12000000.0


In [249]:
df['location'].value_counts()

location
Whitefield              506
Sarjapur  Road          364
Electronic City         273
Thanisandra             223
Kanakpura Road          221
                       ... 
Kanakapura Road,          1
Hal old airport road      1
Anantapuram               1
Javarandoddi              1
Ashraya Layout            1
Name: count, Length: 1299, dtype: int64

In [250]:
# Trim leading/trailing whitespace so that differently padded spellings of the
# same location are counted together. The vectorised .str accessor replaces
# the per-row lambda and leaves missing values untouched instead of raising.
df['location'] = df['location'].str.strip()


In [251]:
location_count = df['location'].value_counts()

In [252]:
location_count_less10 = location_count[location_count<=10]
location_count_less10

location
Naganathapura                    10
Dairy Circle                     10
BEML Layout                      10
Thyagaraja Nagar                 10
Gunjur Palya                     10
                                 ..
Gubbi Cross, Hennur Main Road     1
Akash Nagar                       1
Shauhardha Layout                 1
Harohalli                         1
Electronic city phase 1,          1
Name: count, Length: 1058, dtype: int64

In [253]:
# Collapse every rare location into a single 'other' bucket. The rare names
# are the index labels of `location_count_less10` (locations with at most 10
# listings); all other locations keep their own name.
df['location'] = df['location'].mask(
    df['location'].isin(location_count_less10.index),
    'other',
)

In [254]:
df['location'].value_counts()

location
other                   2918
Whitefield               507
Sarjapur  Road           364
Electronic City          275
Thanisandra              225
                        ... 
Doddakallasandra          11
Banashankari Stage V      11
Bommenahalli              11
Doddaballapur             11
HAL 2nd Stage             11
Name: count, Length: 231, dtype: int64

In [255]:
df = df[((df['total_sqft']/df['BHK'])>=300)]

## Removing outliers

In [256]:
def remove_outlier_sqft(df):
    """Drop listings whose price per square foot is atypical for their location.

    For each ``location`` group, only rows whose ``price_per_sqft`` lies in the
    half-open interval ``(mean - std, mean + std]`` of that group are kept.
    A group whose standard deviation is NaN (e.g. a single row) contributes no
    rows, because comparisons against NaN are False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, group after group, with a fresh 0..n-1 index.
        ``df`` itself is not modified.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = subdf['price_per_sqft'].mean()
        st = subdf['price_per_sqft'].std()
        within = (subdf['price_per_sqft'] > (m - st)) & (subdf['price_per_sqft'] <= (m + st))
        kept_groups.append(subdf[within])
    if not kept_groups:
        # No groups at all (empty input): pd.concat raises on an empty list,
        # so return an empty frame that still carries the input's columns.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

df = remove_outlier_sqft(df)

In [257]:
def remove_bhk_outliers(df):
    """Drop listings priced below the typical rate of the next-smaller BHK.

    Within each location, a row with N bedrooms is removed when its
    ``price_per_sqft`` is not at least the mean ``price_per_sqft`` of the
    (N - 1)-bedroom rows of the same location, provided that smaller group has
    more than 5 rows with a non-null price. A new DataFrame is returned; the
    input is left untouched.
    """
    frame = df.copy()
    doomed = []
    for _, area_rows in frame.groupby('location'):
        per_bhk = {bhk: rows for bhk, rows in area_rows.groupby('BHK')}

        # (mean, count) of the non-null prices for every BHK value here.
        summary = {}
        for bhk, rows in per_bhk.items():
            prices = rows['price_per_sqft'].dropna()
            if len(prices) > 0:
                summary[bhk] = (prices.mean(), len(prices))

        for bhk, rows in per_bhk.items():
            smaller = summary.get(bhk - 1)
            if smaller is None:
                continue
            smaller_mean, smaller_count = smaller
            if smaller_count <= 5:
                # Too few comparable listings to trust the reference mean.
                continue
            too_cheap = ~(rows['price_per_sqft'] >= smaller_mean)
            doomed.extend(rows.index[too_cheap.to_numpy()].tolist())
    return frame.drop(index=doomed)

df = remove_bhk_outliers(df)


In [258]:
df.shape

(6877, 6)

In [259]:
df

Unnamed: 0,location,total_sqft,bath,price,BHK,price_per_sqft
0,1st Block Jayanagar,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...
9635,other,1650.0,1.0,130.0,2,7878.787879
9639,other,1155.0,2.0,64.0,2,5541.125541
9641,other,1200.0,2.0,70.0,2,5833.333333
9642,other,1800.0,1.0,200.0,1,11111.111111


## Removing the helper column that is no longer needed

In [260]:
df.drop(columns=['price_per_sqft'],inplace=True)

In [261]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


## Saving the cleaned data in .csv format

In [262]:
# Persist the cleaned table. index=False keeps the leftover row labels (which
# have gaps after the filtering steps) out of the file, so reloading it does
# not produce a spurious "Unnamed: 0" column.
df.to_csv("cleaned_house_data.csv", index=False)

## X, Y split

In [263]:
x = df.drop(columns=['price'])
y = df['price']

## Train test split

In [264]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [265]:
x_train.shape,x_test.shape

((5501, 4), (1376, 4))

## Encoding the location column

In [266]:
# Preprocessing shared by every candidate model: one-hot encode `location`
# (dense output, first category dropped) and pass the remaining columns
# through unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False, drop='first'),
        ['location'],
    ),
    remainder='passthrough',
)

## Defining different models for comparison 

In [267]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

In [268]:
# Model selection: track the best pipeline seen so far. Starting from -inf
# means the first finite CV score always replaces the initial value.
best_model = None
best_score = -np.inf
best_model_name = None

for name, model in models.items():
    # Every candidate gets the same preprocessing: the column transformer,
    # then scaling without centering (with_mean=False), then the estimator.
    pipe = make_pipeline(column_trans, StandardScaler(with_mean=False), model)
    # 5-fold cross-validated R² computed on the training split only.
    scores = cross_val_score(pipe, x_train, y_train, cv=5, scoring='r2')
    mean_score = np.mean(scores)
    print(f"{name}: Mean CV R² Score = {mean_score:.4f}")
    # Strict '>' means that on a tie the earlier entry of `models` is kept.
    if mean_score > best_score:
        best_score = mean_score
        best_model = pipe
        best_model_name = name

print(f"\nBest Model Before Tuning: {best_model_name}")

# Hyperparameter search spaces, keyed by the same names used in `models`.
# Parameter keys use the "<step>__<param>" form; the step prefixes are
# presumably the names make_pipeline assigns automatically (lower-cased class
# names). Only XGBoost and Random Forest have a grid.
param_grids = {
    "XGBoost": {
        "xgbregressor__n_estimators": [50, 100, 200],
        "xgbregressor__learning_rate": [0.01, 0.1, 0.2],
        "xgbregressor__max_depth": [3, 5, 7],
        "xgbregressor__subsample": [0.8, 1.0],
        "xgbregressor__colsample_bytree": [0.8, 1.0]
    },
    "Random Forest": {
        "randomforestregressor__n_estimators": [50, 100, 200],
        "randomforestregressor__max_depth": [5, 10, None],
        "randomforestregressor__min_samples_split": [2, 5, 10],
        "randomforestregressor__min_samples_leaf": [1, 2, 4]
    }
}

# Tune only when the winning model has a grid defined above; otherwise the
# untuned pipeline is used as-is (no "Best Hyperparameters" line is printed
# in that case).
if best_model_name in param_grids:
    grid_search = GridSearchCV(best_model, param_grids[best_model_name], cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"\nBest Hyperparameters: {grid_search.best_params_}")

# Fit the selected pipeline on the whole training split and predict the
# held-out test split.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# fit is presumably redundant when the tuning branch ran — confirm.
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)

# Report test-set metrics; RMSE is the square root of the mean squared error.
print(f"\nFinal Selected Model: {best_model_name}")
print(f"Test R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

Linear Regression: Mean CV R² Score = 0.8413
Lasso Regression: Mean CV R² Score = 0.8333
Ridge Regression: Mean CV R² Score = 0.8413
Decision Tree: Mean CV R² Score = 0.7397
Random Forest: Mean CV R² Score = 0.7869
XGBoost: Mean CV R² Score = 0.8328

Best Model Before Tuning: Ridge Regression

Final Selected Model: Ridge Regression
Test R² Score: 0.8016
MAE: 19.6350
RMSE: 39.9124


In [269]:
# Serialise the fitted pipeline to disk. The context manager guarantees the
# file is flushed and closed once the dump finishes (pickle is already
# imported in the notebook's first cell).
with open('BestModel.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)