In [1]:
# In this notebook, all previously identified issues are expected to be resolved.

In [19]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split 
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [4]:
# Read the raw listings from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='Bengaluru_House_Data.csv')
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
# Discard columns that are mostly missing or not used by the model
df = df.drop(columns=['area_type', 'society', 'balcony', 'availability'])

In [6]:
# Keep only the rows in which every remaining column has a value
df = df.dropna(axis='index', how='any')

In [7]:
# Extract the leading room count from 'size' (e.g., '2 BHK' → 2)
df['bhk'] = df['size'].map(lambda size_text: int(size_text.partition(' ')[0]))

In [8]:
# Convert 'total_sqft' to a single numeric value
def convert_sqft_to_num(x):
    """Convert a raw 'total_sqft' entry to a single float.

    A plain number is returned as a float. A value made of exactly two
    parts separated by '-' (a range such as '1000 - 2000') is replaced by
    the midpoint of the two bounds.

    Parameters
    ----------
    x : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float or None
        The parsed number, or None when the text cannot be parsed as a
        number or as a two-part range.
    """
    try:
        text = str(x)
        bounds = text.split('-')
        if len(bounds) == 2:
            return (float(bounds[0]) + float(bounds[1])) / 2
        return float(text)
    except (TypeError, ValueError):
        # Only conversion failures mean "not a usable number". Anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate instead of being
        # silently turned into a missing value.
        return None

# Parse every 'total_sqft' entry into a float (unparseable entries become missing)
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# Keep only the rows whose 'total_sqft' was parsed successfully
df = df[df['total_sqft'].notna()]

print("After sqft conversion:", df.shape)

After sqft conversion: (13200, 6)


In [9]:
# Derived feature: price scaled by 100000, divided by the area
# (presumably 'price' is quoted in lakhs — confirm against the data source)
df['price_per_sqft'] = (df['price'] * 100000).div(df['total_sqft'])


In [10]:
# Trim surrounding whitespace from every location name
df['location'] = df['location'].map(str.strip)

# Locations that appear in at most 10 rows are merged into a single 'other' bucket
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts <= 10].index
df['location'] = df['location'].mask(df['location'].isin(rare_locations), 'other')

print("Unique locations:", df['location'].nunique(dropna=False))

Unique locations: 241


In [11]:
# Keep only the listings that offer at least 300 sqft per BHK
df = df.loc[(df['total_sqft'] / df['bhk']).ge(300)]

print("After removing low sqft per BHK:", df.shape)

After removing low sqft per BHK: (12456, 7)


In [12]:
# This function removes price_per_sqft outliers location-wise
def remove_pps_outliers(df):
    """Drop rows whose 'price_per_sqft' is an outlier within their location.

    For every 'location' group the mean and standard deviation of
    'price_per_sqft' are computed, and only rows with
    ``mean - std < price_per_sqft <= mean + std`` are kept. A group with a
    single row has an undefined (NaN) standard deviation, so that row is
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'location' and 'price_per_sqft'.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered group by group, with a fresh 0..n-1
        index. For an input with no groups, an empty frame that keeps the
        input columns is returned.
    """
    kept_parts = []

    for _, group in df.groupby('location'):
        pps = group['price_per_sqft']
        mean = pps.mean()
        std = pps.std()

        # Strict lower bound, inclusive upper bound
        kept_parts.append(group[(pps > (mean - std)) & (pps <= (mean + std))])

    if not kept_parts:
        # pd.concat refuses an empty list; keep the schema for downstream cells
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of re-copying the accumulated frame per group
    return pd.concat(kept_parts, ignore_index=True)

# Run the per-location price_per_sqft filter over the whole dataset
df = df.pipe(remove_pps_outliers)

print("✅ After removing price_per_sqft outliers:", df.shape)

✅ After removing price_per_sqft outliers: (10293, 7)


In [13]:
# This function removes BHK-based pricing outliers for each location
def remove_bhk_outliers(df, min_lower_count=5):
    """Drop listings priced below the average of the next-smaller BHK size.

    Within each 'location', a row with ``bhk`` rooms is treated as an outlier
    when its 'price_per_sqft' is lower than the mean 'price_per_sqft' of the
    ``bhk - 1`` listings in the same location. The comparison is made only
    when that smaller-BHK group has more than ``min_lower_count`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'location', 'bhk' and 'price_per_sqft'.
        Rows are removed by index label, so the index should be unique.
    min_lower_count : int, default 5
        The ``bhk - 1`` group must contain strictly more rows than this for
        its mean to be used as a reference.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the detected outlier rows; the index is not reset.
    """
    indices_to_remove = []  # Index labels of the rows flagged as outliers

    for _, location_df in df.groupby('location'):
        # Group once per location and reuse the groups for both passes
        bhk_groups = list(location_df.groupby('bhk'))

        # bhk -> (mean price_per_sqft, number of rows) for this location
        bhk_price_stats = {
            bhk: (bhk_df['price_per_sqft'].mean(), bhk_df.shape[0])
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            lower_bhk_stats = bhk_price_stats.get(bhk - 1)
            if lower_bhk_stats is None:
                continue

            lower_mean, lower_count = lower_bhk_stats
            if lower_count > min_lower_count:
                # Priced below the smaller size's average → outlier
                below_lower_mean = bhk_df['price_per_sqft'] < lower_mean
                indices_to_remove.extend(bhk_df.index[below_lower_mean])

    return df.drop(indices_to_remove, axis='index')

# Run the BHK-based pricing filter over the whole dataset
df = df.pipe(remove_bhk_outliers)

print("✅ After removing BHK outliers:", df.shape)

✅ After removing BHK outliers: (7365, 7)


In [14]:
# Keep listings with fewer than (bhk + 2) bathrooms
df = df.loc[df['bath'].lt(df['bhk'] + 2)]

print("After removing bathroom outliers:", df.shape)

After removing bathroom outliers: (7286, 7)


In [15]:
# Modelling frame: 'size' and the helper 'price_per_sqft' column are left out
df_model = df.drop(columns=['size', 'price_per_sqft'])
df_model.head(5)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [17]:
# Preprocessing: median-impute 'bath', one-hot encode 'location' (categories
# unseen during fit are ignored), and pass every other column through unchanged
ct = ColumnTransformer(
    transformers=[
        ("tran1", SimpleImputer(missing_values=np.nan, strategy='median'), ['bath']),
        ("tran2", OneHotEncoder(handle_unknown='ignore'), ['location']),
    ],
    remainder='passthrough',
)

In [18]:
from sklearn.model_selection import train_test_split
# Separate the predictors from the target column
X = df_model.drop(columns='price')
y = df_model['price']

# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Chain the column preprocessing with the gradient-boosted regressor
pipeline_ct = Pipeline([
    ('preprocessing', ct),
    ('model', XGBRegressor(learning_rate=0.1, n_estimators=100, random_state=42)),
])

In [21]:
# Fit the preprocessing steps and the regressor on the training split
pipeline_ct.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [22]:
from sklearn.metrics import r2_score

# Score the fitted pipeline on the held-out split
y_pred = pipeline_ct.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2:.2f}")

R² Score: 0.86


In [23]:
import joblib

# Persist the fitted pipeline (preprocessing + model) to disk
joblib.dump(pipeline_ct, filename='prediction-model4-86.pkl')

['prediction-model4-86.pkl']