In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the house-price table from the CSV that sits next to the notebook,
# then preview the first five rows to confirm it parsed as expected.
df = pd.read_csv('bhp.csv')
df.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [4]:
# Preview the last rows too, to check the end of the file parsed as cleanly as the start.
df.tail()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
13195,Whitefield,5 Bedroom,3453.0,4.0,231.0,5,6689
13196,other,4 BHK,3600.0,5.0,400.0,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.0,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.0,4,10407
13199,Doddathoguru,1 BHK,550.0,1.0,17.0,1,3090


In [5]:
# (rows, columns) of the frame as loaded, before any filtering.
df.shape

(13200, 7)

In [6]:
# Column labels available for the analysis.
df.columns

Index(['location', 'size', 'total_sqft', 'bath', 'price', 'bhk',
       'price_per_sqft'],
      dtype='object')

In [7]:
# Per-column dtypes: check that the numeric columns were parsed as numbers, not strings.
df.dtypes

location           object
size               object
total_sqft        float64
bath              float64
price             float64
bhk                 int64
price_per_sqft      int64
dtype: object

In [8]:
# Number of missing values in each column.
df.isna().sum()

location          0
size              0
total_sqft        0
bath              0
price             0
bhk               0
price_per_sqft    0
dtype: int64

In [18]:
# sns.boxplot(x='location',y='price_per_sqft',data=df)
# plt.xticks(rotation=90)

In [10]:
# Summary statistics of the numeric columns; the min/max rows are what expose implausible values.
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


total_sqft
------------

| Value in table | Meaning                                        |
| -------------- | ---------------------------------------------- |
| mean = 1555    | average home size is 1555 sqft                 |
| min = 1        | smallest home is 1 sqft → **impossible**       |
| max = 52272    | biggest home is 52,272 sqft → **very unusual** |


bath (number of bathrooms)
--------------------------

| Value       | Meaning                                               |
| ----------- | ----------------------------------------------------- |
| mean = 2.69 | average house has about 3 bathrooms                   |
| min = 1     | smallest is 1 bathroom                                |
| max = 40    | a listing shows 40 bathrooms → **wrong/unrealistic**  |


price_per_sqft
--------------

| Value                | Meaning                                                                                                                                           |
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| **mean = 7,920**     | On average, houses cost about ₹7,920 per square foot.                                                                                             |
| **median = 5,438**   | Half the houses cost less than ₹5,438 per sqft and half cost more — this is the typical/central price without being affected by extreme outliers. |
| **max = 12,000,000** | The highest price per sqft is ₹1.2 crore per sqft → this is extremely unrealistic and indicates an outlier or incorrect data entry.               |


In [11]:
# Upper outlier cut-off: the 99.9th percentile of price_per_sqft.
# The bare name on the last line displays the value that was picked.
max_threshold = df['price_per_sqft'].quantile(q=0.999)
max_threshold

50959.36200000098

In [12]:
# Lower outlier cut-off: the 0.1th percentile of price_per_sqft.
# The bare name on the last line displays the value that was picked.
min_threshold = df['price_per_sqft'].quantile(q=0.001)
min_threshold

1366.184

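So we keep only the rows whose price per sqft lies between the 0.1th percentile (1366.184) and the 99.9th percentile (50959.362); anything outside that range is treated as an outlier.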

In [13]:
# df.loc[df['price_per_sqft']<min_threshold]

| Check          | Purpose                                               |
| -------------- | ----------------------------------------------------- |
| total_sqft     | house must be a plausible size                        |
| bath vs bhk    | bathroom count must be plausible for the bedroom count |
| bhk            | number of bedrooms must be reasonable                 |
| price_per_sqft | price must be realistic                               |

Domain rules:

sqft > 300 & sqft < 15000

bath <= bhk + 2

bhk <= 10

price_per_sqft < 25000 (for Bangalore) — note this is stricter than the 99.9th-percentile cut-off of about 50,959 computed above

In [14]:
# df.loc[df['price_per_sqft']>max_threshold]

In [15]:
# Keep only rows whose price_per_sqft lies strictly between the two
# percentile cut-offs; rows at or beyond either threshold are dropped.
df2 = df.loc[
    df['price_per_sqft'].gt(min_threshold)
    & df['price_per_sqft'].lt(max_threshold)
]

In [16]:
# (rows, columns) after the percentile-based outlier filter.
df2.shape

(13172, 7)

In [17]:
# Shape of the unfiltered frame, for comparison with df2.shape to see how many rows were removed.
df.shape

(13200, 7)