In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Collection

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [4]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


# Data Cleaning

In [6]:
df.groupby("area_type")['area_type'].agg("count")

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [7]:
df.groupby("society")['society'].agg("count")

society
3Codeli    2
7 ise P    1
A idse     2
A rtsai    1
ACersd     1
          ..
Zonce E    2
Zostaa     3
i1ncyRe    1
i1odsne    1
i1rtsCo    3
Name: society, Length: 2688, dtype: int64

The features like area_type, society, balcony, and availability can't affect too much our price so wee will drop these columns.

In [8]:
df2= df.drop(["area_type", "society","balcony","availability"],axis='columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [9]:
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [10]:
df2['bath']= df2['bath'].fillna(df.bath.median())

In [11]:
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath           0
price          0
dtype: int64

In [12]:
df2=df2.dropna()
df2.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [13]:
df2.shape

(13303, 5)

In [14]:
df2.head(2)

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0


In [15]:
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))

In [16]:
df2.head(2)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4


In [17]:
df3 = df2.drop('size', axis= 'columns')
df3.head(2)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4


In [18]:
df3['bhk'].value_counts() #some houses with 43 bedrooms which is rarely not common

2     5528
3     4856
4     1417
1      656
5      356
6      221
7      100
8       89
9       54
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: bhk, dtype: int64

In [19]:
df3[df3.bhk>20] # look at this the sqft of house is not so large as number of bedrooms & bathrooms

Unnamed: 0,location,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,8000,27.0,230.0,27
4684,Munnekollal,2400,40.0,660.0,43


In [20]:
df3. total_sqft.unique() #look at the intervals 1133 - 1384, i dont want these ranges to my model so i will take the average of these intervals.

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [21]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [22]:
df4= df3.copy()
df4[~df4['total_sqft'].apply(is_float)].head(10)
#the ~(negate) sign will show the ony not float values otherwise it will return me whole dataset

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.0,4
56,Devanahalli,3010 - 3410,2.0,192.0,4
81,Hennur Road,2957 - 3450,2.0,224.5,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2
224,Devanahalli,1520 - 1740,2.0,74.82,3
410,Kengeri,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,1195 - 1440,2.0,63.77,2


In [23]:
def convert_sqft_to_avgnum(x):
    tokens = x.split('-')
    if len(tokens)==2:
        return(float(tokens[0])+float(tokens[1])/2)
    try:
        return float(x)
    except:
        return None

In [24]:
convert_sqft_to_avgnum('2100 - 2850') # return average

3525.0

In [25]:
convert_sqft_to_avgnum('1195')# return float

1195.0

In [26]:
convert_sqft_to_avgnum('34.46Sq. Meter')# return None

In [27]:
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_avgnum)
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [28]:
df4[~df4['total_sqft'].apply(is_float)].head(5)

Unnamed: 0,location,total_sqft,bath,price,bhk


In [29]:
df4["total_sqft"].isnull().sum()

46

In [30]:
df4.shape

(13303, 5)

In [31]:
df4 = df4.dropna()

In [32]:
df4.shape

(13257, 5)

In [33]:
df4.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [34]:
df5= df4.copy()
df5.head(2)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4


In [35]:
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556


now we have to work on location column, because location is categorical column and as the no of location is inceases our dataset will become more complicated due to to the curse of high dimensionality during one hot incoding.

In [36]:
len(df5.location.unique())# boom! look at the this big no 

1298

In [37]:
location_stats= df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats                # many locations have only 1 records

location
Whitefield             537
Sarjapur  Road         397
Electronic City        302
Kanakpura Road         271
Thanisandra            233
                      ... 
 Banaswadi               1
Kanakadasa Layout        1
Kanakapur main road      1
Kanakapura  Rod          1
whitefiled               1
Name: location, Length: 1298, dtype: int64

In [38]:
len(location_stats[location_stats <=10]) #1058 locations has less than eqaul 10 records

1058

In [39]:
location_stats_less_than_10 = location_stats[location_stats <=10]

In [40]:
df5['location']= df5.location.apply(lambda x : 'others' if x in location_stats_less_than_10 else x)
len(df5.location.unique())   # now location column drops to 241 columns from 1298  

241

# Outlier Detection

Outlier Detection is a process of finding errors and unexpected records in the dataset for example a men of hiegh 20 feet talls or 1000 kg weight which is rarely not possible

-In a real state domain the minimum size of bedroom is 300 sqft, now  in any case where bedroom size is less than 300sqft, we treat it like an Outlier and remove them from the dataset.

In [41]:
df5.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556


we divide total_sqft by nnumber of bedrooms
for example = 600/6= 100 sqft

In [42]:
df5[df5.total_sqft/df5.bhk <300].head()

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
9,others,1020.0,6.0,370.0,6,36274.509804
45,HSR Layout,600.0,9.0,200.0,8,33333.333333
58,Murugeshpalya,1407.0,4.0,150.0,6,10660.98081
68,others,1350.0,7.0,85.0,8,6296.296296
70,others,500.0,3.0,100.0,3,20000.0


In [43]:
df5.shape

(13257, 6)

In [44]:
df6= df5[~(df5.total_sqft/df5.bhk <300)]
df6.shape

(12513, 6)

In [45]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m= np.mean(subdf.price_per_sqft)
        std = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft >=(m-std)) & (subdf.price_per_sqft <=(m+std))]
        df_out =pd.concat([df_out, reduced_df], ignore_index= True)
    return df_out

In [46]:
df7 = remove_pps_outliers(df6)
df7.shape

(10272, 6)

In [47]:
df7[df7.bath > df7.bhk]

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
23,1st Phase JP Nagar,2065.0,4.0,210.0,3,10169.491525
29,1st Phase JP Nagar,840.0,2.0,50.0,1,5952.380952
38,1st Phase JP Nagar,2615.0,5.0,222.0,4,8489.483748
49,2nd Stage Nagarbhavi,3000.0,8.0,451.0,6,15033.333333
50,2nd Stage Nagarbhavi,2400.0,8.0,450.0,6,18750.000000
...,...,...,...,...,...,...
10205,others,600.0,3.0,72.0,2,12000.000000
10223,others,2710.0,5.0,142.0,4,5239.852399
10233,others,6652.0,6.0,660.0,4,9921.828022
10240,others,6688.0,6.0,700.0,4,10466.507177


There are some records that have more than 2 bathrooms as compare to bedroom which is usually not happened, so we remove these records.

In [48]:
df7[df7.bath > df7.bhk +2]

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
1640,Chikkabanavar,2460.0,7.0,80.0,4,3252.03252
5254,Nagasandra,7000.0,8.0,450.0,4,6428.571429
6730,Thanisandra,1806.0,6.0,116.0,3,6423.03433
8428,others,11338.0,9.0,1000.0,6,8819.897689


In [49]:
df8 = df7[~(df7.bath > df7.bhk +2)]
df8.shape

(10268, 6)

Now our dataset looks good we reomove outliers, clean data and do some feature engineering as well. Now its time to train our model.

# Model Creation

In [50]:
df9= df8.drop(['price_per_sqft'], axis='columns')

In [51]:
df9.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Devarachikkanahalli,1250.0,2.0,44.0,3
1,Devarachikkanahalli,1250.0,2.0,40.0,2
2,Devarachikkanahalli,1200.0,2.0,83.0,2
3,Devarachikkanahalli,1170.0,2.0,40.0,2
4,Devarachikkanahalli,1425.0,2.0,65.0,3


In [52]:
dummies = pd.get_dummies(df9.location)

In [53]:
df10= pd.concat([df9.drop('location',axis='columns'),dummies], axis= 'columns')
df10.head()

Unnamed: 0,total_sqft,bath,price,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,others
0,1250.0,2.0,44.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1250.0,2.0,40.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,2.0,83.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1170.0,2.0,40.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1425.0,2.0,65.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
X = df10.drop('price', axis='columns')
X.head()

Unnamed: 0,total_sqft,bath,bhk,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,others
0,1250.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1250.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1170.0,2.0,2,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1425.0,2.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
Y = df10['price']
Y.head()

0    44.0
1    40.0
2    83.0
3    40.0
4    65.0
Name: price, dtype: float64

In [56]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size = 0.2, random_state = 100)

In [57]:
from sklearn.linear_model import LinearRegression
model= LinearRegression()
model.fit(X_train,Y_train)
model.score(X_test,Y_test)

0.8694638906269937

In [58]:
def predict_price():
    location = input('Enter Location : ')
    check_location = True
    while check_location:
        if location in X.columns:
            loc_index = np.where(X.columns == location)[0][0]
            check_location = False
        else:
            print("\033[1m"+"Location not Found, Please Enter the Location From: ")
            print(X.columns[3:])
            location = input('Enter Location: ')
################################################    
    check_sqft= True
    while check_sqft:   
        sqft = input('Enter Square feet: ')
        try:
            int(sqft)
            check_sqft= False
        except:
            print('Square Feet must be numeric')
###################################################
    check_bathroom= True
    while check_bathroom:   
        bath = input('Enter No. of bathrooms: ')
        try:
            int(bath)
            check_bathroom= False
        except:
            print('No. of Bathrooms must be numeric')
###########################################            
    check_bedroom= True
    while check_bedroom:   
        bhk = input('Enter No. of bedrooms: ')
        try:
            int(bhk)
            check_bedroom= False
        except:
            print('No. of Bedrooms must be numeric')
###################################################    
    x = np.zeros(len(X.columns))
    x[0]= sqft
    x[1]= bath
    x[2]= bhk
    if loc_index >=0:
        x[loc_index]= 1
    price = round(model.predict([x])[0]*100000, 0)
    print()
    print("\033[1m"+'\033[91m'+ 'The Price is',int(price), "RS.")

In [59]:
predict_price()

Enter Location : 5th Phase JP Nagar
Enter Square feet: 1200
Enter No. of bathrooms: 5
Enter No. of bedrooms: 7

[1m[91mThe Price is 4364248 RS.
