# Cleaning Missing values and Outlier removal

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df=pd.read_csv('Bengaluru_House_Data.csv')

In [4]:
df.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df.shape

(13320, 9)

In [6]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
df['area_type'].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

In [8]:
df=df.drop(['area_type','availability','society'],axis=1)
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [9]:
df.shape

(13320, 6)

In [10]:
df.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [11]:
df=df.drop(['balcony'],axis=1)

In [12]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [13]:
df=df.dropna()
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [14]:
df.shape

(13246, 5)

In [15]:
df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [16]:
df['bedrooms'] = df['size'].apply(lambda y:int(y.split()[0]))
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bedrooms
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [17]:

df=df.drop(['size'],axis=1)

In [18]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms
0,Electronic City Phase II,1056,2.0,39.07,2
1,Chikka Tirupathi,2600,5.0,120.0,4
2,Uttarahalli,1440,2.0,62.0,3
3,Lingadheeranahalli,1521,3.0,95.0,3
4,Kothanur,1200,2.0,51.0,2


In [19]:
df.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [20]:
def value_check(x):
    try:
        float(x)
    except:
        return False
    return True

In [21]:
df[~df['total_sqft'].apply(value_check)].head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms
30,Yelahanka,2100 - 2850,4.0,186.0,4
122,Hebbal,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2
165,Sarjapur,1145 - 1340,2.0,43.49,2
188,KR Puram,1015 - 1540,2.0,56.8,2


In [22]:
def value_correction(y):
    a=y.split('-')
    if(len(a)==2):
        return(float(a[0])+float(a[1]))/2
    try:
        return float(y)
    except:
        return None

In [23]:
df['total_sqft']=df['total_sqft'].apply(value_correction)

In [24]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms
0,Electronic City Phase II,1056.0,2.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,120.0,4
2,Uttarahalli,1440.0,2.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,95.0,3
4,Kothanur,1200.0,2.0,51.0,2


In [25]:
df[~df['total_sqft'].apply(value_check)].head().sum()

location      0.0
total_sqft    0.0
bath          0.0
price         0.0
bedrooms      0.0
dtype: float64

In [26]:
df2=df['location'].value_counts()

In [27]:
df2

Whitefield           534
Sarjapur  Road       392
Electronic City      302
Kanakpura Road       266
Thanisandra          233
                    ... 
Vidyapeeta             1
Maruthi Extension      1
Okalipura              1
Old Town               1
Abshot Layout          1
Name: location, Length: 1304, dtype: int64

In [28]:
df2=df2[df2>9]

In [29]:
len(df2)

253

In [30]:
def more_data(x):
    if(x in df2):
        return x
    else:
        return None

In [31]:
df['location']=df['location'].apply(more_data)

In [32]:
len(df['location'].unique())

254

In [33]:
df[df['bedrooms']>5]

Unnamed: 0,location,total_sqft,bath,price,bedrooms
9,,1020.0,6.0,370.0,6
45,HSR Layout,600.0,9.0,200.0,8
58,Murugeshpalya,1407.0,4.0,150.0,6
64,Bommanahalli,3000.0,8.0,140.0,8
68,,1350.0,7.0,85.0,8
...,...,...,...,...,...
13221,,1178.0,9.0,75.0,9
13226,Raja Rajeshwari Nagar,6000.0,8.0,215.0,8
13277,,1400.0,7.0,218.0,7
13279,,1200.0,5.0,130.0,6


In [34]:
df['price_per_sqrft']=(df['price']*100000)/df['total_sqft']

In [35]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft
0,Electronic City Phase II,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,51.0,2,4250.0


In [36]:
df['bedrooms'].describe()

count    13246.000000
mean         2.801902
std          1.295758
min          1.000000
25%          2.000000
50%          3.000000
75%          3.000000
max         43.000000
Name: bedrooms, dtype: float64

In [37]:
def outliers(x):
    mean=np.mean(df.bedrooms)
    std=np.std(df.bedrooms)
    if(x>mean-(3*std) and  x<mean+(3*std)):
        return x
    else:
        return None    

In [38]:
df['bedrooms']=df['bedrooms'].apply(outliers)

In [39]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft
0,Electronic City Phase II,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,1200.0,2.0,51.0,2.0,4250.0


In [40]:
len(df)

13246

In [41]:
df.isnull().sum()

location           2776
total_sqft           46
bath                  0
price                 0
bedrooms            269
price_per_sqrft      46
dtype: int64

In [42]:
df=df.dropna()

In [43]:
df[df['bedrooms']==45]

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft


In [44]:
df['price_per_sqrft'].describe()

count    1.028900e+04
mean     7.448882e+03
std      1.183494e+05
min      3.714286e+02
25%      4.210526e+03
50%      5.309735e+03
75%      6.933333e+03
max      1.200000e+07
Name: price_per_sqrft, dtype: float64

In [45]:
df[df['location']=='Chandapura']

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft
101,Chandapura,650.0,1.0,17.00,2.0,2615.384615
240,Chandapura,645.0,1.0,16.45,1.0,2550.387597
242,Chandapura,645.0,1.0,16.45,1.0,2550.387597
250,Chandapura,800.0,1.0,20.00,2.0,2500.000000
289,Chandapura,1015.0,2.0,25.88,2.0,2549.753695
...,...,...,...,...,...,...
12849,Chandapura,740.0,1.0,22.00,2.0,2972.972973
12927,Chandapura,876.0,2.0,28.47,2.0,3250.000000
13117,Chandapura,1200.0,3.0,65.00,3.0,5416.666667
13147,Chandapura,1095.0,2.0,28.00,3.0,2557.077626


In [46]:
df[df['location']=='Yelahanka New Town']

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft
816,Yelahanka New Town,650.0,1.0,33.0,1.0,5076.923077
1235,Yelahanka New Town,500.0,1.0,20.0,1.0,4000.0
1428,Yelahanka New Town,440.0,1.0,16.5,1.0,3750.0
1792,Yelahanka New Town,550.0,2.0,26.0,2.0,4727.272727
2404,Yelahanka New Town,960.0,2.0,18.0,1.0,1875.0
2968,Yelahanka New Town,1610.0,3.0,92.0,3.0,5714.285714
3248,Yelahanka New Town,1700.0,3.0,90.0,3.0,5294.117647
3453,Yelahanka New Town,650.0,1.0,20.0,1.0,3076.923077
3525,Yelahanka New Town,1200.0,2.0,130.0,2.0,10833.333333
3741,Yelahanka New Town,350.0,1.0,13.5,1.0,3857.142857


In [47]:
def location_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqrft)
        st = np.std(subdf.price_per_sqrft)
        reduced_df = subdf[(subdf.price_per_sqrft>(m-st)) & (subdf.price_per_sqrft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
df2 =location_outliers(df)
df2.shape

(8484, 6)

In [48]:
df.shape

(10289, 6)

In [49]:
len(df2[df2.bath>df2.bedrooms+2])

4

In [50]:
df=df2[df2.bath<df2.bedrooms+2]

In [51]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms,price_per_sqrft
0,Devarachikkanahalli,1250.0,2.0,44.0,3.0,3520.0
1,Devarachikkanahalli,1250.0,2.0,40.0,2.0,3200.0
2,Devarachikkanahalli,1200.0,2.0,83.0,2.0,6916.666667
3,Devarachikkanahalli,1170.0,2.0,40.0,2.0,3418.803419
4,Devarachikkanahalli,1425.0,2.0,65.0,3.0,4561.403509


In [52]:
df=df.drop(['price_per_sqrft'],axis=1)
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms
0,Devarachikkanahalli,1250.0,2.0,44.0,3.0
1,Devarachikkanahalli,1250.0,2.0,40.0,2.0
2,Devarachikkanahalli,1200.0,2.0,83.0,2.0
3,Devarachikkanahalli,1170.0,2.0,40.0,2.0
4,Devarachikkanahalli,1425.0,2.0,65.0,3.0


## One hot encoding

In [53]:
df3=pd.get_dummies(df.location)

In [54]:
df=pd.concat([df,df3],axis=1)
df.head()

Unnamed: 0,location,total_sqft,bath,price,bedrooms,Devarachikkanahalli,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,Devarachikkanahalli,1250.0,2.0,44.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Devarachikkanahalli,1250.0,2.0,40.0,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Devarachikkanahalli,1200.0,2.0,83.0,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Devarachikkanahalli,1170.0,2.0,40.0,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Devarachikkanahalli,1425.0,2.0,65.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
df=df.drop(['location'],axis=1)

In [56]:
df.head()

Unnamed: 0,total_sqft,bath,price,bedrooms,Devarachikkanahalli,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,44.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1250.0,2.0,40.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,2.0,83.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1170.0,2.0,40.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1425.0,2.0,65.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
y=df.price
X = df.drop(['price'],axis=1)

In [58]:
X.head()

Unnamed: 0,total_sqft,bath,bedrooms,Devarachikkanahalli,1st Block Jayanagar,1st Block Koramangala,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1250.0,2.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,2.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1170.0,2.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1425.0,2.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
y.head()

0    44.0
1    40.0
2    83.0
3    40.0
4    65.0
Name: price, dtype: float64

## Model Building

In [60]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [61]:
from sklearn.linear_model import LinearRegression
prediction=LinearRegression()
prediction.fit(X_train,y_train)
prediction.score(X_train,y_train)

0.8583721027965223

In [62]:
from sklearn import tree
data = tree.DecisionTreeRegressor()
data = data.fit(X_test,y_test)
data.score(X_test,y_test)
data.score(X_train,y_train)

0.6714664895303007

In [63]:
from sklearn.linear_model import LinearRegression
prediction=LinearRegression()
prediction.fit(X,y)
prediction.score(X_train,y_train)

0.8566034346539164

In [64]:
def price_prediction(location,sqrft,bath,bedrooms):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqrft
    x[1] = bath
    x[2] = bedrooms
    if loc_index >= 0:
        x[loc_index] = 1

    return prediction.predict([x])[0]

In [65]:
price_prediction('1st Phase JP Nagar',1000, 2, 2)

81.89287099544268

In [66]:
price_prediction('1st Block Jayanagar',1000,2,2)

183.9324476028291

In [67]:
price_prediction('1st Block Koramangala',1000,2,2)

148.6224384470877

In [68]:
price_prediction('Indira Nagar',1000,3,3)

159.8534889391454

In [69]:
price_prediction('Indira Nagar',300,1,1)

100.92023889213661

In [70]:
import pickle
with open('Banglore_model.pickle','wb')as f:
    pickle.dump(prediction,f)

In [71]:
import json
columns = {
    'columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))