In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
data = pd.read_csv("Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
data.shape

(13320, 9)

In [5]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
data.groupby("area_type")["area_type"].agg("count")

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [7]:
data.drop(['availability','society','balcony','area_type'],axis=1,inplace=True)
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [8]:
data.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [9]:
round(np.mean(data.bath))

3

In [10]:
data["bath"].fillna(round(np.mean(data.bath)),inplace=True)
data["size"].fillna(round(np.mean(data.size)),inplace=True)
data.dropna(inplace=True)
data.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [11]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [12]:
data.dtypes

location       object
size           object
total_sqft     object
bath          float64
price         float64
dtype: object

In [13]:
data["size"].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', 66600, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [14]:
type(data.iloc[:,1])

pandas.core.series.Series

In [15]:
data["bkh"] = (data["size"].str.split(expand=True)[:][0])
data["bkh"] = pd.to_numeric(data.bkh, errors="coerce")
data.bkh.unique()

array([ 2.,  4.,  3.,  6.,  1.,  8.,  7.,  5., 11.,  9., nan, 27., 10.,
       19., 16., 43., 14., 12., 13., 18.])

In [16]:
data.isnull().sum()

location       0
size           0
total_sqft     0
bath           0
price          0
bkh           16
dtype: int64

In [17]:
data.dropna(inplace=True)
data.drop("size",axis=1,inplace=True)

In [18]:
data.isnull().sum()

location      0
total_sqft    0
bath          0
price         0
bkh           0
dtype: int64

In [19]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bkh
0,Electronic City Phase II,1056,2.0,39.07,2.0
1,Chikka Tirupathi,2600,5.0,120.0,4.0
2,Uttarahalli,1440,2.0,62.0,3.0
3,Lingadheeranahalli,1521,3.0,95.0,3.0
4,Kothanur,1200,2.0,51.0,2.0


In [20]:
data.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [21]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [22]:
data[~data["total_sqft"].apply(is_float)].head(10)

Unnamed: 0,location,total_sqft,bath,price,bkh
30,Yelahanka,2100 - 2850,4.0,186.0,4.0
56,Devanahalli,3010 - 3410,3.0,192.0,4.0
81,Hennur Road,2957 - 3450,3.0,224.5,4.0
122,Hebbal,3067 - 8156,4.0,477.0,4.0
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2.0
165,Sarjapur,1145 - 1340,2.0,43.49,2.0
188,KR Puram,1015 - 1540,2.0,56.8,2.0
224,Devanahalli,1520 - 1740,3.0,74.82,3.0
410,Kengeri,34.46Sq. Meter,1.0,18.5,1.0
549,Hennur Road,1195 - 1440,2.0,63.77,2.0


In [23]:
def convert_to_num(x):
    tokens = x.split("-")
    if len(tokens)==2 :
        return (float(tokens[0]) + float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [24]:
convert_to_num("37-457")

247.0

In [25]:
convert_to_num("45")

45.0

In [26]:
data["total_sqft"] = data["total_sqft"].apply(convert_to_num)

In [27]:
data.isnull().sum()

location       0
total_sqft    46
bath           0
price          0
bkh            0
dtype: int64

In [28]:
data.dropna(inplace=True)

In [29]:
data.dtypes

location       object
total_sqft    float64
bath          float64
price         float64
bkh           float64
dtype: object

In [30]:
data.shape

(13257, 5)

In [31]:
new_data = data.copy()

In [32]:
new_data["price_per_sqft"] = new_data["price"]*100000/new_data["total_sqft"]
new_data.head()

Unnamed: 0,location,total_sqft,bath,price,bkh,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,1200.0,2.0,51.0,2.0,4250.0


In [33]:
len(new_data.location.unique())

1298

In [34]:
new_data.location = new_data.location.apply(lambda x : x.strip())
location_stats = new_data.groupby("location")["location"].agg("count").sort_values(ascending=False)
location_stats

location
Whitefield               538
Sarjapur  Road           397
Electronic City          304
Kanakpura Road           271
Thanisandra              236
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Kannur                     1
whitefiled                 1
Name: location, Length: 1287, dtype: int64

In [35]:
len(location_stats[location_stats<=10])

1047

In [36]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
Sadashiva Nagar          10
Naganathapura            10
Basapura                 10
Nagadevanahalli          10
Kalkere                  10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Kannur                    1
whitefiled                1
Name: location, Length: 1047, dtype: int64

In [37]:
len(new_data.location.unique())

1287

In [38]:
new_data.location = new_data.location.apply(lambda x: "other" if x in location_stats_less_than_10 else x)
len(new_data.location.unique())

241

In [39]:
new_data.head(10)

Unnamed: 0,location,total_sqft,bath,price,bkh,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,1200.0,2.0,51.0,2.0,4250.0
5,Whitefield,1170.0,2.0,38.0,2.0,3247.863248
6,Old Airport Road,2732.0,4.0,204.0,4.0,7467.057101
7,Rajaji Nagar,3300.0,4.0,600.0,4.0,18181.818182
8,Marathahalli,1310.0,3.0,63.25,3.0,4828.244275
9,other,1020.0,6.0,370.0,6.0,36274.509804


In [40]:
new_data[(new_data.total_sqft/new_data.bkh)<300].head()

Unnamed: 0,location,total_sqft,bath,price,bkh,price_per_sqft
9,other,1020.0,6.0,370.0,6.0,36274.509804
45,HSR Layout,600.0,9.0,200.0,8.0,33333.333333
58,Murugeshpalya,1407.0,4.0,150.0,6.0,10660.98081
68,Devarachikkanahalli,1350.0,7.0,85.0,8.0,6296.296296
70,other,500.0,3.0,100.0,3.0,20000.0


In [41]:
new_data.shape

(13257, 6)

In [42]:
final_data = new_data[(new_data.total_sqft/new_data.bkh)>300]
final_data.shape

(12331, 6)

In [43]:
final_data.price_per_sqft.describe()

count     12331.000000
mean       6211.250024
std        4046.224800
min         267.829813
25%        4200.000000
50%        5268.199234
75%        6827.794562
max      176470.588235
Name: price_per_sqft, dtype: float64

In [44]:
def remove_pps_outliers(data):
    data_out = pd.DataFrame()
    for key, subdf in data.groupby("location"):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        data_out = pd.concat([data_out,reduced_df],ignore_index=True)
    return data_out

In [45]:
final_data = remove_pps_outliers(final_data)
final_data.shape

(10067, 6)

In [46]:
final_data = final_data[final_data.bath<final_data.bkh+2]
final_data.shape

(9969, 6)

In [47]:
final_data.drop("price_per_sqft",axis=1,inplace=True)
final_data.head()

Unnamed: 0,location,total_sqft,bath,price,bkh
0,1st Block Jayanagar,2850.0,4.0,428.0,4.0
1,1st Block Jayanagar,1630.0,3.0,194.0,3.0
2,1st Block Jayanagar,1875.0,2.0,235.0,3.0
3,1st Block Jayanagar,1200.0,2.0,130.0,3.0
4,1st Block Jayanagar,1235.0,2.0,148.0,2.0


In [48]:
dummies = pd.get_dummies(final_data.location,dtype=int)
dummies.head()

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
final_data = pd.concat([final_data,dummies.drop("other",axis=1)],axis=1)
final_data.head()

Unnamed: 0,location,total_sqft,bath,price,bkh,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.0,4.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,130.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,148.0,2.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
final_data.drop("location",axis=1,inplace=True)


In [51]:
final_data.shape

(9969, 244)

In [52]:
x = final_data.drop("price",axis=1)
y = final_data["price"].values

In [53]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.22,random_state=0)

In [54]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train.values,y_train)
model.score(x_test.values,y_test)

0.8119504676358643

In [55]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5,test_size=0.20,random_state=0)

cross_val_score(LinearRegression(),x,y,cv=cv)

array([0.81440928, 0.80456941, 0.8610965 , 0.8330794 , 0.80439439])

In [56]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridcv(x,y):
    algos = {
        "linear_regression":{
            "model": LinearRegression(),
            "params" : {
                "positive" :[True,False]
            }

        
        },

        "lasso":{
            "model":Lasso(),
            "params":{
                "alpha":[1,2],
                "selection":["random","cyclic"]
            }
        },
        "decision_tree":{
            "model" : DecisionTreeRegressor(),
            "params": {
                "criterion" :["mse","friedman_mse"],
                "splitter" : ["best","random"]
            }
        }
    }
    scores = []

    cv = ShuffleSplit(n_splits=5,test_size=0.22,random_state=0)
    for algo_name ,config in algos.items():
        gs = GridSearchCV(config["model"],config["params"],cv=cv,return_train_score=False)
        gs.fit(x,y)
        scores.append({
            "model":algo_name,
            "best_score":gs.best_score_,
            "best_params" : gs.best_params_
        })
    return pd.DataFrame(scores,columns=["model","best_score","best_params"])

In [57]:
find_best_model_using_gridcv(x,y)



Unnamed: 0,model,best_score,best_params
0,linear_regression,0.80572,{'positive': True}
1,lasso,0.680928,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.697571,"{'criterion': 'mse', 'splitter': 'best'}"


In [58]:
def predict_price(location,sqft,bath,bkh):
    loc_index = np.where(x.columns==location)[0][0]

    a = np.zeros(len(x.columns))
    a[0] = sqft
    a[1] = bath
    a[2] = bkh
    if loc_index >= 0:
        a[loc_index] = 1
    return model.predict([a])[0]

In [59]:
predict_price("1st Phase JP Nagar",1000,2,2)

98.77132557248859

In [60]:
predict_price("1st Phase JP Nagar",1000,3,3)

99.5274429729302

In [61]:
predict_price("Indira Nagar",1000,2,2)

171.73876828712966

In [62]:
predict_price("Indira Nagar",1000,3,3)

172.49488568757127

In [63]:
import pickle
with open("banglore_home_prices_model.pickle","wb") as f:
    pickle.dump(model,f)

In [64]:
import json 
columns = {
    "data_columns" : [col.lower() for col in x.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))