In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Project

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('bengaluru_house_prices.csv')
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


# Cleaning the dataset 

__Removing unimportant features__

In [5]:
df1 = df.drop(['society','area_type','availability'],1)

__Removing NaN values__

In [6]:
df1.isnull().sum()
df1['balcony'] = df1.balcony.fillna(0)
df1.isnull().sum()
df2 = df1.dropna()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

location       1
size          16
total_sqft     0
bath          73
balcony        0
price          0
dtype: int64

__Organizing bedroom column__

In [7]:
df2['size'] = df2['size'].apply(lambda x: x.split(' ')[0])
df2.rename(columns={'size':'bedrooms'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['size'] = df2['size'].apply(lambda x: x.split(' ')[0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


__Organizing the squared feet column__

In [8]:
df2.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [9]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [10]:
df2.total_sqft = df2.total_sqft.apply(convert_sqft_to_num)
df2 = df2[df2.total_sqft.notnull()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


__Converting data types__

In [11]:
df2['bedrooms'] = pd.to_numeric(df2['bedrooms'])
df2['price'] = df2['price'] * 100000  # Putting the right scaling in the price column

__Reducing the number of location categories__

In [12]:
location_stats = df2.groupby('location')['location'].agg('count')
location_stats.sort_values(ascending=False)
location_stats_lt_10 = location_stats[location_stats<=10]
df2['location'] = df2['location'].apply(lambda x: 'other' if x in location_stats_lt_10 else x)

location
Whitefield                              532
Sarjapur  Road                          392
Electronic City                         302
Kanakpura Road                          264
Thanisandra                             232
                                       ... 
Kodanda Reddy Layout                      1
Kirloskar layout, Basaveshwarnagar        1
Kengeri Satellite Town Stage II           1
Kengeri Satellite Town KHB Apartment      1
 Anekal                                   1
Name: location, Length: 1298, dtype: int64

__Removing outliers__

In [13]:
df2['price/sqft'] = df2['price']/df2['total_sqft']
df2['sqft/bedroom'] = df2['total_sqft']/df2['bedrooms']
df2['price/sqft.bedroom'] = df2['price']/(df2['total_sqft']*df2['bedrooms'])

In [14]:
df2

Unnamed: 0,location,bedrooms,total_sqft,bath,balcony,price,price/sqft,sqft/bedroom,price/sqft.bedroom
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0,3699.810606,528.00,1849.905303
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0,4615.384615,650.00,1153.846154
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0,4305.555556,480.00,1435.185185
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0,6245.890861,507.00,2081.963620
4,Kothanur,2,1200.0,2.0,1.0,5100000.0,4250.000000,600.00,2125.000000
...,...,...,...,...,...,...,...,...,...
13315,Whitefield,5,3453.0,4.0,0.0,23100000.0,6689.834926,690.60,1337.966985
13316,other,4,3600.0,5.0,0.0,40000000.0,11111.111111,900.00,2777.777778
13317,Raja Rajeshwari Nagar,2,1141.0,2.0,1.0,6000000.0,5258.545136,570.50,2629.272568
13318,Padmanabhanagar,4,4689.0,4.0,1.0,48800000.0,10407.336319,1172.25,2601.834080


In [15]:
stats = df2['sqft/bedroom'].describe()
m = stats[1]
st = stats[2]
df2 = df2[(df2['sqft/bedroom'] <= m+st) & (df2['sqft/bedroom'] > m-st)]
df2

Unnamed: 0,location,bedrooms,total_sqft,bath,balcony,price,price/sqft,sqft/bedroom,price/sqft.bedroom
0,Electronic City Phase II,2,1056.0,2.0,1.0,3907000.0,3699.810606,528.000000,1849.905303
1,Chikka Tirupathi,4,2600.0,5.0,3.0,12000000.0,4615.384615,650.000000,1153.846154
2,Uttarahalli,3,1440.0,2.0,3.0,6200000.0,4305.555556,480.000000,1435.185185
3,Lingadheeranahalli,3,1521.0,3.0,1.0,9500000.0,6245.890861,507.000000,2081.963620
4,Kothanur,2,1200.0,2.0,1.0,5100000.0,4250.000000,600.000000,2125.000000
...,...,...,...,...,...,...,...,...,...
13314,Green Glen Layout,3,1715.0,3.0,3.0,11200000.0,6530.612245,571.666667,2176.870748
13315,Whitefield,5,3453.0,4.0,0.0,23100000.0,6689.834926,690.600000,1337.966985
13316,other,4,3600.0,5.0,0.0,40000000.0,11111.111111,900.000000,2777.777778
13317,Raja Rajeshwari Nagar,2,1141.0,2.0,1.0,6000000.0,5258.545136,570.500000,2629.272568


In [16]:
def remove_outliers(df, group, feature):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby(group):
        m = np.mean(subdf[feature])
        st = np.std(subdf[feature])
        reduced_df = subdf[(subdf[feature]>(m-st)) & (subdf[feature]<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

In [17]:
df3 = remove_outliers(df2, 'location', 'price/sqft')
df3 = remove_outliers(df3, 'location','price/sqft.bedroom')

df3

Unnamed: 0,location,bedrooms,total_sqft,bath,balcony,price,price/sqft,sqft/bedroom,price/sqft.bedroom
0,Devarachikkanahalli,2,1250.0,2.0,2.0,4000000.0,3200.000000,625.0,1600.000000
1,Devarachikkanahalli,2,1170.0,2.0,2.0,4000000.0,3418.803419,585.0,1709.401709
2,Devarachikkanahalli,3,1425.0,2.0,2.0,6500000.0,4561.403509,475.0,1520.467836
3,Devarachikkanahalli,2,947.0,2.0,2.0,4300000.0,4540.654699,473.5,2270.327350
4,Devarachikkanahalli,2,1130.0,2.0,2.0,3600000.0,3185.840708,565.0,1592.920354
...,...,...,...,...,...,...,...,...,...
7910,other,6,1200.0,5.0,0.0,13000000.0,10833.333333,200.0,1805.555556
7911,other,1,812.0,1.0,0.0,2600000.0,3201.970443,812.0,3201.970443
7912,other,3,1440.0,2.0,2.0,6393000.0,4439.583333,480.0,1479.861111
7913,other,2,1075.0,2.0,2.0,4800000.0,4465.116279,537.5,2232.558140


In [18]:
df4 = df3[df3.bedrooms + 2 >= df3.bath]
df_model = df4.drop(['price/sqft','sqft/bedroom','price/sqft.bedroom'],1)
df_model

Unnamed: 0,location,bedrooms,total_sqft,bath,balcony,price
0,Devarachikkanahalli,2,1250.0,2.0,2.0,4000000.0
1,Devarachikkanahalli,2,1170.0,2.0,2.0,4000000.0
2,Devarachikkanahalli,3,1425.0,2.0,2.0,6500000.0
3,Devarachikkanahalli,2,947.0,2.0,2.0,4300000.0
4,Devarachikkanahalli,2,1130.0,2.0,2.0,3600000.0
...,...,...,...,...,...,...
7910,other,6,1200.0,5.0,0.0,13000000.0
7911,other,1,812.0,1.0,0.0,2600000.0
7912,other,3,1440.0,2.0,2.0,6393000.0
7913,other,2,1075.0,2.0,2.0,4800000.0


# __Model Building__

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Lasso

# Getting dummies

df = pd.get_dummies(df_model, columns = ['location'],drop_first=True)    
X = df.drop(['price'],1)
y = df['price']

Unnamed: 0,bedrooms,total_sqft,bath,balcony,price,location_1st Block Jayanagar,location_1st Phase JP Nagar,location_2nd Phase Judicial Layout,location_2nd Stage Nagarbhavi,location_5th Block Hbr Layout,...,location_Vishveshwarya Layout,location_Vishwapriya Layout,location_Vittasandra,location_Whitefield,location_Yelachenahalli,location_Yelahanka,location_Yelahanka New Town,location_Yelenahalli,location_Yeshwanthpur,location_other
0,2,1250.0,2.0,2.0,4000000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1170.0,2.0,2.0,4000000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1425.0,2.0,2.0,6500000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,947.0,2.0,2.0,4300000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,1130.0,2.0,2.0,3600000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7910,6,1200.0,5.0,0.0,13000000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7911,1,812.0,1.0,0.0,2600000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7912,3,1440.0,2.0,2.0,6393000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7913,2,1075.0,2.0,2.0,4800000.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Grid Searching

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'Lasso': Lasso()    
}

params = {
    'LinearRegression': 
    {
        'normalize':[True,False]        
    },
    'DecisionTreeRegressor':
    {
        'criterion':['mse','friedman_mse','mae'],
        'splitter':['best','random']    
    },
    'Lasso':
    {
        'alpha':[1,2],
        'selection':['random','cyclic']
    }
}

scores = []
model = []

for name in models.keys():
    est = models[name]
    est_params = params[name]
    rscv = RandomizedSearchCV(est,est_params, cv=cv ,n_jobs = -1)
    rscv.fit(X,y)
    scores.append({
        'Name': name,
        'Best params': rscv.best_params_,
        'Best score': rscv.best_score_,
        'Model': rscv.best_estimator_
    })



In [None]:
results = pd.DataFrame(scores, columns = ['Name', 'Best params', 'Best score','Model'])
results = results.sort_values('Best score' ,ascending = False, ignore_index = True)
best_model = results.loc[0,'Model']

In [None]:
def predict_function(location,bedrooms, sqft, bath, balcony):
    features = df[df[location]==1].iloc[0,:]
    features['bedrooms','total_sqft','bath', 'balcony'] = [bedrooms, sqft, bath, balcony]
    features = features.drop(['price'])
    return best_model.predict([features])

In [None]:
predict_function('Vijayanagar', 2 , 1000, 2, 1)