In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw housing data from a CSV in the working directory and preview the first rows.
df=pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [4]:
# Show how often each distinct value occurs, column by column,
# with a 25-dash rule printed after every table.
for col_name in df.columns:
    counts=df[col_name].value_counts()
    print(counts)
    print(25*"-")

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
-------------------------
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
-------------------------
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
-------------------------
2 BHK         519

In [5]:
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [6]:
# Discard the columns that are not carried forward into the cleaned dataset.
df=df.drop(columns=['area_type','availability','society','balcony'])

In [7]:
# Impute missing locations with the most frequent location in the data.
# A hand-typed literal is fragile here: the raw values contain irregular
# whitespace (e.g. 'Sarjapur  Road' with two spaces), so a constant that
# is off by one space silently creates a new one-row category.
df['location']=df['location'].fillna(df['location'].mode()[0])

In [8]:
# Fill missing size entries with '2 BHK' so a bedroom count can be parsed from every row.
df['size']=df['size'].fillna('2 BHK')

In [9]:
# Replace missing bathroom counts with the column median.
df['bath']=df['bath'].fillna(df['bath'].median())

In [10]:
# Derive an integer bedroom count from the leading token of `size`
# (e.g. '2 BHK' -> 2, '4 Bedroom' -> 4).
df['bhk']=df['size'].str.split().str.get(0).astype(int)

In [11]:
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [12]:
def convertrange(x):
    """Convert a total_sqft entry to a float.

    Accepts either a plain number ('1056') or a range written as
    'low - high' ('1133 - 1384'); a range is replaced by its midpoint.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float or None
        The parsed area, or None when the value cannot be interpreted
        (e.g. '34.46Sq. Meter', 'abc-def', None).
    """
    if isinstance(x,str):
        parts=x.split('-')
        if len(parts)==2:
            try:
                return (float(parts[0])+float(parts[1]))/2
            except ValueError:
                # Not a numeric range (e.g. 'abc-def' or a signed number
                # such as '-5'); fall through to the plain-number parse.
                pass
    try:
        return float(x)
    except (TypeError,ValueError):
        # Only parse failures are treated as "unknown"; any other error
        # should surface instead of being silently swallowed.
        return None

In [13]:
# Turn the total_sqft strings into numbers; entries convertrange cannot parse come back as None.
df['total_sqft']=df['total_sqft'].apply(convertrange)

In [14]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [15]:
# Price per square foot. The 100000 factor presumably converts `price`
# from lakhs to rupees — TODO confirm the unit of the price column.
df['price_per_sqft']=df['price']*100000/df['total_sqft']

In [16]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [17]:
# Trim leading/trailing whitespace so the same place is not counted under two spellings.
df['location']=df['location'].str.strip()

In [18]:
# Listings per location, and the subset of those counts that are below 10
# (a Series indexed by location name).
location_count=df['location'].value_counts()
location_less_10=location_count[location_count<10]

In [19]:
# Collapse every location with fewer than 10 listings into a single 'other' bucket.
# location_less_10 is indexed by location name, so membership is tested against its index.
is_rare=df['location'].isin(location_less_10.index)
df['location']=df['location'].where(~is_rare,'other')

In [20]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [21]:
# Keep only rows with at least 300 sqft per bedroom. Rows whose total_sqft
# is missing fail the comparison and are dropped as well.
df=df[(df['total_sqft']/df['bhk'])>=300]

In [22]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [23]:
def remove_sqft_outliers(dataframe):
    """Drop rows whose price_per_sqft is far from their location's average.

    For every ``location`` group, only rows with ``price_per_sqft`` in the
    half-open interval ``(mean - std, mean + std]`` are kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        0..n-1 index. An input with no groups yields an empty frame that
        still carries the input's columns.
    """
    kept_groups=[]
    for _,subdf in dataframe.groupby('location'):
        pps=subdf['price_per_sqft']
        m=pps.mean()
        # Population standard deviation (ddof=0), not pandas' sample default.
        st=pps.std(ddof=0)
        kept_groups.append(subdf[(pps>(m-st)) & (pps<=(m+st))])
    if not kept_groups:
        # pd.concat rejects an empty list; return an empty frame with the same columns.
        return dataframe.iloc[0:0].reset_index(drop=True)
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(kept_groups,ignore_index=True)
df=remove_sqft_outliers(df)

In [24]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [25]:
def bhk_outlier(dataframe):
    """Remove listings priced below the typical smaller home in the same location.

    Within each ``location``, a row with ``bhk`` bedrooms is dropped when
    its ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    listings with ``bhk - 1`` bedrooms in that location, provided that
    smaller group has more than 5 rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` without the excluded rows; the remaining
        rows keep their original index labels.
    """
    # Collect labels in a plain list so they keep their original type
    # (appending to a float ndarray would coerce integer labels to float).
    drop_labels=[]
    for _,location_df in dataframe.groupby('location'):
        groups={bhk:bhk_df for bhk,bhk_df in location_df.groupby('bhk')}
        bhk_stats={
            bhk:{'mean':bhk_df['price_per_sqft'].mean(),'count':bhk_df.shape[0]}
            for bhk,bhk_df in groups.items()
        }
        for bhk,bhk_df in groups.items():
            smaller=bhk_stats.get(bhk-1)
            # Only trust the smaller group's mean when it has enough rows.
            if smaller is not None and smaller['count']>5:
                too_cheap=bhk_df[bhk_df['price_per_sqft']<smaller['mean']]
                drop_labels.extend(too_cheap.index)
    return dataframe.drop(index=drop_labels)

In [26]:
df=bhk_outlier(df)
df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.543860
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668
...,...,...,...,...,...,...,...
10273,other,2 BHK,1200.0,2.0,70.0,2,5833.333333
10274,other,1 BHK,1800.0,1.0,200.0,1,11111.111111
10277,other,2 BHK,1353.0,2.0,110.0,2,8130.081301
10278,other,1 Bedroom,812.0,1.0,26.0,1,3201.970443


In [29]:
# `size` is superseded by `bhk`, and `price_per_sqft` is derived from the target,
# so neither is kept as a model feature.
df=df.drop(columns=['size','price_per_sqft'])

In [30]:
# Persist the cleaned data. index=False keeps the (gappy, meaningless) row
# labels out of the file; otherwise they are written as an unnamed first
# column that reappears as 'Unnamed: 0' when the CSV is read back.
df.to_csv('cleaned_data.csv',index=False)

In [31]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [32]:
# Separate the target (price) from the predictor columns.
y=df['price']
X=df.drop(columns='price')

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [34]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [35]:
# One-hot encode `location` and pass the remaining columns through unchanged.
# - handle_unknown='ignore': a location absent from the training split is
#   encoded as all zeros at predict time instead of raising an error.
# - sparse_threshold=0: always return a dense array, which the StandardScaler
#   used downstream needs. This avoids OneHotEncoder's `sparse` argument,
#   which was deprecated in scikit-learn 1.2 and removed in 1.4.
column_trans=make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),['location']),
    remainder='passthrough',
    sparse_threshold=0)

In [36]:
scaler=StandardScaler()

In [37]:
lr=LinearRegression()

In [38]:
# Pipeline: one-hot encode location -> standardise features -> linear regression.
pipe=make_pipeline(column_trans,scaler,lr)

In [39]:
pipe.fit(X_train,y_train)



In [43]:
y_pred_lr=pipe.predict(X_test)

In [44]:
r2_score(y_test,y_pred_lr)

0.8322289140495145

In [45]:
lasso=Lasso()

In [46]:
# Same preprocessing steps, with Lasso as the final estimator.
# NOTE(review): column_trans and scaler are the same objects used by `pipe`;
# presumably harmless because every pipeline is fitted on X_train — confirm.
pipe_lasso=make_pipeline(column_trans,scaler,lasso)

In [47]:
pipe_lasso.fit(X_train,y_train)



In [48]:
y_pred_lasso=pipe_lasso.predict(X_test)

In [49]:
r2_score(y_test,y_pred_lasso)

0.8213732967110257

In [50]:
ridge=Ridge()

In [51]:
# Same preprocessing steps, with Ridge as the final estimator.
pipe_ridge=make_pipeline(column_trans,scaler,ridge)

In [52]:
pipe_ridge.fit(X_train,y_train)



In [53]:
y_pred_ridge=pipe_ridge.predict(X_test)

In [54]:
r2_score(y_test,y_pred_ridge)

0.8322656114203378

In [55]:
from sklearn.ensemble import RandomForestRegressor

In [65]:
random_forest=RandomForestRegressor(n_estimators=100,random_state=42,min_samples_split=2)

In [66]:
# Same preprocessing steps, with the random forest regressor as the final estimator.
pipe_rf=make_pipeline(column_trans,scaler,random_forest)

In [67]:
pipe_rf.fit(X_train,y_train)



In [68]:
y_pred_rf=pipe_rf.predict(X_test)

In [69]:
r2_score(y_test,y_pred_rf)

0.7374844285758317

In [72]:
from xgboost import XGBRegressor

In [73]:
xgb=XGBRegressor(n_estimators=100,random_state=42)

In [74]:
# Same preprocessing steps, with the XGBoost regressor as the final estimator.
pipe_xg=make_pipeline(column_trans,scaler,xgb)

In [75]:
pipe_xg.fit(X_train,y_train)



In [76]:
y_pred_xg=pipe_xg.predict(X_test)

In [77]:
r2_score(y_test,y_pred_xg)

0.749520191561956