In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
data = pd.read_csv('Bengaluru_House_Data.csv')

In [52]:
data.sample(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
6207,Built-up Area,Ready To Move,Uttarahalli,2 BHK,Sencys,1160,2.0,2.0,45.0
1459,Plot Area,Ready To Move,Hebbal Kempapura,5 Bedroom,,2280,5.0,,200.0
6994,Super built-up Area,Ready To Move,Virupakshapura,2 BHK,,1075,2.0,1.0,55.0
6293,Super built-up Area,Ready To Move,Green Glen Layout,3 BHK,Solia D,1752,3.0,2.0,105.0
11742,Super built-up Area,Ready To Move,Electronic City,3 BHK,Elodsre,1521,2.0,2.0,57.5


In [53]:
data.shape

(13320, 9)

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [55]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [56]:
# Print the value distribution of every column, with a row of asterisks
# after each one so the sections are easy to tell apart.
separator = '*' * 50
for col_name in data.columns:
    counts = data[col_name].value_counts()
    print(counts)
    print(separator)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
**************************************************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
**************************************************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype:

Dropping the single row whose location is null

In [57]:
# Keep only the rows that have a location.
data = data.dropna(subset=['location'])

In [58]:
data.shape

(13319, 9)

Filling the rows having bedrooms missing with 2BHK as it is the most frequent data

In [59]:
# Impute missing sizes with the most frequent value of the column.
most_common_size = data['size'].mode().iloc[0]
data['size'] = data['size'].fillna(most_common_size)

In [60]:
data['size'].isnull().sum()

0

Filling the missing values in bathroom

In [61]:
# Impute missing bathroom counts with the most frequent value of the column.
most_common_bath = data['bath'].mode().iloc[0]
data['bath'] = data['bath'].fillna(most_common_bath)

In [62]:
data['bath'].isnull().sum()

0

Filling the missing values in balcony

In [63]:
# Impute missing balcony counts with the most frequent value of the column.
most_common_balcony = data['balcony'].mode().iloc[0]
data['balcony'] = data['balcony'].fillna(most_common_balcony)

In [64]:
data['balcony'].isnull().sum()

0

Removing the BHK and Bedroom words

In [65]:
# The leading token of `size` (e.g. "2" in "2 BHK" or "4" in "4 Bedroom")
# is the number of bedrooms; store it as an integer column.
data['bhk'] = data['size'].str.split(' ').str[0].astype('int64')

In [68]:
data['bhk'].value_counts()

2     5544
3     4856
4     1417
1      656
5      356
6      221
7      100
8       89
9       54
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: bhk, dtype: int64

Checking for the outliers in the data

In [69]:
data[data['bhk']>20]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
1718,Super built-up Area,Ready To Move,2Electronic City Phase II,27 BHK,,8000,27.0,0.0,230.0,27
4684,Plot Area,Ready To Move,Munnekollal,43 Bedroom,,2400,40.0,0.0,660.0,43


Removing the ranges from total_sqft

In [75]:
# Convert a raw total_sqft entry to a number; a range of values such as
# "1133 - 1384" is replaced by the mean of its two bounds.
def convert_sqft_to_num(x):
    """Parse a ``total_sqft`` entry into a float.

    Parameters
    ----------
    x : str or number
        Raw cell value: a plain number ("1200"), a two-sided range
        ("1133 - 1384"), or an already numeric value.

    Returns
    -------
    float or None
        The number itself, the midpoint of a two-sided range, or ``None``
        when the value cannot be parsed (including malformed ranges and
        ``None`` input).
    """
    try:
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                # float() tolerates the whitespace around each bound.
                return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (TypeError, ValueError):
        # Only parsing failures are treated as "no value"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)                                                              

In [76]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200.0,2.0,1.0,51.0,2


Getting price per sq ft

In [77]:
# Price per square foot: `price` is scaled by 100000 before dividing by the
# area (presumably because price is quoted in lakhs -- confirm against the
# dataset description).
data['price_per_sqft'] = data['price'].mul(100000).div(data['total_sqft'].astype(float))

In [82]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200.0,2.0,1.0,51.0,2,4250.0


In [83]:
# Trim surrounding whitespace from the location names, then count how many
# listings each location has.
data['location'] = data['location'].str.strip()
location_count = data['location'].value_counts()

In [84]:
# Locations with at most 10 listings (the threshold is inclusive, despite
# what the variable name suggests).
location_count_less_than_10 = location_count.loc[location_count.le(10)]

In [85]:
# Replace every rarely occurring location (those indexed in
# location_count_less_than_10) with the single label 'other'.
data['location'] = data['location'].mask(
    data['location'].isin(location_count_less_than_10.index),
    'other',
)

## Outlier Detection & Removal


Removing the houses that have less than 300 sqft per bedroom

In [86]:
# Keep only listings with at least 300 sqft per bedroom; rows whose ratio is
# NaN fail the comparison and are dropped as well.
data = data.loc[data['total_sqft'].div(data['bhk']).ge(300)]

In [87]:
data.shape

(12527, 11)

In [88]:
def remove_pps_outliers(df):
    """Drop rows whose price per sqft is far from their location's mean.

    For every ``location`` group only the rows with ``price_per_sqft`` in the
    half-open interval ``(mean - std, mean + std]`` are kept, where ``mean``
    and ``std`` are computed on that group and ``std`` is the population
    standard deviation (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The kept rows, concatenated group by group in the order ``groupby``
        yields them, with a fresh ``0..n-1`` index.  If there is no group at
        all, an empty frame with the same columns as ``df`` is returned.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = pps.mean()
        st = pps.std(ddof=0)  # population std, i.e. what np.std computes
        kept_groups.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_groups:
        # pd.concat refuses an empty list; keep the schema for the caller.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop, which
    # copies all previously collected rows on every iteration.
    return pd.concat(kept_groups, ignore_index=True)

data = remove_pps_outliers(data)

In [89]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,price_per_sqft
count,10323.0,10323.0,10323.0,10323.0,10323.0,10323.0
mean,1505.978945,2.473021,1.606413,91.341547,2.574445,5670.071022
std,877.843465,0.981481,0.786429,86.34595,0.89756,2265.917
min,300.0,1.0,0.0,10.0,1.0,1250.0
25%,1107.83,2.0,1.0,49.0,2.0,4250.0
50%,1285.0,2.0,2.0,67.0,2.0,5188.679245
75%,1650.0,3.0,2.0,100.0,3.0,6445.733088
max,30400.0,16.0,3.0,2200.0,16.0,24509.803922


In [90]:
def bhk_outliers(df):
    """Drop listings priced below the average smaller home in the same location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the rows
    with ``bhk == n - 1`` in that location, provided that smaller group has
    more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and
        ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the flagged rows; the remaining rows keep
        their original index labels.
    """
    exclude_labels = []  # index labels, kept in their original type
    for _, location_df in df.groupby('location'):
        # Materialise the per-bhk groups once; they are needed both for the
        # statistics and for the comparison pass.
        bhk_frames = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': bhk_df['price_per_sqft'].mean(),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_frames
        }
        for bhk, bhk_df in bhk_frames:
            stats = bhk_stats.get(bhk - 1)
            if stats is None or stats['count'] <= 5:
                # No (or too small a) reference group of smaller homes.
                continue
            too_cheap = bhk_df[bhk_df['price_per_sqft'] < stats['mean']]
            exclude_labels.extend(too_cheap.index)
    return df.drop(exclude_labels, axis='index')

In [91]:
data = bhk_outliers(data)

In [95]:
# Drop the columns that are not used as model inputs.
data = data.drop(
    columns=['area_type', 'size', 'price_per_sqft', 'availability', 'society', 'balcony']
)

In [136]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7357 entries, 0 to 10322
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    7357 non-null   object 
 1   total_sqft  7357 non-null   float64
 2   bath        7357 non-null   float64
 3   price       7357 non-null   float64
 4   bhk         7357 non-null   int64  
dtypes: float64(3), int64(1), object(1)
memory usage: 344.9+ KB


In [97]:
data.to_csv('cleaned_data.csv', index=False)

## Model

In [99]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [100]:
# Hold out 20% of the rows for evaluation; `price` is the target and every
# other column is a feature. The fixed random_state keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='price'),
    data['price'],
    test_size=0.2,
    random_state=42,
)

In [101]:
print(X_train.shape)
print(X_test.shape)

(5885, 4)
(1472, 4)


### Applying Linear Regression

In [107]:
# One-hot encode the categorical `location` column and pass the remaining
# (numeric) columns through unchanged.
#  * handle_unknown='ignore' encodes a location that was not seen during fit
#    as all zeros instead of raising an error at predict time.
#  * sparse_threshold=0 makes the column transformer always return a dense
#    array. This avoids the encoder keyword `sparse`, which was renamed to
#    `sparse_output` in scikit-learn 1.2 and later removed, so the cell runs
#    on both old and new releases.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['location']),
    remainder='passthrough',
    sparse_threshold=0,
)

In [108]:
scaler = StandardScaler()

In [109]:
lr = LinearRegression()

In [110]:
pipe = make_pipeline(column_trans, scaler, lr)

In [111]:
pipe.fit(X_train, y_train)



In [122]:
y_pred_lr = pipe.predict(X_test)

In [123]:
r2_score(y_test, y_pred_lr)

0.7988331436985064

### Applying Lasso

In [114]:
lasso = Lasso()

In [115]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [116]:
pipe.fit(X_train, y_train)



In [124]:
y_predict_lasso = pipe.predict(X_test)

In [125]:
r2_score(y_test, y_predict_lasso)

0.7988331436985064

### Applying Ridge

In [126]:
ridge = Ridge()

In [127]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [128]:
pipe.fit(X_train, y_train)



In [131]:
y_predict_ridge = pipe.predict(X_test)

In [132]:
r2_score(y_test, y_predict_ridge)

0.8078924649087355

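Thus, of the scores recorded above, the Ridge regressor has the best R² (≈ 0.808). Caveat: the Linear Regression and Lasso scores shown above are identical to the last digit, and the execution counts show that the "Linear Regression" prediction cells (In [122]–[123]) were run after `pipe` had already been refitted as the Lasso pipeline (In [115]–[116]). The value reported for Linear Regression is therefore almost certainly the Lasso score. Re-run the notebook from top to bottom (Restart Kernel → Run All) to obtain the true Linear Regression score before comparing the three models.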

## Pipelining

In [133]:
import pickle

In [134]:
# Persist whatever pipeline `pipe` currently refers to. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)