## Importing libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing dataset

In [15]:
df = pd.read_csv("data/PuneHouseData.csv")
df.head(10)

Unnamed: 0,area_type,availability,size,society,total_sqft,bath,balcony,price,site_location
0,Super built-up Area,19-Dec,2 BHK,Coomee,1056,2.0,1.0,39.07,Alandi Road
1,Plot Area,Ready To Move,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,Ambegaon Budruk
2,Built-up Area,Ready To Move,3 BHK,,1440,2.0,3.0,62.0,Anandnagar
3,Super built-up Area,Ready To Move,3 BHK,Soiewre,1521,3.0,1.0,95.0,Aundh
4,Super built-up Area,Ready To Move,2 BHK,,1200,2.0,1.0,51.0,Aundh Road
5,Super built-up Area,Ready To Move,2 BHK,DuenaTa,1170,2.0,1.0,38.0,Balaji Nagar
6,Super built-up Area,18-May,4 BHK,Jaades,2732,4.0,,204.0,Baner
7,Super built-up Area,Ready To Move,4 BHK,Brway G,3300,4.0,,600.0,Baner road
8,Super built-up Area,Ready To Move,3 BHK,,1310,3.0,1.0,63.25,Bhandarkar Road
9,Plot Area,Ready To Move,6 Bedroom,,1020,6.0,,370.0,Bhavani Peth


## EDA

In [16]:
df.shape

(13320, 9)

In [17]:
df.columns

Index(['area_type', 'availability', 'size', 'society', 'total_sqft', 'bath',
       'balcony', 'price', 'site_location'],
      dtype='object')

In [18]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [19]:
df.dtypes

area_type         object
availability      object
size              object
society           object
total_sqft        object
bath             float64
balcony          float64
price            float64
site_location     object
dtype: object

In [20]:
df.society.value_counts()

GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Amionce     1
JaghtDe     1
Jauraht     1
Brity U     1
RSntsAp     1
Name: society, Length: 2688, dtype: int64

In [21]:
df.site_location.value_counts()

Alandi Road            139
Kalyani Nagar          139
Mahatma Gandhi Road    139
Lulla Nagar            139
Laxmi Road             139
                      ... 
Sadashiv Peth          138
Raviwar Peth           138
Rasta Peth             138
Ghorpadi               138
other                    1
Name: site_location, Length: 97, dtype: int64

In [22]:
df.isnull().sum()

area_type           0
availability        0
size               16
society          5502
total_sqft          0
bath               73
balcony           609
price               0
site_location       1
dtype: int64

## Data preprocessing and handling missing data

In [26]:
cdf = df.drop(["area_type","availability","society"], axis=1)

In [27]:
cdf.isnull().sum()

size              16
total_sqft         0
bath              73
balcony          609
price              0
site_location      1
dtype: int64

In [28]:
cdf.head(10)

Unnamed: 0,size,total_sqft,bath,balcony,price,site_location
0,2 BHK,1056,2.0,1.0,39.07,Alandi Road
1,4 Bedroom,2600,5.0,3.0,120.0,Ambegaon Budruk
2,3 BHK,1440,2.0,3.0,62.0,Anandnagar
3,3 BHK,1521,3.0,1.0,95.0,Aundh
4,2 BHK,1200,2.0,1.0,51.0,Aundh Road
5,2 BHK,1170,2.0,1.0,38.0,Balaji Nagar
6,4 BHK,2732,4.0,,204.0,Baner
7,4 BHK,3300,4.0,,600.0,Baner road
8,3 BHK,1310,3.0,1.0,63.25,Bhandarkar Road
9,6 Bedroom,1020,6.0,,370.0,Bhavani Peth


In [29]:
# Filling the null values of bath with the median value of the entire dataset
import math
median_bath = math.floor(cdf.bath.median())
cdf.bath = cdf.bath.fillna(median_bath)

In [30]:
# Assuming that null value of balcony means 0 balcony in the house
cdf.balcony = cdf.balcony.fillna('0')

In [40]:
df2 = cdf.dropna()

In [41]:
df2['Bedrooms'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Bedrooms'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))


In [42]:
df2.head(10)

Unnamed: 0,size,total_sqft,bath,balcony,price,site_location,Bedrooms
0,2 BHK,1056,2.0,1.0,39.07,Alandi Road,2
1,4 Bedroom,2600,5.0,3.0,120.0,Ambegaon Budruk,4
2,3 BHK,1440,2.0,3.0,62.0,Anandnagar,3
3,3 BHK,1521,3.0,1.0,95.0,Aundh,3
4,2 BHK,1200,2.0,1.0,51.0,Aundh Road,2
5,2 BHK,1170,2.0,1.0,38.0,Balaji Nagar,2
6,4 BHK,2732,4.0,0.0,204.0,Baner,4
7,4 BHK,3300,4.0,0.0,600.0,Baner road,4
8,3 BHK,1310,3.0,1.0,63.25,Bhandarkar Road,3
9,6 Bedroom,1020,6.0,0.0,370.0,Bhavani Peth,6


In [43]:
df2.shape

(13303, 7)

In [44]:
df3 = df2.drop('size', axis=1)
df3.head()

Unnamed: 0,total_sqft,bath,balcony,price,site_location,Bedrooms
0,1056,2.0,1.0,39.07,Alandi Road,2
1,2600,5.0,3.0,120.0,Ambegaon Budruk,4
2,1440,2.0,3.0,62.0,Anandnagar,3
3,1521,3.0,1.0,95.0,Aundh,3
4,1200,2.0,1.0,51.0,Aundh Road,2


## Visualization

In [None]:
plt.scatter(cdf.total_sqft, cdf.price, color='blue')
plt.xlabel("Total land area(in sqft)")
plt.ylabel("Price")

In [None]:
plt.scatter(cdf.bath, cdf.price, color='blue')
plt.xlabel("No. of bathrooms")
plt.ylabel("Price")

## Encoding categorical data

## Splitting dataset into training set and testing set

## Feature scaling