In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df = df.drop(columns=["area_type","society","balcony","availability"])
df = df.dropna()
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [4]:
df["bhk"] = df["size"].apply(lambda x: float(x.split()[0]))
df = df.drop(columns="size")
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056,2.0,39.07,2.0
1,Chikka Tirupathi,2600,5.0,120.0,4.0
2,Uttarahalli,1440,2.0,62.0,3.0
3,Lingadheeranahalli,1521,3.0,95.0,3.0
4,Kothanur,1200,2.0,51.0,2.0


In [5]:
df.shape

(13246, 5)

In [6]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [7]:
df[~((df.total_sqft).apply(is_float))]

Unnamed: 0,location,total_sqft,bath,price,bhk
30,Yelahanka,2100 - 2850,4.0,186.000,4.0
122,Hebbal,3067 - 8156,4.0,477.000,4.0
137,8th Phase JP Nagar,1042 - 1105,2.0,54.005,2.0
165,Sarjapur,1145 - 1340,2.0,43.490,2.0
188,KR Puram,1015 - 1540,2.0,56.800,2.0
...,...,...,...,...,...
12975,Whitefield,850 - 1060,2.0,38.190,2.0
12990,Talaghattapura,1804 - 2273,3.0,122.000,3.0
13059,Harlur,1200 - 1470,2.0,72.760,2.0
13265,Hoodi,1133 - 1384,2.0,59.135,2.0


In [8]:
def convert_float(x):
    y = x.split("-")
    if len(y)==2:
        return (float(y[0])+float(y[1]))/2
    try:
        return float(x)
    except:
        return None

In [9]:
df["total_sqft"] = df["total_sqft"].apply(convert_float)
df.shape

(13246, 5)

In [10]:
df[~df["total_sqft"].apply(is_float)]

Unnamed: 0,location,total_sqft,bath,price,bhk


In [11]:
df = df[~((df.total_sqft/(df.bhk+2))<250)] 
df.shape

(10370, 5)

In [12]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Electronic City Phase II,1056.0,2.0,39.07,2.0
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0
2,Uttarahalli,1440.0,2.0,62.0,3.0
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0
4,Kothanur,1200.0,2.0,51.0,2.0


In [13]:
location_stats = df.groupby("location")["location"].agg("count").sort_values(ascending=False)
location_stats

location
Whitefield              496
Sarjapur  Road          329
Electronic City         216
Thanisandra             209
Kanakpura Road          192
                       ... 
Jp nagar 8th Phase .      1
Jogupalya                 1
Jaymahal Road             1
Jayanti Nagar             1
white field,kadugodi      1
Name: location, Length: 1080, dtype: int64

In [14]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
Tindlu                  10
Cox Town                10
Dodsworth Layout        10
Doddaballapur           10
Dairy Circle            10
                        ..
Jp nagar 8th Phase .     1
Jogupalya                1
Jaymahal Road            1
Jayanti Nagar            1
white field,kadugodi     1
Name: location, Length: 892, dtype: int64

In [15]:
df["location"] = df["location"].apply(lambda x: "other" if x in location_stats_less_than_10 else x)
df.location.nunique()

189

In [16]:
df.shape

(10370, 5)

In [17]:
df["price_per_sqft"] = df["price"]*100000/df["total_sqft"]
df.head(10)

Unnamed: 0,location,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,1056.0,2.0,39.07,2.0,3699.810606
1,Chikka Tirupathi,2600.0,5.0,120.0,4.0,4615.384615
2,Uttarahalli,1440.0,2.0,62.0,3.0,4305.555556
3,Lingadheeranahalli,1521.0,3.0,95.0,3.0,6245.890861
4,Kothanur,1200.0,2.0,51.0,2.0,4250.0
5,Whitefield,1170.0,2.0,38.0,2.0,3247.863248
6,Old Airport Road,2732.0,4.0,204.0,4.0,7467.057101
7,Rajaji Nagar,3300.0,4.0,600.0,4.0,18181.818182
8,Marathahalli,1310.0,3.0,63.25,3.0,4828.244275
10,Whitefield,1800.0,2.0,70.0,3.0,3888.888889


In [18]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby("location"):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))] 
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out

In [19]:
df = remove_pps_outliers(df)
df.shape

(8404, 6)

In [20]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby("location"):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby("bhk"):
            bhk_stats[bhk] = {
                "mean" : np.mean(bhk_df.price_per_sqft),
                "sdt" : np.std(bhk_df.price_per_sqft),
                "count" : bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby("bhk"):
            stats = bhk_stats.get(bhk-1)
            if stats and stats["count"]>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats["mean"])].index.values)
    return df.drop(exclude_indices,axis="index")

In [21]:
#df = remove_bhk_outliers(df)
df.shape

(8404, 6)

In [22]:
df = df[df.bath<df.bhk+1]
df.shape

(7874, 6)

In [23]:
df1 = df.copy()

In [24]:
dummy = pd.get_dummies(df["location"])
dummy = dummy.drop(columns="other")
dummy.head()

Unnamed: 0,Devarachikkanahalli,1st Phase JP Nagar,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,AECS Layout,Abbigere,Akshaya Nagar,...,Varthur,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df = df.drop(columns = ["price_per_sqft","location"])
df.head()

Unnamed: 0,total_sqft,bath,price,bhk
0,1250.0,2.0,44.0,3.0
1,1170.0,2.0,40.0,2.0
2,1425.0,2.0,65.0,3.0
3,1417.0,2.0,76.0,3.0
4,1230.0,2.0,58.0,2.0


In [26]:
df = pd.concat([df,dummy], axis=1)
df.head()

Unnamed: 0,total_sqft,bath,price,bhk,Devarachikkanahalli,1st Phase JP Nagar,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Varthur,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1250.0,2.0,44.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1170.0,2.0,40.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1425.0,2.0,65.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1417.0,2.0,76.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1230.0,2.0,58.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df.shape

(7874, 192)

In [28]:
x = df.drop(columns="price")
y = df["price"]

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=1)

In [30]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(x_train,y_train)
linear.score(x_test,y_test)

0.799775173919405

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=5,test_size=0.2)
cross_val_score(LinearRegression(),x,y, cv=cv).mean()

0.830011577840105

In [32]:
from sklearn.linear_model import Lasso
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=1)
cross_val_score(Lasso(),x,y).mean()

0.663088918933359

In [33]:
from sklearn.linear_model import Ridge
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=1)
cross_val_score(Ridge(),x,y).mean()

0.6687479537085118