In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('../input/bengaluru-house-price-data/Bengaluru_House_Data.csv')

In [3]:
data.head(3)

In [4]:
data.shape

In [5]:
data.info()

In [6]:
# Dump the frequency table of every column, followed by a row of asterisks as a divider.
divider = "*"*50
for col_name in data.columns:
    frequencies = data[col_name].value_counts()
    print(frequencies)
    print(divider)

In [7]:
data.isnull().sum()

In [8]:
# Discard the columns that the rest of the notebook does not use.
unused_columns = ['area_type', 'availability', 'society', 'balcony']
data = data.drop(columns=unused_columns)

In [9]:
data.describe()

In [10]:
data.info()

In [11]:
data['location'].value_counts()

In [12]:
# Replace missing locations with one fixed place name.
# NOTE(review): the literal contains two consecutive spaces; confirm it matches the
# dataset's spelling exactly, because the later strip() on this column only trims the ends.
data['location']=data['location'].fillna('Sarjapur  Road')

In [13]:
data['size'].value_counts()

In [14]:
data['size']=data['size'].fillna('2 BHK')

In [15]:
data['bath']=data['bath'].fillna(data['bath'].median())

In [16]:
data.isnull().sum()

In [17]:
data['bhk']=data['size'].str.split().str.get(0).astype(int)

In [18]:
data[data.bhk>20]

In [19]:
data['total_sqft'].unique()

In [20]:
def convertrange(x):
    """Convert a `total_sqft` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``'1200'``), or a range written as
        ``'low - high'`` (``'1000 - 1200'``), in which case the midpoint of
        the two bounds is returned.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when the entry cannot be interpreted as
        a number or a numeric range (for example when it carries a unit
        suffix, or is ``None``).
    """
    if isinstance(x, str):
        parts = x.split('-')
        if len(parts) == 2:
            try:
                return (float(parts[0]) + float(parts[1])) / 2
            except ValueError:
                # Not a clean "low - high" pair (e.g. '-5', '1e-3', '10 - abc');
                # fall through and try the entry as a single number instead.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        # TypeError covers non-numeric objects such as None; ValueError covers bad text.
        return None

In [21]:
data['total_sqft']=data['total_sqft'].apply(convertrange)

In [22]:
data.head()

In [23]:
# Price per unit of area: `price` is multiplied by 100000 and divided by `total_sqft`.
# NOTE(review): presumably `price` is quoted in lakhs (1 lakh = 100000) — confirm against the dataset.
# Rows whose total_sqft could not be parsed (None from convertrange) end up as NaN here.
data['price_per_sqrt']=data['price']*100000/data['total_sqft']

In [24]:
data['price_per_sqrt']

In [25]:
data.describe()

In [26]:
data['location'].value_counts()

In [27]:
data['location']=data['location'].apply(lambda x:x.strip())
location_count = data['location'].value_counts()

In [28]:
location_count_less_10=location_count[location_count<=10]

In [29]:
location_count_less_10

In [30]:
# Collapse every location listed in location_count_less_10 into a single 'other' bucket.
# The location names are the index of that value_counts() result.
rare_locations = location_count_less_10.index
data['location']=data['location'].where(~data['location'].isin(rare_locations), 'other')

In [31]:
data['location'].value_counts()

## Outlier detection

In [32]:
data.describe()

In [33]:
(data['total_sqft']/data['bhk']).describe()

In [34]:
# Keep only listings that offer at least 300 units of total_sqft per bhk.
# Rows where total_sqft is NaN (entries convertrange could not parse) fail the
# comparison as well, so they are removed by this filter too.
data=data[((data['total_sqft']/data['bhk'])>=300)]

In [35]:
data.describe()

In [36]:
data.shape

In [37]:
def remove_outliers_sqft(df):
    """Drop per-location outliers of `price_per_sqrt`.

    For every `location` group, keep only the rows whose `price_per_sqrt`
    lies within one population standard deviation of the group's mean
    (both bounds inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `location` and `price_per_sqrt`.

    Returns
    -------
    pandas.DataFrame
        The retained rows, ordered by location, with a fresh 0..n-1 index.
        An empty input yields an empty frame with the same columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        prices = subdf['price_per_sqrt']
        m = prices.mean()
        st = prices.std(ddof=0)  # population std, the same value np.std() returns
        # Both bounds are inclusive: with a strict lower bound a group whose prices are
        # all identical (std == 0, e.g. a single listing) would be discarded entirely.
        kept_groups.append(subdf[(prices >= (m - st)) & (prices <= (m + st))])
    if not kept_groups:
        # pd.concat() refuses an empty list; hand back an empty frame that keeps the schema.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
data=remove_outliers_sqft(data)
data.describe()

In [38]:
def bhk_outlier_remove(df):
    """Drop listings priced below the average of the next-smaller bhk in the same location.

    Within each `location`, a row with `bhk == k` is removed when its
    `price_per_sqrt` is lower than the mean `price_per_sqrt` of the rows with
    `bhk == k - 1`, provided that smaller group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `location`, `bhk` and `price_per_sqrt`.

    Returns
    -------
    pandas.DataFrame
        `df` without the excluded rows; the original index labels are kept.
    """
    # Labels are gathered in a plain list so they keep their original type; accumulating
    # them in a float ndarray would coerce integer labels to float.
    labels_to_drop = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {'mean': bhk_df['price_per_sqrt'].mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }

        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Only trust the smaller-bhk average when it is backed by more than 5 listings.
            if stats and stats['count'] > 5:
                cheaper = (bhk_df['price_per_sqrt'] < stats['mean']).to_numpy()
                labels_to_drop.extend(bhk_df.index[cheaper].tolist())
    return df.drop(index=labels_to_drop)

In [39]:
data=bhk_outlier_remove(data)

In [40]:
data

In [41]:
data.shape

In [42]:
# The raw `size` text and the derived price-per-area helper are not used as model inputs.
helper_columns = ['size', 'price_per_sqrt']
data = data.drop(columns=helper_columns)

In [43]:
data.head()

In [44]:
# Persist the cleaned frame. index=False keeps the row labels (which may have gaps after
# the outlier filtering) out of the file, so reloading it does not produce an extra
# unnamed index column.
data.to_csv('Cleaned_data.csv', index=False)

In [45]:
X = data.drop(columns=['price'])

In [46]:
y = data['price']

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [49]:
X_train.shape

In [50]:
X_test.shape

## Building Model using Linear Regression

In [51]:
# One-hot encode `location` and pass the remaining columns through unchanged.
# - handle_unknown='ignore': a location that occurs only in the test split is encoded as
#   all zeros instead of making predict() raise.
# - sparse_threshold=0 makes the transformer always return a dense array (needed by the
#   mean-centring StandardScaler step) without relying on OneHotEncoder's `sparse`
#   argument, which was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['location']), remainder='passthrough', sparse_threshold=0)

In [52]:
scaler = StandardScaler()

In [53]:
# Ordinary least-squares regression. The `normalize` argument was deprecated in
# scikit-learn 1.0 and removed in 1.2; feature scaling is already done by the
# StandardScaler step that precedes this estimator in the pipeline.
lr = LinearRegression()

In [54]:
pipe = make_pipeline(column_trans, scaler, lr)

In [55]:
pipe.fit(X_train, y_train)

In [56]:
y_pred_lr = pipe.predict(X_test)

In [57]:
r2_score(y_test, y_pred_lr)

## Applying Lasso

In [58]:
lasso = Lasso()

In [59]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [60]:
pipe.fit(X_train, y_train)

In [61]:
y_pred_lasso = pipe.predict(X_test)

In [62]:
r2_score(y_test, y_pred_lasso)

## Applying Ridge

In [63]:
ridge = Ridge()

In [64]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [65]:
pipe.fit(X_train, y_train)

In [66]:
y_pred_ridge = pipe.predict(X_test)

In [67]:
r2_score(y_test, y_pred_ridge)

In [68]:
# Side-by-side R² on the held-out split for the three fitted models.
model_predictions = (
    ('No Regularization : ', y_pred_lr),
    ('Lasso Regularization : ', y_pred_lasso),
    ('Ridge Regularization : ', y_pred_ridge),
)
for label, predictions in model_predictions:
    print(label, r2_score(y_test, predictions))