In [105]:
# Import packages we need for data exploration.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

## Data Cleaning

In [106]:
# Load the data into a pandas dataframe.
df = pd.read_csv(r"../input/bengaluru-house-price-data/Bengaluru_House_Data.csv")
df.head()

In [107]:
#  rows, columns
df.shape

In [108]:
# Number of houses in each area type. 
df.groupby('area_type')['area_type'].agg('count')

Some columns seem to be redundant ("area_type", "availability", "society" and "balcony").
I'll drop these columns, but as a best practice you should consult your real estate manager and discuss how important these columns are in deciding the price of a house.

In [109]:
# Get rid of unnecessary columns.
df1 = df.drop(['area_type', 'availability', 'society', 'balcony'], axis=1)
df1.head()

In [110]:
# A count of the null data points in each column.
df1.isnull().sum()

In [111]:
# Drop every row that has a null data point. Alternatively, you can fill in the missing values
# with an imputation/interpolation method of your choice if you do not wish to lose data entries.
df2 = df1.dropna()
print(df2.shape)
df2.isnull().sum()

The values in the size column are inconsistent. Let's take a look at them.

In [112]:
df2['size'].unique()

In [113]:
# I'll create a new column that takes the number from the size column, e.g. '2 BHK' becomes 2.
# assign() builds a new frame instead of writing a column into the result of dropna(),
# which can trigger pandas' SettingWithCopyWarning, and it keeps this cell safe to re-run.
df2 = df2.assign(bhk=df2['size'].apply(lambda x: int(x.split(' ')[0])))
df2.head()

In [114]:
df2.bhk.unique()

In [115]:
# The entries in the dataset where bhk is > 15.
df2[df2.bhk>15]

The values in total_sqft column seem to be conflicting as well. Let's have a look.

In [116]:
df2.total_sqft.unique()

There are several kinds of values in this column, such as ranges (0000 - 1111) and values given in other units of measurement. We'll address that right away.

In [117]:
# Despite its name, this helper does not convert anything: it only reports whether
# x (an entry of the total_sqft column) can be parsed as a float.
def convertToFloat(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Parameters
    ----------
    x : any
        The value to test, e.g. a string such as '1200' or '2100 - 2850'.

    Returns
    -------
    bool
        True when x is parseable as a float, False when it is not.
    """
    try:
        float(x)
    # Only the errors float() raises for bad input are treated as "not a float";
    # a bare except would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [118]:
# The entries in the dataset where the total_sqft is not a float; using the (~).
df2[~df2['total_sqft'].apply(convertToFloat)]

A simple approach to address this matter is to take the mean average of the range numbers. <br>
(a+b)/2

In [119]:
# This function takes x (a total_sqft entry) and returns it as a float; a range 'a - b'
# is replaced by the mean of its two bounds.
def convertRangeToFloat(x):
    """Convert a total_sqft entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ('1200'), or a range written as 'a - b'
        ('2100 - 2850').

    Returns
    -------
    float or None
        The number itself, the mean (a + b) / 2 of a range, or None when the
        entry cannot be interpreted (other units, non-numeric range bounds,
        more than two parts, missing values).
    """
    # Only strings can hold a range; anything else is treated as a single value.
    parts = x.split(' - ') if isinstance(x, str) else [x]
    if len(parts) > 2:
        return None
    try:
        values = [float(part) for part in parts]
    except (TypeError, ValueError, OverflowError):
        # A non-numeric part (e.g. '20Sq. Yards') makes the whole entry unusable.
        return None
    return sum(values) / len(values)

In [120]:
convertRangeToFloat('2334 - 7890')

In [121]:
# I'll create a new dataframe and apply the above function to the total_sqft column.
df3 = df2.copy()
df3.total_sqft = df3.total_sqft.apply(convertRangeToFloat)
df3.head()

In [122]:
# Entries in the 30th index.
df3.loc[30]

In [123]:
# A quick check of the function's output.
(2100+2850)/2

Perfect!

In [124]:
df3.head()

In [125]:
# A new column 'price per sqft'; the quotient of price/total_sqft.
df4 = df3.copy()
df4['price_per_sqft'] = df4['price']*100000/df4['total_sqft'] # I multiply the price by 100000; 1 lakh = 100000
df4.head()

This new column gives a general sense of the actual cost of a house and can be used in outlier<br>
filtering as it gives a clear relative comparison of each data entry. 

In [126]:
# Number of unique locations in the dataset.
len(df4.location.unique())

In [127]:
# Number of houses in each location.
df4.location = df4.location.apply(lambda x: x.strip())

location_tally = df4.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_tally.head(30)

In [128]:
# Number of locations that have <= 10 houses in the dataset.
len(location_tally[location_tally<=10]) # This is a series so this type of syntax works.

In [129]:
sparse_locations = location_tally[location_tally<=10]
sparse_locations

I think it's fair to tie up these sparse locations into one collective variable "others"

In [130]:
# I'm creating a new location value 'others' for all the locations that have 10 or fewer houses.
df4.location = df4.location.apply(lambda x: 'others' if x in sparse_locations else x)
len(df4.location.unique())

The locations are now more concise than earlier.

In [131]:
df4.head(10)

There are some anomalies in the data where entries have a high bhk value and a low total_sqft<br>
value. For example, the row at index 9 of the above dataframe has bhk=6 and total_sqft=1020. <br>
The square feet per bhk should not fall below a sensible minimum.<br>
Normally you would ask your real estate manager for a good threshold to use. <br>
I'll use 320 square feet per bhk.

In [132]:
# The entries in the dataset where the quotient of the total_sqft/bhk < 320.
df4[df4.total_sqft/df4.bhk<320].head()

## Outlier filtering

In [133]:
# I create a new dataframe without the entries that have (total_sqft/bhk) < 320,
# the same threshold used in the preview of these entries.
# The negated comparison (rather than >=) keeps rows whose total_sqft is missing.
df5 = df4[~(df4.total_sqft/df4.bhk<320)]
df5.shape

In [134]:
# Some statistical info about the price_per_sqft column.
df5.price_per_sqft.describe()

From the above we can see that there are houses that are overly cheap(min) and some that are<br> overly expensive(max). Since I'm trying to build a generic model that will serve the larger majority,<br>it will be fair to get rid of these extreme entries (outliers).

In [135]:
# I'll create a function that filters out overly expensive and overly cheap houses.
def filter_outliers1(dataframe):
    '''
        Remove, per location, the entries whose price_per_sqft is far from
        that location's average.

        The dataframe is grouped by 'location'. For each location we compute
        the mean and the (population, ddof=0) standard deviation of
        price_per_sqft and keep only the rows with

            mean - std < price_per_sqft <= mean + std

        Parameters: dataframe -- must contain 'location' and 'price_per_sqft'.
        Returns: a new dataframe with the kept rows, ordered by location and
        with a fresh 0..n-1 index. An input without any groups yields an
        empty dataframe with the same columns.
    '''
    kept_groups = []
    for _, minidf in dataframe.groupby('location'):
        prices = minidf.price_per_sqft
        mean = prices.mean()
        std = prices.std(ddof=0)  # ddof=0 matches np.std's default
        kept_groups.append(minidf[(prices > (mean - std)) & (prices <= (mean + std))])
    if not kept_groups:
        # pd.concat refuses an empty list; keep the schema for downstream cells.
        return dataframe.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a dataframe inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

In [136]:
df6 = filter_outliers1(df5)
df6.shape

We got rid of quite some entries there.

In [137]:
# Number of houses for each bhk value.
df6.groupby('bhk')['bhk'].agg('count')

Apparently, most houses have 2 or 3 bhk.

In [138]:
# Let's see how the prices of 2 bhk and 3 bhk vary in the same location.
def scatter_plot(dataframe, location):
    """Scatter price against total_sqft for the 2 and 3 bhk houses of one location.

    Parameters
    ----------
    dataframe : DataFrame with 'location', 'bhk', 'total_sqft' and 'price' columns.
    location : the location value to plot.

    Draws on the current matplotlib figure; returns nothing.
    """
    in_location = dataframe[dataframe.location==location]
    bhk2 = in_location[in_location.bhk==2]
    bhk3 = in_location[in_location.bhk==3]
    plt.scatter(bhk2.total_sqft, bhk2.price, color='cyan', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price, marker='+', color='pink', label='3 BHK', s=50)
    plt.xlabel('Total square feet area')
    # The y values are the 'price' column (expressed in lakh), not price per square foot.
    plt.ylabel('Price (Lakh)')
    plt.title(f'Price comparison of houses in {location} with 2 or 3 bedrooms')
    plt.legend()

scatter_plot(df6, 'Marathahalli')

See how some 3 bhk houses cost less than 2 bhk houses with the same total square feet area.<br>
Normally you would expect that, in the same location, a house with more bhk should<br>cost more than one with fewer bhk, provided the square feet area is roughly the same.
<br>
So I'll go ahead and write a function that filters these outliers.

In [139]:
def filter_outliers2(dataframe):
    '''Remove houses that are cheaper per square foot than the average house
       with one bhk less in the same location.

       First, I group the dataframe by location; location_df.

       Second, for each location I group 'location_df' by 'bhk' and record, in the
       dictionary 'bhk_info', the mean price per square feet and the number of
       houses for every bhk value.

       Third, for every bhk group I look up the stats of the (bhk - 1) group of the
       same location. If that smaller group has more than 5 houses, every house of
       the current group whose price per square feet is below the smaller group's
       mean is marked for removal.

       Finally, the marked index labels are dropped from the dataframe.

       Parameters: dataframe -- must contain 'location', 'bhk' and 'price_per_sqft'.
       Returns: a new dataframe without the marked rows (index labels are kept).
      '''
    # A plain list keeps the index labels in their original dtype; appending to a
    # float numpy array would coerce them to float64 and copy the array each time.
    labels_to_drop = []
    for _, location_df in dataframe.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_info = {
            bhk: {
                'mean': bhk_df.price_per_sqft.mean(),
                'count': bhk_df.shape[0]
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            info = bhk_info.get(bhk-1)
            if info is not None and info['count']>5: # Only ones greater than 5.
                cheaper = bhk_df.price_per_sqft<info['mean']
                labels_to_drop.extend(bhk_df.index[cheaper])
    return dataframe.drop(index=labels_to_drop)

df7 = filter_outliers2(df6)
df7.shape

In [140]:
# I'd run the scatter plot function again to see how well we did with filtering the outlier. 
scatter_plot(df7, 'Marathahalli')

We succeeded in filtering those outliers.<br>
Let's look at the bath column; there might be a few outliers in that column as well.

In [142]:
df7.bath.unique()

In [143]:
# The entries in the dataset where bath is greater than 9.
df7[df7.bath>9]

In [144]:
# I'll use a histogram to see the most common number of bathrooms in the dataset.
plt.hist(df7.bath, rwidth=.7)
plt.xlabel('Number of bathrooms')
plt.ylabel('Counts')

Now there are some houses that have an unusual number of bathrooms. Let's check them out.

In [145]:
# Entries in the dataset where the number of bathrooms is more than the bhk+2.
df7[df7.bath>df7.bhk+2]

In [146]:
# I create a new dataframe that keeps only the rows where bath < bhk + 2.
# Note: this also removes rows with exactly bhk + 2 bathrooms, which the bath > bhk + 2 check above does not list.
df8 = df7[df7.bath<df7.bhk+2]
df8.shape

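I used the price_per_sqft column to do a lot of outlier filtering, but I have to drop<br> it now: it is computed from the price (the target), so it must not be used as a feature for training the regression model.<br>
The size column is dropped as well, since the bhk column already holds its information.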

In [147]:
df9 = df8.drop(['size', 'price_per_sqft'], axis=1)
df9.head(5)

There are over 200 locations in the dataset, and they are all of string data type.<br>
Most machine learning models cannot work with strings directly, hence<br>
I will use a one-hot encoding approach facilitated by pd.get_dummies, to create<br>
a new dataframe of zeros and ones with one column per location.<br>
For example, if the location is 'Electronic City', the 'Electronic City' column will have the<br> value 1 and every other column will be zero.

In [148]:
dummies = pd.get_dummies(df9.location)
dummies.tail()

In [149]:
# We concatenate our dummies dataframe to our main dataframe.
df10 = pd.concat([df9, dummies.drop('others', axis=1)], axis=1)
df10.head()

I dropped the 'others' column to prevent the dummy variable trap. <br>
So when all the columns are zero then it will entail 'others' location.

In [150]:
# Create a new dataframe and drop the location column as it isn't needed any longer. 
df11 = df10.drop(['location'], axis=1)
df11.head()

In [151]:
df11.shape

In [152]:
# Create a feature variable 'X', which holds the data for prediction with 'price' dropped.
X = df11.drop('price', axis=1)
X

In [153]:
# We create the targets variable 'Y' which holds the price.
Y = df11.price
Y

In [154]:
# We'll split the data into training and testing chunks using train_test_split.
from sklearn.model_selection import train_test_split as tts
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=.1, random_state=42)

In [155]:
# Fit an ordinary least-squares linear regression on the training split and
# evaluate it on the held-out test split.
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train.values, Y_train.values) # Training step!
# NOTE: for a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy; the value printed below is R^2 times 100.
accuracy = (clf.score(X_test.values, Y_test.values))*100 
print(f'Accuracy of the model: {(accuracy):.1f}%')

In [156]:
# I've implemented a function that uses the model to predict price.
def predictPrice(location, sqft, bath, bhk):
    """Predict the price of a house with the trained model `clf`.

    Parameters
    ----------
    location : str
        Name of the location. A location that has no dummy column in `X`
        (including 'others', whose column was dropped) is encoded as all
        zeros, i.e. it is treated as 'others'.
    sqft : total square feet area.
    bath : number of bathrooms.
    bhk : number of bhk.

    Returns
    -------
    The model's predicted price, in the same unit as the 'price' column.
    """
    numeric_features = 3  # sqft, bath and bhk occupy the first three positions of x
    x = np.zeros(len(X.columns)) # Create an array of zeros
    x[0] = sqft  # assign sqft as the first input
    x[1] = bath # assign bath as the second input
    x[2] = bhk  # assign bhk as the third input
    matches = np.where(X.columns==location)[0]  # Positions of the location's dummy column, if any
    # Only set a one-hot flag for a real location column; an unknown location leaves
    # all flags at zero instead of raising IndexError.
    if matches.size > 0 and matches[0] >= numeric_features:
        x[matches[0]] = 1 # assign 1 wherever the location should be in the array.
    return clf.predict([x])[0]

Note: The arrangement of the values in the __x__ is very important because that's<br>
the structure of the data when we were training the model.

In [159]:
price = predictPrice('1st Block Jayanagar', 2000, 4, 5)
print(f"Price: {price:.2f}Lakh")

In [160]:
price = predictPrice('Electronic City', 1200, 4, 4)
print(f"Price: {price:.2f}Lakh")

In [161]:
price = predictPrice('2nd Stage Nagarbhavi', 900, 2, 3)
print(f"Price: {price:.2f}Lakh")

Clearly, houses appear to be more expensive in some locations and cheaper in others. <br>
And the model picked up on that too.

In [None]:
# ifunanyaScript