In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10) # default figure size: width, height in inches

# Load the raw housing data set.
# NOTE(review): absolute, machine-specific Windows path - this only runs where that file exists.
df1 = pd.read_csv('C:\\Users\\HP\\Downloads\\House-Price-Prediction-Project\\Bengaluru_House_Data.csv')
df1.head()

# Number of rows and columns
df1.shape

# All column names
df1.columns

# Distinct values of the 'area_type' column
df1.area_type.unique()

# Number of rows for each area type
df1.area_type.value_counts()

# Every cleaning step is stored in a new dataframe (df2, df3, ...) so earlier stages stay available.
# Drop four columns that are not referenced anywhere later in this script.
df2 = df1.drop(['area_type', 'availability', 'society', 'balcony'], axis='columns')

print('Rows and columns are = ', df2.shape)
df2.head()

# Count of missing values per column
df2.isna().sum()

# Drop every row that has a missing value, then confirm that none remain
df3 = df2.dropna()
df3.isnull().sum()

# Shape after the rows with missing values were dropped
df3.shape

# Distinct raw values of the 'size' column
df3['size'].unique()

df4 = df3.copy()

# Derive a numeric 'bhk' column: the first space-separated token of 'size', parsed as an int
df4['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df4.bhk.unique()

# Rows with more than 20 BHK
df4[df4.bhk > 20]

# Distinct raw values of the 'total_sqft' column
df4.total_sqft.unique()

def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the errors that ``float()`` raises for unconvertible input are
    treated as "not a float"; anything else (for example KeyboardInterrupt)
    propagates instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Quick check of is_float on a number and on a range string
print('is this (123) float value = %s' % (is_float(123)))
print('is this (1133 - 1384) float value = %s' % (is_float('1133 - 1384')))

# First 10 rows whose 'total_sqft' value is rejected by is_float
df4[~df4['total_sqft'].apply(is_float)].head(10)

def convert_range_to_sqft(x):
    """Convert a ``total_sqft`` value to a float.

    A string containing exactly one '-' (e.g. ``'1133 - 1384'``) is treated as
    a range and converted to the average of its two ends. Any other value is
    passed to ``float()``, so plain numbers and numeric strings are returned
    as floats.

    Returns:
        The converted float, or None when the value cannot be converted
        (e.g. text with units, or None).
    """
    try:
        # Only strings can be ranges; numeric input goes straight to float().
        if isinstance(x, str):
            tokens = x.split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return None

# Quick check of convert_range_to_sqft on a range string, two numeric strings and a float
print('converting range to sq feet = %s' % (convert_range_to_sqft('1233 - 1445')))
print('converting range to sq feet = %s' % (convert_range_to_sqft('1333')))
print('converting range to sq feet = %s' % (convert_range_to_sqft('1333.3')))
print('converting range to sq feet = %s' % (convert_range_to_sqft(3333.3)))

# Apply the conversion to the 'total_sqft' column (on a copy, so df4 is left unchanged)
df5 = df4.copy()
df5['total_sqft'] = df5['total_sqft'].apply(convert_range_to_sqft)

# Inspect the row labelled 30
# NOTE(review): presumably a row whose original total_sqft was a range - confirm against the data
df5.loc[30]

# First rows after the conversion
df5.head()

# Distinct values of 'total_sqft' after the conversion
df5.total_sqft.unique()

# First 10 rows whose total_sqft is still rejected by is_float after the conversion
df5[~df5['total_sqft'].apply(is_float)].head(10)

df6 = df5.copy()

# Add a price-per-square-foot column; the outlier filters defined later in this script read it.
# NOTE(review): the factor 100000 suggests 'price' is expressed in lakhs - confirm the unit.
df6['price_per_sqft'] = df6['price']*100000 / df6['total_sqft']

# Corrected function for removing outliers based on price_per_sqft
def remove_pps_outliers(df):
    """Keep, per location, only rows whose price_per_sqft is near the location mean.

    For every 'location' group a row is kept when its 'price_per_sqft' is
    strictly greater than ``mean - std`` and less than or equal to
    ``mean + std`` of that group. A location with a single row is therefore
    dropped entirely (its std is 0 and the row equals the mean).

    Args:
        df: DataFrame with 'location' and 'price_per_sqft' columns.

    Returns:
        A new DataFrame with the kept rows, grouped by location and with a
        fresh 0..n-1 index. If nothing can be grouped, an empty DataFrame
        with the same columns as ``df`` is returned.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        pps = subdf['price_per_sqft']
        m = np.mean(pps)
        st = np.std(pps)
        kept_groups.append(subdf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_groups:
        # pd.concat raises on an empty list; keep the column layout instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-location price_per_sqft filter to df6 and check the resulting shape
df7 = remove_pps_outliers(df6)
df7.shape

# First rows after the price_per_sqft filter
df7.head()

def plot_scatter_chart(df, location):
    """Scatter price against total area for the 2- and 3-BHK rows of one location.

    Args:
        df: DataFrame with 'location', 'bhk', 'total_sqft' and 'price' columns.
        location: Location value to plot; also used as the chart title.
    """
    # (bhk value, marker colour, legend label) for each plotted series
    series_specs = ((2, 'blue', '2 BHK'), (3, 'green', '3 BHK'))
    in_location = df.location == location
    subsets = [df[in_location & (df.bhk == rooms)] for rooms, _, _ in series_specs]

    matplotlib.rcParams['figure.figsize'] = (15, 10)
    for (_, colour, legend_label), subset in zip(series_specs, subsets):
        plt.scatter(subset.total_sqft, subset.price, color=colour, label=legend_label, s=50)

    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.title(location)
    plt.legend()

# Compare 2 BHK and 3 BHK prices in two sample locations
plot_scatter_chart(df7, "Rajaji Nagar")

plot_scatter_chart(df7, "Hebbal")

# Remove bhk outliers
def remove_bhk_outliers(df):
    """Drop homes priced per sqft below the average of the next-smaller BHK.

    Within each location, a row with N bedrooms is removed when its
    'price_per_sqft' is below the mean 'price_per_sqft' of the (N-1)-bedroom
    rows in the same location, provided there are more than 5 such
    (N-1)-bedroom rows.

    Args:
        df: DataFrame with 'location', 'bhk' and 'price_per_sqft' columns.

    Returns:
        A new DataFrame without the excluded rows; the remaining rows keep
        their original index labels.
    """
    # Labels are collected in a plain list so they keep their original type
    # (a float ndarray would coerce integer labels to float).
    labels_to_drop = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df['price_per_sqft']),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            if smaller and smaller['count'] > 5:
                below = bhk_df['price_per_sqft'] < smaller['mean']
                labels_to_drop.extend(bhk_df.index[below.to_numpy()])
    return df.drop(labels_to_drop, axis='index')

# Apply the BHK-based filter to df7 and check the resulting shape
df8 = remove_bhk_outliers(df7)
df8.shape

# First rows after the BHK-based filter
df8.head()

# Get the plotting function for plotting histogram for price_per_sqft
def plot_histogram(df):
    """Draw a 20-bin histogram of the 'price_per_sqft' column of ``df``."""
    matplotlib.rcParams["figure.figsize"] = (20,10)
    values = df['price_per_sqft']
    plt.hist(values, bins=20, rwidth=0.8)
    plt.xlabel("Price Per Square Feet")
    plt.ylabel("Count")

plot_histogram(df8)

# One-hot encode the 'location' column

df9 = df8.copy()

# Strip surrounding whitespace from location names, then count rows per location
df9['location'] = df9.location.apply(lambda x: x.strip())
location_stats = df9['location'].value_counts(ascending=False)
location_stats

# Locations with 10 or fewer rows
location_stats[location_stats <= 10]

# Locations with 10 or fewer rows will be grouped into a single 'other' category
location_stats_less_than_10 = location_stats[location_stats <= 10]
location_stats_less_than_10

# Replace those locations with 'other'.
# `x in <Series>` tests the Series index, which here holds the location names.
df9.location = df9.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
df9.head(10)

# Number of distinct locations after grouping
len(df9.location.unique())

# First rows after grouping the rare locations
df9.head()

# The model needs numeric inputs, so expand 'location' into one indicator
# column per location with pandas get_dummies
dummies = pd.get_dummies(df9['location'])
dummies.head(10)

# Append the indicator columns to the data; the 'other' column is left out
df10 = pd.concat([df9, dummies.drop('other', axis='columns')], axis='columns')

df10.head()

# The original 'location' column is no longer needed
df10 = df10.drop('location', axis='columns')

df10.head(2)

df10.shape

# Features: drop the target ('price'), the raw 'size' string (already encoded
# as 'bhk') and 'price_per_sqft'. The latter is computed from 'price', so
# keeping it would leak the target into the model, and predict_price never
# fills it in.
X = df10.drop(['price', 'size', 'price_per_sqft'], axis='columns')
X.head()

# Target: the price column
y = df10.price
y.head()

# Hold out 20% of the rows for testing (fixed random_state so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Fit a linear regression on the training split and score it on the held-out split
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

# Cross-validate LinearRegression on 5 random 80/20 shuffle splits
# (ShuffleSplit draws independent random splits; it is not K-fold)
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

# Find best model using GridSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X, y):
    """Grid-search three regressors and report the best result of each.

    Runs GridSearchCV over LinearRegression, Lasso and DecisionTreeRegressor
    with a small parameter grid each, using 5 random 80/20 shuffle splits.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        DataFrame with one row per algorithm and the columns 'model',
        'best_score' and 'best_params'.
    """
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {
                # `normalize` was removed from LinearRegression in
                # scikit-learn 1.2; search over a parameter that still exists.
                'fit_intercept': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                # 'mse' was renamed to 'squared_error' (old name removed in 1.2)
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }

    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

# Compare the candidate models on the full feature matrix and target
find_best_model_using_gridsearchcv(X, y)

def predict_price(location, sqft, bath, bhk):
    """Predict the price of a home with the fitted ``lr_clf`` model.

    Builds one feature row in the column order of the module-level ``X``:
    the first three positions receive ``sqft``, ``bath`` and ``bhk``, and the
    column named ``location`` (if any) is set to 1.

    Args:
        location: Location name; must equal a column name of ``X`` to be
            encoded. A location without a column leaves every location
            indicator at 0 instead of raising IndexError.
        sqft: Total area, written to feature position 0.
        bath: Number of bathrooms, written to feature position 1.
        bhk: Number of bedrooms, written to feature position 2.

    Returns:
        The model's prediction for the constructed row.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where returns an empty array when no column matches the location.
    matches = np.where(X.columns == location)[0]
    if matches.size > 0:
        x[matches[0]] = 1

    return lr_clf.predict([x])[0]

# Sample predictions: two locations, each with a 2 bath / 2 BHK and a 3 bath / 3 BHK home of 1000 sqft
predict_price('1st Phase JP Nagar', 1000, 2, 2)

predict_price('1st Phase JP Nagar', 1000, 3, 3)

predict_price('Indira Nagar', 1000, 2, 2)

predict_price('Indira Nagar', 1000, 3, 3)

# Persist the fitted linear regression model
import pickle

with open('Real_Estate_Price_Prediction_Project.pickle', 'wb') as f:
    pickle.dump(lr_clf, f)

# Save the feature column names (lower-cased, in the order of X.columns) as JSON
import json
columns = {
    'data_columns': [col.lower() for col in X.columns]
}
with open("columns.json", "w") as f:
    f.write(json.dumps(columns))

print("Model and columns JSON saved successfully.")
