Case Study: House Price Prediction

# Set Up

In [171]:
# Importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import (datasets,linear_model,model_selection,metrics)
import warnings
# Silence every Python warning from this point on to keep cell output compact.
# NOTE(review): this blanket filter also hides deprecation notices raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [172]:
# Read the raw housing records from the CSV file into a DataFrame
house = pd.read_csv(filepath_or_buffer="Housing.csv")

# Preview the first five records to sanity-check the load
house.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [173]:
# Summarise the dataset: column names, non-null counts, dtypes and memory usage
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [174]:
# Data Preparation
# Encode the yes/no amenity flags as 1/0 so they can be used as numeric
# regressors.
binary_columns = ['mainroad', 'guestroom', 'basement',
                  'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_to_int = {'yes': 1, 'no': 0}

for column in binary_columns:
    # Re-running this cell used to push the already-encoded 1/0 values through
    # the yes/no dictionary and silently turn the whole column into NaN, so
    # skip any column that a previous run has already encoded.
    if house[column].isin([0, 1]).all():
        continue

    encoded = house[column].map(yes_no_to_int)

    # Series.map yields NaN for every value missing from the dictionary; fail
    # loudly here instead of letting NaN reach the scaler / regression later.
    if encoded.isna().any():
        unexpected = sorted(set(house.loc[encoded.isna(), column].astype(str)))
        raise ValueError(
            f"Column {column!r} contains values other than 'yes'/'no': {unexpected}"
        )

    house[column] = encoded

In [175]:
# Preview the first rows again to confirm the yes/no columns now hold 1/0
house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [176]:
# One-hot encode the furnishing status: one indicator column per category
status = pd.get_dummies(data=house['furnishingstatus'])

In [177]:
# Cast the indicator columns to plain integers so they display as 0/1
status = status.astype(int)

In [178]:
# Display the encoded indicator columns
status

Unnamed: 0,furnished,semi-furnished,unfurnished
0,1,0,0
1,1,0,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
540,0,0,1
541,0,1,0
542,0,0,1
543,1,0,0


In [179]:
# Removing the first dummy column: the two remaining indicators are enough,
# because a row with 0 in both of them is a 'furnished' house.
status = pd.get_dummies(house['furnishingstatus'], drop_first=True, dtype=int)
status

Unnamed: 0,semi-furnished,unfurnished
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
540,0,1
541,1,0
542,0,1
543,0,0


In [180]:
# Append the furnishing indicators to the house features, column-wise
housing = pd.concat(objs=[house, status], axis="columns")
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,0,0


In [181]:
# The text column is redundant now that its indicator columns are present
housing = housing.drop(columns=['furnishingstatus'])

In [182]:
# Confirm that the 'furnishingstatus' text column is gone
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [183]:
# Creating the derived variables: two per-bedroom ratios
bedroom_count = housing['bedrooms']
housing['areaperbedroom'] = housing['area'].div(bedroom_count)   # area per bedroom
housing['bbratio'] = housing['bathrooms'].div(bedroom_count)     # bathrooms per bedroom

housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0,1855.0,0.5
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0,2240.0,1.0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0,3320.0,0.666667
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0,1875.0,0.5
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0,1855.0,0.25


In [184]:
# Rescaling the data: map every column (the target 'price' included) to [0, 1]
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the min/max used.
scaler = MinMaxScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(housing),
    columns=housing.columns,
)

In [185]:
# Display the rescaled frame (pandas truncates the middle rows in the output)
scaled_data

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished,areaperbedroom,bbratio
0,1.000000,0.396564,0.6,0.333333,0.666667,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,0.0,0.0,0.237016,0.4
1,0.909091,0.502405,0.6,1.000000,1.000000,1.0,0.0,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.298923,1.0
2,0.909091,0.571134,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,1.0,0.0,0.472584,0.6
3,0.906061,0.402062,0.6,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,1.000000,1.0,0.0,0.0,0.240232,0.4
4,0.836364,0.396564,0.6,0.000000,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,0.0,0.0,0.237016,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,0.006061,0.092784,0.2,0.000000,0.000000,1.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,1.0,0.179932,0.4
541,0.001485,0.051546,0.4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.067374,0.2
542,0.000000,0.135395,0.2,0.000000,0.000000,1.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.0,0.229780,0.4
543,0.000000,0.086598,0.4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.094710,0.2


In [186]:
# List all column names (the feature list in the next cell mirrors this output)
housing.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'semi-furnished', 'unfurnished',
       'areaperbedroom', 'bbratio'],
      dtype='object')

In [187]:
# Predictors: every scaled column except the target
feature_columns = [
    'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
    'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
    'parking', 'prefarea', 'semi-furnished', 'unfurnished',
    'areaperbedroom', 'bbratio',
]
x = scaled_data[feature_columns]

# Target: the scaled price as a single-column 2-D array
y = scaled_data['price'].to_numpy().reshape(-1, 1)

In [188]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

In [189]:
# Baseline model: ordinary least squares on the full predictor set
import statsmodels.api as sm

# Add an explicit constant column so the regression includes an intercept term
x_train = sm.add_constant(x_train)
ols = sm.OLS(endog=y_train, exog=x_train).fit()
print(ols.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     53.12
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           4.56e-82
Time:                        00:18:10   Log-Likelihood:                 384.40
No. Observations:                 381   AIC:                            -736.8
Df Residuals:                     365   BIC:                            -673.7
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0603      0.059     

In [190]:
# Multicollinearity 
# Multicollinearity occurs when two or more independent variables in a regression model are highly correlated with each other

VIF (Variance Inflation Factor) — rule-of-thumb interpretation:

VIF = 1: No correlation.

VIF = 2-4: Acceptable level 

VIF = 5-10: Higher multicollinearity

VIF > 10: Severe multicollinearity

In [191]:
# Compute the variance inflation factor (VIF) for every column of the design matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# NOTE(review): x_train already received a constant column in the previous
# cell, so this call looks like a safeguard only — confirm and drop if unneeded.
x_train = add_constant(x_train)

design_matrix = x_train.values
vif = pd.DataFrame({
    'Feature': x_train.columns,
    'VIF': [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ],
})
print(vif)

            Feature         VIF
0             const  161.030359
1              area   16.875720
2          bedrooms    8.769076
3         bathrooms   18.696773
4           stories    1.520904
5          mainroad    1.182234
6         guestroom    1.254574
7          basement    1.313231
8   hotwaterheating    1.090792
9   airconditioning    1.254590
10          parking    1.251214
11         prefarea    1.179810
12   semi-furnished    1.583655
13      unfurnished    1.652640
14   areaperbedroom   19.410346
15          bbratio   18.856422


In [192]:
# Remove 'bbratio' (one of the predictors whose VIF exceeded 10) and refit
x_train = x_train.drop(columns=['bbratio'])

# Second model, fitted on the reduced predictor set
lm_2 = sm.OLS(endog=y_train, exog=x_train).fit()
print(lm_2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.686
Model:                            OLS   Adj. R-squared:                  0.674
Method:                 Least Squares   F-statistic:                     57.03
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           6.46e-83
Time:                        00:18:10   Log-Likelihood:                 384.31
No. Observations:                 381   AIC:                            -738.6
Df Residuals:                     366   BIC:                            -679.5
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0799      0.036     

In [193]:
# Recompute the VIF table for the predictors that remain in x_train
column_count = len(x_train.columns)
vif = pd.DataFrame({
    'Feature': x_train.columns,
    'VIF': [variance_inflation_factor(x_train.values, col_idx)
            for col_idx in range(column_count)],
})
print(vif)

            Feature        VIF
0             const  59.772087
1              area  13.242509
2          bedrooms   4.965341
3         bathrooms   1.263345
4           stories   1.516637
5          mainroad   1.179692
6         guestroom   1.242227
7          basement   1.311034
8   hotwaterheating   1.078050
9   airconditioning   1.254508
10          parking   1.250492
11         prefarea   1.175177
12   semi-furnished   1.582991
13      unfurnished   1.652630
14   areaperbedroom  14.991231


In [194]:
# 'areaperbedroom' had the highest VIF among the predictors, so drop it next
x_train = x_train.drop(columns=['areaperbedroom'])

# Rebuild the VIF table to check how the remaining predictors are affected
remaining_matrix = x_train.values
vif_values = [
    variance_inflation_factor(remaining_matrix, idx)
    for idx in range(remaining_matrix.shape[1])
]
vif = pd.DataFrame({'Feature': x_train.columns, 'VIF': vif_values})
print(vif)

            Feature        VIF
0             const  20.691598
1              area   1.383515
2          bedrooms   1.370875
3         bathrooms   1.262986
4           stories   1.483358
5          mainroad   1.179477
6         guestroom   1.221461
7          basement   1.310250
8   hotwaterheating   1.076673
9   airconditioning   1.253686
10          parking   1.235269
11         prefarea   1.157845
12   semi-furnished   1.575186
13      unfurnished   1.648956


In [195]:
# Third model: refit on the predictors left after removing 'areaperbedroom'
x_train = sm.add_constant(x_train)
lm_3 = sm.OLS(endog=y_train, exog=x_train).fit()
print(lm_3.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.670
Method:                 Least Squares   F-statistic:                     60.40
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           8.83e-83
Time:                        00:18:10   Log-Likelihood:                 381.79
No. Observations:                 381   AIC:                            -735.6
Df Residuals:                     367   BIC:                            -680.4
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0162      0.021     

In [196]:
# Drop 'semi-furnished' and recompute the VIF table.
# NOTE(review): its VIF was already low, so presumably it was removed for a
# high p-value in the lm_3 summary — confirm against that output.
x_train = x_train.drop(columns=['semi-furnished'])

features_now = x_train.columns
vif = pd.DataFrame({
    'Feature': features_now,
    'VIF': [variance_inflation_factor(x_train.values, k)
            for k in range(len(features_now))],
})
print(vif)

            Feature        VIF
0             const  16.818191
1              area   1.366571
2          bedrooms   1.370775
3         bathrooms   1.261795
4           stories   1.482277
5          mainroad   1.177943
6         guestroom   1.221078
7          basement   1.310077
8   hotwaterheating   1.076396
9   airconditioning   1.244056
10          parking   1.234872
11         prefarea   1.157738
12      unfurnished   1.066012


In [197]:
# Fourth model: refit on the predictors left after removing 'semi-furnished'
x_train = sm.add_constant(x_train)
lm_4 = sm.OLS(endog=y_train, exog=x_train).fit()
print(lm_4.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.671
Method:                 Least Squares   F-statistic:                     65.61
Date:                Sun, 13 Apr 2025   Prob (F-statistic):           1.07e-83
Time:                        00:18:35   Log-Likelihood:                 381.79
No. Observations:                 381   AIC:                            -737.6
Df Residuals:                     368   BIC:                            -686.3
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0169      0.019     