# House Price Prediction Model

<div style="text-align: center;  ">
    <img src="attachment:b83babb4-95f2-4206-a30f-fdd439cc3866.jpg" width=50% />
</div>



In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv('Housing.csv')

In [3]:
data.head(100)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6300000,4100,3,2,3,yes,no,no,no,yes,2,no,semi-furnished
96,6300000,9000,3,1,1,yes,no,yes,no,no,1,yes,furnished
97,6300000,6400,3,1,1,yes,yes,yes,no,yes,1,yes,semi-furnished
98,6293000,6600,3,2,3,yes,no,no,no,yes,0,yes,unfurnished


# Data Preprocessing

In [4]:
data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [6]:
data.shape

(545, 13)

In [7]:
data.isnull().sum()
# Every column reports zero nulls, so no missing-value handling is needed.

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

### EDA 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Outlier analysis: one box plot per numeric column on a 2x3 grid.
numeric_columns = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
fig, axs = plt.subplots(2, 3, figsize = (10,5))
# axs.flat walks the grid row by row, so the panel order follows the list above.
for column, ax in zip(numeric_columns, axs.flat):
    sns.boxplot(data[column], ax = ax)

plt.tight_layout()

In [None]:
# Outlier treatment for price: plot the current distribution, then keep only
# rows whose price lies within 1.5 * IQR of the quartiles (bounds inclusive).
plt.boxplot(data.price)
Q1, Q3 = data.price.quantile([0.25, 0.75])
IQR = Q3 - Q1
data = data[data.price.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]

In [None]:
# Outlier treatment for area: plot the current distribution, then keep only
# rows whose area lies within 1.5 * IQR of the quartiles (bounds inclusive).
plt.boxplot(data.area)
Q1, Q3 = data.area.quantile([0.25, 0.75])
IQR = Q3 - Q1
data = data[data.area.between(Q1 - 1.5*IQR, Q3 + 1.5*IQR)]

In [None]:
# Outlier analysis, repeated after the price and area filters above:
# one box plot per numeric column on a 2x3 grid.
numeric_columns = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
fig, axs = plt.subplots(2, 3, figsize = (10,5))
# axs.flat walks the grid row by row, so the panel order follows the list above.
for column, ax in zip(numeric_columns, axs.flat):
    sns.boxplot(data[column], ax = ax)

plt.tight_layout()

In [None]:
# Pairwise relationships between the numeric columns of the filtered data.
sns.pairplot(data)
plt.show()

In [None]:
# Price distribution for each categorical feature, one panel per feature on a 2x3 grid.
plt.figure(figsize=(20, 12))
categorical_columns = ['mainroad', 'guestroom', 'basement',
                       'hotwaterheating', 'airconditioning', 'furnishingstatus']
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x = column, y = 'price', data = data)
plt.show()

In [None]:
# Price by furnishing status, split by whether the house has air conditioning.
plt.figure(figsize = (10, 5))
sns.boxplot(
    data = data,
    x = 'furnishingstatus',
    y = 'price',
    hue = 'airconditioning',
)
plt.show()

# Data Preparation

In [None]:
# Columns holding yes/no answers that will be encoded as 1/0

varlist =  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Defining the map function
def binary_map(x):
    """Encode a yes/no column as 1/0.

    Parameters
    ----------
    x : pandas.Series
        Column holding 'yes' / 'no' strings (or values that are already 1 / 0).

    Returns
    -------
    pandas.Series
        Same index as ``x`` with 'yes' -> 1 and 'no' -> 0. Values that are
        already 1 or 0 are kept as they are, so running the encoding cell a
        second time does not turn the columns into NaN. Any other value maps
        to NaN.
    """
    return x.map({'yes': 1, "no": 0, 1: 1, 0: 0})

# Applying the function to the data list:
# each column named in varlist is passed to binary_map and the encoded result is written back into data
data[varlist] = data[varlist].apply(binary_map)

In [None]:
data.head()

### Dummy Variables

In [None]:
# Get the dummy variables for the feature 'furnishingstatus' and store them in a new variable - 'status'
# (one indicator column per category; `status` is rebuilt with drop_first=True further down)
status = pd.get_dummies(data['furnishingstatus'])

In [None]:
status.head()

In [None]:
# Rebuild the dummies with drop_first=True, which omits the indicator column of the first category;
# this overwrites the `status` frame created earlier

status = pd.get_dummies(data['furnishingstatus'], drop_first = True)

In [None]:
# Add the dummy columns to the data dataframe side by side (axis=1), matching rows by index

data = pd.concat([data, status], axis = 1)

In [None]:
data.head()

In [None]:
# Drop 'furnishingstatus' now that dummies have been created for it.
# NOTE(review): both dummy columns are removed here as well, so no furnishing
# information remains in the data - confirm this is intended.

data.drop(columns = ['furnishingstatus', 'semi-furnished', 'unfurnished'], inplace = True)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# The columns contain multiple types of values, so we can't create a price-per-sq column

# Saving final data set

In [None]:
# Persist the prepared data. With pandas' default index=True the row index is written as an unnamed first column.
# NOTE(review): pass index=False if readers of this file should not see that extra column.
data.to_csv("final_dataset.csv")

In [None]:
# Target is the price column; the predictors are every remaining column.
y = data['price']
x = data.drop(columns=['price'])

### Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.2,random_state=0)

In [None]:
print(x_train.shape,y_train.shape)

In [None]:
# Mirror the training-shape check for the held-out split, then preview a few
# rows instead of printing the whole feature frame and target series.
print(x_test.shape,y_test.shape)
x_test.head()

### Apply Linear Regression

In [None]:
# One-hot encode the bedroom count and pass every other column through unchanged.
# handle_unknown='ignore' encodes a bedroom count that was absent from the fitting
# rows as all zeros instead of raising at predict time (rare counts can end up
# only in the test split or in later prediction inputs).
column_trans=make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['bedrooms']),
    remainder='passthrough',
)

In [None]:
scaler=StandardScaler()

In [None]:
lr=LinearRegression()

In [None]:
# x is the feature matrix

In [None]:
# NOTE(review): this fits the scaler on every row of x (training and test rows alike).
# The same `scaler` object is fitted again on the training split when the pipeline below is fitted.
x_scaled= scaler.fit_transform(x)

In [None]:
# Fits on the full scaled feature matrix. `lr` is fitted again as the last step of the
# pipeline below, which replaces the coefficients learned here.
lr.fit(x_scaled,y)

In [None]:
# Chain bedroom one-hot encoding, standardization and the linear model, so the
# preprocessing steps are fitted only on the data passed to pipe.fit
pipe=make_pipeline(column_trans,scaler,lr)

In [None]:
pipe.fit(x_train,y_train)

In [None]:
y_pred_lr = pipe.predict(x_test)

In [None]:
r2_score(y_test,y_pred_lr)

### Using Lasso

In [None]:
lasso=Lasso()

In [None]:
# Same preprocessing steps with Lasso as the final estimator; rebinds the name `pipe`
pipe=make_pipeline(column_trans,scaler,lasso)

In [None]:
pipe.fit(x_train,y_train)

In [None]:
y_pred_lasso = pipe.predict(x_test)

In [None]:
r2_score(y_test,y_pred_lasso)

### Using Ridge

In [None]:
ridge=Ridge()

In [None]:
# Same preprocessing steps with Ridge as the final estimator; rebinds the name `pipe`,
# so this is the pipeline that gets pickled at the end of the notebook
pipe=make_pipeline(column_trans,scaler,ridge)

In [None]:
pipe.fit(x_train,y_train)

In [None]:
# Ridge predictions for the held-out rows, shown as the cell output.
# (The name is spelled 'redge'; the scoring cells below use the same spelling.)
y_pred_redge = pipe.predict(x_test)
y_pred_redge

In [None]:
r2_score(y_test,y_pred_redge)

In [None]:
# Side-by-side R^2 on the held-out split, labeled so each number can be
# attributed to its model (same order as before: Ridge, Lasso, Linear).
for model_label, predictions in [
    ("Ridge", y_pred_redge),
    ("Lasso", y_pred_lasso),
    ("Linear", y_pred_lr),
]:
    print(f"{model_label} R^2: {r2_score(y_test, predictions)}")

In [None]:
import pickle

In [None]:
# Persist the fitted pipeline. The context manager closes the file handle even
# if pickling fails, so the file is flushed and not left open.
# `pipe` was last assigned to the Ridge pipeline, hence the file name.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)