# Housing price prediction using Machine Learning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,KFold
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the housing dataset. The path is written as a raw string so every
# backslash is literal; the previous literal mixed escaped "\\" with an
# unescaped "\H", which is an invalid escape sequence (a warning today, an
# error in a future Python). The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path — consider a location relative
# to the notebook so the analysis can be re-run elsewhere.
data=pd.read_csv(r"D:\AI,ML,DS\Projects\house_price_predictions\Housing Price.csv")

In [4]:
# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


# Data Cleaning

In [None]:
# Count of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
data.duplicated(keep="first").sum()

# Exploratory Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Scatter-plot matrix of every pair of columns, with per-column distributions
# on the diagonal.
sns.pairplot(data)
# plt.show() renders the figure; the previous plt.plot() call only added an
# empty line to the current axes and echoed "[]" as the cell output.
plt.show()

In [None]:
# Pairwise correlations between columns, annotated with their values and drawn
# on a fixed [-1, 1] colour scale.
corr_matrix=data.corr()
sns.heatmap(corr_matrix,annot=True,vmin=-1,vmax=1)
plt.show()

# Machine Learning

In [None]:
# Look at the column order again: the feature/target split in the next cell is positional.
data.head(n=5)

In [None]:
# Positional split: the first five columns become the feature matrix and the
# sixth column the target, both as plain NumPy arrays.
# NOTE(review): this assumes the CSV holds exactly the five predictors followed
# by Price. If the file also carries a saved index column (e.g. "Unnamed: 0"),
# these positions shift by one — confirm against data.columns.
x,y=data.iloc[:,0:5].values,data.iloc[:,5].values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.2,
)

# Linear Regression

In [None]:
# Ordinary least-squares regressor with scikit-learn's default settings.
lm=LinearRegression()

In [None]:
# Fit the linear model on the training split only.
lm.fit(X=x_train,y=y_train)

In [None]:
# Fitted slope for each feature, in the same order as the columns of x.
lm.coef_

In [None]:
# Names of the five columns that were used as features.
data.columns[:5]

In [None]:
# One-column table pairing every feature name with its fitted coefficient.
pd.DataFrame({"Coefficient":lm.coef_},index=data.columns[:5])

In [None]:
# Fitted intercept term of the linear model.
lm.intercept_

In [None]:
# R^2 measured on the training split itself (an in-sample fit measure, not a
# held-out estimate).
lm.score(x_train,y_train)

In [None]:
# Linear-model predictions for the held-out rows. y_pred is reused (overwritten)
# by later sections.
y_pred=lm.predict(x_test)

In [None]:
# statsmodels' OLS does not add an intercept by itself, so prepend a column of
# ones to the training features.
x_with_constant=sm.add_constant(x_train)

In [None]:
# Same regression expressed in statsmodels, to get a full inference report.
lm_sm=sm.OLS(endog=y_train,exog=x_with_constant)

In [None]:
# Estimate the statsmodels OLS model; `result` holds the fitted results object.
result=lm_sm.fit()

In [None]:
# Text report of the fit (coefficients, standard errors, p-values, R^2, diagnostics).
print(result.summary())

In [None]:
# Variance inflation factor of every predictor, printed in feature order.
# Each VIF comes from regressing one column on all the others, so the design
# matrix must contain an intercept column; without it those auxiliary
# regressions are forced through the origin and the factors are inflated.
# add_constant prepends the intercept, so column 0 is the constant and is skipped.
x_vif=sm.add_constant(x_train)
for col in range(1,x_vif.shape[1]):
    print(variance_inflation_factor(x_vif,col))

In [None]:
# Training residuals: observed target minus the linear model's fitted value.
train_fitted=lm.predict(x_train)
resid=y_train-train_fitted

In [None]:
# Distribution of the training residuals (histogram with a density curve).
# NOTE(review): distplot has been deprecated since seaborn 0.11; histplot /
# displot are the documented replacements — switch once the seaborn version
# used for this notebook is confirmed.
sns.distplot(resid)
plt.show()

In [None]:
# Residuals against fitted values on the training split, to eyeball
# heteroscedasticity or leftover structure.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there (keywords work on older
# releases too).
sns.scatterplot(x=lm.predict(x_train),y=resid)
plt.show()

# KNN Regression

In [None]:
# Cross-validated RMSE for k = 1..20 neighbours; Errors[i] belongs to K[i].
K=np.arange(1,21)
Errors=[]

for k in K:
    model=KNeighborsRegressor(n_neighbors=k)
    # scikit-learn reports the negated MSE for each of the 10 folds.
    neg_mse=cross_val_score(model,x_train,y_train,scoring="neg_mean_squared_error",cv=10)
    # Flip the sign and take the root to get a per-fold RMSE, then average the folds.
    cvals=np.sqrt(-neg_mse)
    Errors.append(cvals.mean())

In [None]:
# Cross-validated RMSE against k: red solid line with an "x" marker at every k.
plt.plot(K,Errors,color="r",marker="x",linestyle="-")
plt.show()

In [None]:
# KNN regressor with a fixed k of 4.
# NOTE(review): presumably read off the error curve plotted above — confirm,
# since the value is not derived from Errors in code.
knn = KNeighborsRegressor(n_neighbors=4)

In [None]:
# Fit the KNN regressor on the training split.
knn.fit(X=x_train, y=y_train)

In [None]:
# Held-out predictions of the KNN model fitted in this section. The previous
# line called lm.predict, a copy-paste slip that silently produced the linear
# model's predictions instead.
y_pred=knn.predict(x_test)

# Random Forest Regression

In [None]:
# Forest sizes to compare in the grid search.
params={"n_estimators":[100,200,300,400,500]}
# Seed the forest (same seed as the train/test split) so the ranking of the
# candidates is reproducible between runs rather than driven by bootstrap noise.
model=RandomForestRegressor(random_state=0)
# Plain 5-fold split, without shuffling, shared by every candidate.
cval=KFold(n_splits=5)

In [None]:
# Exhaustive search over `params` using the 5-fold splitter. No `scoring` is
# given, so candidates are ranked by the estimator's own score method
# (R^2 for regressors).
gsearch=GridSearchCV(model,params,cv=cval)

In [None]:
# Run the search on the training split. fit() returns the search object
# itself, so `results` and `gsearch` refer to the same fitted instance.
results=gsearch.fit(x_train,y_train)
# Best-scoring parameter combination found.
results.best_params_

In [None]:
# Final random forest. The number of trees is taken from the grid search
# result instead of a value copied by hand from its output, and the model is
# seeded so that re-running the notebook reproduces the same forest.
rf = RandomForestRegressor(n_estimators=results.best_params_["n_estimators"], random_state=0)

In [None]:
# Fit the random forest on the training split.
rf.fit(X=x_train, y=y_train)

In [None]:
# Random-forest predictions for the held-out rows (overwrites the earlier y_pred).
y_pred=rf.predict(x_test)

In [None]:
# Actual vs. predicted prices on the test split for the random forest.
# x/y are keywords because seaborn 0.12+ rejects positional data vectors.
sns.scatterplot(x=y_test,y=y_pred)
# plt.show() renders the figure; plt.plot() only added an empty line to the
# axes and echoed "[]".
plt.show()

In [None]:
# Positions of the features ordered from least to most important.
idx=np.argsort(rf.feature_importances_)

In [None]:
# Bar chart of the forest's feature importances, ordered by idx
# (least important feature first).
ranked_importance=rf.feature_importances_[idx]
ranked_names=data.columns[:5][idx]
sns.barplot(x=ranked_importance,y=ranked_names)
plt.show()

# Stacking Regression

In [None]:
# First-level (base) learners for the stack: a linear model and a KNN regressor.
bmodel1=LinearRegression()
# NOTE(review): k=6 here while the standalone KNN model above uses k=4 —
# confirm whether the difference is intentional.
bmodel2=KNeighborsRegressor(n_neighbors=6)

In [None]:
# Second-level learner that combines the base models' predictions. Seeded
# (same seed as the train/test split) so the stacked model, and therefore its
# RMSE in the comparison table, is reproducible between runs.
metamodel=RandomForestRegressor(n_estimators=500,random_state=0)

In [None]:
# Stack the two base regressors under the random-forest meta-regressor.
# NOTE(review): mlxtend's StackingRegressor appears to train the meta-regressor
# on base-model predictions for the same rows the base models were fitted on,
# which can overfit; mlxtend's StackingCVRegressor uses out-of-fold predictions
# instead — verify against the mlxtend docs.
st=StackingRegressor(regressors=[bmodel1,bmodel2],meta_regressor=metamodel)

In [None]:
# Fit the base models and the meta-regressor on the training split.
st.fit(x_train, y_train)

In [None]:
# Stacked-model predictions for the held-out rows (overwrites the earlier y_pred).
y_pred=st.predict(x_test)

In [None]:
# Actual vs. predicted prices on the test split for the stacked model.
# x/y are keywords because seaborn 0.12+ rejects positional data vectors.
sns.scatterplot(x=y_test,y=y_pred)
# plt.show() renders the figure; plt.plot() only added an empty line to the
# axes and echoed "[]".
plt.show()

# Comparing performance

In [None]:
# Test-set RMSE of the linear model (arguments in the documented
# y_true, y_pred order; the value is the same either way).
y_pred=lm.predict(x_test)
lm_mse=mean_squared_error(y_test,y_pred)
lm_rmse=np.sqrt(lm_mse)

In [None]:
# Test-set RMSE of the KNN model (arguments in the documented
# y_true, y_pred order; the value is the same either way).
y_pred=knn.predict(x_test)
knn_mse=mean_squared_error(y_test,y_pred)
knn_rmse=np.sqrt(knn_mse)

In [None]:
# Test-set RMSE of the random forest (arguments in the documented
# y_true, y_pred order; the value is the same either way).
y_pred=rf.predict(x_test)
rf_mse=mean_squared_error(y_test,y_pred)
rf_rmse=np.sqrt(rf_mse)

In [None]:
# Test-set RMSE of the stacked model (arguments in the documented
# y_true, y_pred order; the value is the same either way).
y_pred=st.predict(x_test)
st_mse=mean_squared_error(y_test,y_pred)
st_rmse=np.sqrt(st_mse)

In [None]:
# Side-by-side test RMSE of the four models, one row per model.
rmse_by_model={
    "Linear Regression":lm_rmse,
    "KNN":knn_rmse,
    "Random Forest":rf_rmse,
    "Stacking":st_rmse,
}
pd.DataFrame({"Model":list(rmse_by_model.keys()),"RMSE":list(rmse_by_model.values())})