# House Price Prediction  

### 1. Create Linear Regression Model for predicting the HousePrice

In [1]:
# Importing all the required packages 

import time
import pandas as pd
import requests
import re
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
pd.pandas.set_option('display.max_columns', None)

In [2]:
housedf=pd.read_csv("HousePriceDetails_Cleaned.csv")
housedf.head()

Unnamed: 0,Price,Bed,Bath,FloorSize,City,ZipCode,LotSize,AgentInfo,PropertyType,CLIndex,popDensity,MAge,MIncome,BPovertyLine,MRent,CrimeIndex,SchoolCount,Restaurants,Groceries,Age,AgeCategory,HOAfees_0
0,1250000.0,4.0,3.0,2209.0,SanJose,95136,6141.0,ML81840754,Single Family Home,149.7,10251.0,36.7,117191.0,7.3,2376.0,277.0,3,51.0,10.0,45.0,MIDDLE AGED,1
1,985000.0,4.0,2.0,1883.0,SanJose,95123,7130.0,ML81841879,Single Family Home,150.0,8331.0,37.8,122947.0,4.9,2608.0,277.0,4,63.0,9.0,53.0,MIDDLE AGED,1
2,925000.0,4.0,2.0,1154.0,SanJose,95127,6838.0,ML81842519,Single Family Home,148.3,5151.0,36.1,103198.0,6.8,2075.0,277.0,1,57.0,12.0,67.0,MIDDLE AGED,1
3,979000.0,4.0,3.0,1743.0,SanJose,95111,6520.0,ML81842454,Single Family Home,148.2,11536.0,35.2,76430.0,11.0,1631.0,277.0,4,46.0,9.0,54.0,MIDDLE AGED,1
4,1148880.0,5.0,2.0,1715.0,SanJose,95136,8800.0,40947217,ResidentialProperty Subtype,149.7,10251.0,36.7,117191.0,7.3,2376.0,277.0,3,41.0,9.0,52.0,MIDDLE AGED,0


In [3]:
#Dropping columns which is not helpful in predicting the House Price
housedf.drop(['ZipCode','City','AgentInfo','PropertyType','AgeCategory'],axis=1,inplace=True)
## Check if any column contain 0 as numerical values
zeroValuedColumns=[]
for column in list(housedf.columns):
    if 0 in housedf[column].unique():
        zeroValuedColumns.append(column)
zeroValuedColumns      

['LotSize', 'CLIndex', 'Age', 'HOAfees_0']

In [4]:
#Age and HOAfees_0 can have 0 as values. SO Handling other two columns

housedf.loc[housedf['LotSize']==0,'LotSize']=housedf['LotSize'].mean()
housedf.loc[housedf['CLIndex']==0,'CLIndex']=housedf['CLIndex'].mean()
housedf.loc[housedf['Age']==0,'Age']=0.5


##### Data Preprocessing before training the  Model

In [5]:
#Defining the features and dependent variable
cat_cols=['HOAfees_0']
target_val=['Price']
numerical_cols=list(set(housedf.columns)-set(cat_cols))
numerical_cols,cat_cols,target_val
features=numerical_cols+cat_cols
features.remove('Price')


In [9]:
housedf[numerical_cols]

Unnamed: 0,popDensity,Price,CLIndex,LotSize,SchoolCount,Bath,FloorSize,MIncome,Age,Groceries,BPovertyLine,MRent,CrimeIndex,MAge,Bed,Restaurants
0,10251.0,1250000.0,149.7,6141.0,3,3.0,2209.0,117191.0,45.0,10.0,7.3,2376.0,277.0,36.7,4.0,51.0
1,8331.0,985000.0,150.0,7130.0,4,2.0,1883.0,122947.0,53.0,9.0,4.9,2608.0,277.0,37.8,4.0,63.0
2,5151.0,925000.0,148.3,6838.0,1,2.0,1154.0,103198.0,67.0,12.0,6.8,2075.0,277.0,36.1,4.0,57.0
3,11536.0,979000.0,148.2,6520.0,4,3.0,1743.0,76430.0,54.0,9.0,11.0,1631.0,277.0,35.2,4.0,46.0
4,10251.0,1148880.0,149.7,8800.0,3,2.0,1715.0,117191.0,52.0,9.0,7.3,2376.0,277.0,36.7,5.0,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,394.0,375000.0,156.1,2748.0,3,2.0,1651.0,143001.0,48.0,14.0,5.1,1950.0,118.4,46.7,3.0,48.0
457,394.0,500000.0,156.1,20037.6,3,2.0,1651.0,143001.0,48.0,2.0,5.1,1950.0,118.4,46.7,3.0,9.0
458,394.0,425000.0,156.1,6134.0,3,2.0,1440.0,143001.0,45.0,1.0,5.1,1950.0,118.4,46.7,2.0,5.0
459,394.0,225000.0,156.1,2500.0,3,2.0,1651.0,143001.0,48.0,15.0,5.1,1950.0,118.4,46.7,3.0,47.0


In [326]:
#Log Transformation of the numerical variables
for feature in numerical_cols:
    housedf[feature]=np.log(housedf[feature])
housedf.head()


Unnamed: 0,Price,Bed,Bath,FloorSize,LotSize,CLIndex,popDensity,MAge,MIncome,BPovertyLine,MRent,CrimeIndex,SchoolCount,Restaurants,Groceries,Age,HOAfees_0
0,14.038654,1.386294,1.098612,7.700295,8.722743,5.008633,9.235131,3.602777,11.67156,1.987874,7.773174,5.624018,1.098612,3.931826,2.302585,3.806662,1
1,13.800397,1.386294,0.693147,7.540622,8.872067,5.010635,9.027739,3.632309,11.719509,1.589235,7.866339,5.624018,1.386294,4.143135,2.197225,3.970292,1
2,13.737549,1.386294,0.693147,7.050989,8.830251,4.999237,8.546946,3.586293,11.544405,1.916923,7.637716,5.624018,0.0,4.043051,2.484907,4.204693,1
3,13.794287,1.386294,1.098612,7.463363,8.78263,4.998563,9.353228,3.561046,11.244131,2.397895,7.396949,5.624018,1.386294,3.828641,2.197225,3.988984,1
4,13.954298,1.609438,0.693147,7.447168,9.082507,5.008633,9.235131,3.602777,11.67156,1.987874,7.773174,5.624018,1.098612,3.713572,2.197225,3.951244,0


##### Dividing the dataset into Training and Test

In [327]:
#Dividing the dataset into Training and Testing set.We are selecting some important features which can effect the price based on the
#analysis we did while exploring the relationship between different features and House Price

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(housedf[features],housedf['Price'],test_size=0.1,random_state=0)

In [328]:
X_train.head()

Unnamed: 0,Restaurants,FloorSize,MAge,MIncome,CLIndex,popDensity,Bath,CrimeIndex,BPovertyLine,SchoolCount,MRent,Bed,LotSize,Groceries,Age,HOAfees_0
407,4.997212,7.390181,3.673766,11.668663,4.989086,8.694837,0.693147,5.338499,1.791759,1.098612,7.698936,1.386294,8.612503,2.833213,4.26268,0
54,3.401197,7.864804,3.632309,11.94526,4.984976,8.116716,1.386294,4.94805,1.029619,1.098612,7.880426,1.609438,8.752898,2.079442,1.098612,0
76,4.094345,7.020191,3.691376,11.466662,4.969119,7.336937,0.693147,5.60175,1.757858,1.098612,7.586296,1.098612,8.544808,2.197225,4.189655,1
100,0.0,8.379309,3.850148,12.043583,4.992471,5.817111,1.386294,5.60175,0.875469,1.609438,8.062748,1.609438,10.689465,0.0,2.944439,0
344,1.386294,8.2938,3.799974,12.035071,4.984976,7.768956,1.609438,4.863681,1.458615,1.098612,7.767264,1.609438,10.32522,2.484907,3.044522,0


In [329]:
# fit scaler on training data
#Not scaling the Dependent Variable
feature_scale=[feature for feature in features if feature not in ['HOAfees_0']]
norm = MinMaxScaler().fit(X_train[feature_scale])


In [330]:
# transform the train and test set, and add on the Id and SalePrice variables
X_train = pd.concat([X_train[['HOAfees_0']].reset_index(drop=True),
                    pd.DataFrame(norm.transform(X_train[feature_scale]), columns=feature_scale)],
                    axis=1)
X_test= pd.concat([X_test[['HOAfees_0']].reset_index(drop=True),
                    pd.DataFrame(norm.transform(X_test[feature_scale]), columns=feature_scale)],
                    axis=1)



#### Feature Selection

In [332]:
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [333]:
feature_sel_model.get_support()

#print the number of total and selected features
selected_feat = X_train.columns[(feature_sel_model.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
selected_feat


total features: 16
selected features: 7


Index(['HOAfees_0', 'FloorSize', 'MIncome', 'CLIndex', 'popDensity', 'MRent',
       'Bed'],
      dtype='object')

##### Training the model  with selected features

In [334]:

from sklearn.linear_model import LinearRegression
X_train=X_train[selected_feat]
X_test=X_test[selected_feat]

std_reg = LinearRegression()
std_reg.fit(X_train, y_train)

# Create Y Predictions
y_predict = std_reg.predict(X_test)

rmse = math.sqrt(mean_squared_error(y_test, y_predict))

In [335]:
mlr_coefficient = std_reg.coef_
mlr_intercept = std_reg.intercept_
print("Coefficients are",mlr_coefficient)
print("In the order",selected_feat)

Coefficients are [0.06448846 3.14548313 0.39429342 0.93882157 0.85277486 0.44121933
 0.672787  ]
In the order Index(['HOAfees_0', 'FloorSize', 'MIncome', 'CLIndex', 'popDensity', 'MRent',
       'Bed'],
      dtype='object')


###### Observations:
##### The coefficients for all the predictors are positive which conveys that increase in one unit of Value of those features will lead to increase in Average Cost of House.
##### For example With increase in 1 units of Floor Size the AVearge price is expected to increase by 3 units.

In [336]:
y_test_orig=np.exp(y_test)
y_predict_orig=np.exp(y_predict)
np.set_printoptions(suppress=True)
pd.DataFrame(np.c_[y_test_orig,y_predict_orig],columns=["Original","Predicted"]).head()


Unnamed: 0,Original,Predicted
0,2145000.0,2056529.0
1,799000.0,1001730.0
2,1049000.0,943859.8
3,799000.0,936111.9
4,999888.0,1080371.0
