In [1]:
#importing the necessary libraries
import numpy as np
import pandas as pda
from sklearn.model_selection import train_test_split
from statistics import mean

In [2]:
#reading the CSV file with pandas
#(the path is relative, so the file is looked up in the notebook's working directory)
housingData = pda.read_csv('Housing_price_data_set.csv')

In [3]:
housingData.columns

Index(['Unnamed: 0', 'price', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'garagepl',
       'prefarea'],
      dtype='object')

In [4]:
#dropping the column 'Unnamed: 0' (presumably a leftover saved index - it is not a feature)
#reassigning instead of using inplace=True, and ignoring an already-missing column,
#keeps this cell safe to re-run without raising a KeyError
housingData = housingData.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
housingData

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,1,yes,no,no,no,no,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,yes,yes,no,no,yes,0,no
542,94000.0,6000,3,2,4,yes,no,no,no,yes,0,no
543,103000.0,6000,3,2,4,yes,yes,no,no,yes,1,no
544,105000.0,6000,3,2,2,yes,yes,no,no,yes,1,no


In [6]:
#this function changes yes to 1 and no to 0
def yesToOneNoToZero(data,cols):
    """Encode 'yes'/'no' strings as 1/0 in the listed columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is updated in place; nothing is returned.
    cols : iterable of str
        Names of the columns whose 'yes'/'no' values should be encoded.
        Any other value found in these columns is left unchanged.
    """
    mapping = {'yes': 1, 'no': 0}
    for col in cols:
        # Assign the result back to the column instead of calling
        # data[col].replace(..., inplace=True): that chained in-place call
        # operates on a temporary Series, which triggers a FutureWarning in
        # recent pandas and leaves ``data`` untouched under Copy-on-Write.
        # infer_objects() gives a numeric dtype once only 0/1 remain.
        data[col] = data[col].replace(mapping).infer_objects()

In [7]:
#encoding the six yes/no columns of housingData as 1/0 (housingData itself is modified)
yesToOneNoToZero(housingData,['driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'prefarea'])

In [8]:
housingData

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,2,1,0,1,0,0,1,0
1,38500.0,4000,2,1,1,1,0,0,0,0,0,0
2,49500.0,3060,3,1,1,1,0,0,0,0,0,0
3,60500.0,6650,3,1,2,1,1,0,0,0,0,0
4,61000.0,6360,2,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
541,91500.0,4800,3,2,4,1,1,0,0,1,0,0
542,94000.0,6000,3,2,4,1,0,0,0,1,0,0
543,103000.0,6000,3,2,4,1,1,0,0,1,1,0
544,105000.0,6000,3,2,2,1,1,0,0,1,1,0


In [9]:
#dividing the data into 70% training and 30% testing
#(random_state is fixed so the same split is produced on every run)
trainingSet,testingSet = train_test_split(housingData,train_size = 0.7,random_state = 10)

In [10]:
trainingSet

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
175,57500.0,3630,3,2,2,1,0,0,1,0,2,0
25,42300.0,3000,2,1,2,1,0,0,0,0,0,0
403,80000.0,6360,3,1,3,1,0,0,0,0,0,1
529,108000.0,6000,3,2,3,1,0,0,0,1,0,0
514,58900.0,6060,2,1,1,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
369,79000.0,6100,3,2,1,1,0,1,0,0,2,1
320,83900.0,5500,3,1,3,1,1,0,0,1,1,0
527,106000.0,6325,3,1,4,1,0,0,0,1,1,0
125,95000.0,4260,4,2,2,1,0,0,1,0,0,0


In [11]:
testingSet

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
435,75000.0,5300,4,2,1,1,0,0,0,1,0,1
315,78000.0,6210,4,1,4,1,1,0,0,1,0,0
385,78900.0,6900,3,1,1,1,1,1,0,0,0,1
266,52500.0,5640,2,1,1,0,0,0,0,0,0,0
43,92000.0,8580,5,3,2,1,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
26,43500.0,3800,2,1,1,1,0,0,0,0,0,0
212,47900.0,2700,3,1,1,0,0,0,0,0,0,0
508,87000.0,8372,3,1,3,1,0,0,0,1,2,0
227,48000.0,3185,2,1,1,1,0,1,0,0,2,0


In [12]:
#selecting the feature columns by name rather than by position (columns[1:4]), so the
#selection does not depend on column order or on 'Unnamed: 0' having been dropped first;
#a positional slice would silently pick up 'price' as a feature if that drop were skipped
featureColumns = ['lotsize', 'bedrooms', 'bathrms']

#features (X) and target price (y) for the training split
trainingX = trainingSet[featureColumns].copy()
trainingY = trainingSet['price'].copy()

#features (X) and target price (y) for the testing split
testingX = testingSet[featureColumns].copy()
testingY = testingSet['price'].copy()

In [13]:
trainingX

Unnamed: 0,lotsize,bedrooms,bathrms
175,3630,3,2
25,3000,2,1
403,6360,3,1
529,6000,3,2
514,6060,2,1
...,...,...,...
369,6100,3,2
320,5500,3,1
527,6325,3,1
125,4260,4,2


In [14]:
trainingY

175     57500.0
25      42300.0
403     80000.0
529    108000.0
514     58900.0
         ...   
369     79000.0
320     83900.0
527    106000.0
125     95000.0
265     51000.0
Name: price, Length: 382, dtype: float64

In [15]:
#NORMAL EQUATION WITHOUT REGULARIZATION

In [16]:
var = np.dot(trainingX.T, trainingX) #X^T X: transposed training features times training features

In [17]:
#(X^T X)^-1; np.linalg.inv raises LinAlgError if the matrix is singular
varInverse = np.linalg.inv(var) #taking inverse of var

In [18]:
temp = np.dot(trainingX.T, trainingY) #X^T y: transposed training features times training prices

In [19]:
w = varInverse.dot(temp) #normal-equation solution: w = (X^T X)^-1 X^T y

In [20]:
w

array([4.60158788e+00, 6.06362592e+03, 2.02446688e+04])

In [21]:
#this gives us the list of parameters w0, w1 & w2, i.e. the weights for lotsize, bedrooms and bathrms

In [22]:
#testing: manual spot check of the learned weights on a single sample
#(lotsize = 5300, bedrooms = 4, bathrms = 2 with price 75000 matches row 435 of testingSet)
ansPred = w[0]*5300 + w[1]*4 + w[2]*2
print(ansPred)
print('75000')#actual value of price
print(abs(ansPred-75000)) #absolute error of the prediction for this sample

89132.25709576887
75000
14132.257095768873


In [23]:
#contribution of the lotsize feature (w0*x0) for every test sample
listw0 = [w[0]*lotsize for lotsize in testingX['lotsize']]

In [24]:
#contribution of the bedrooms feature (w1*x1) for every test sample
listw1 = [w[1]*bedrooms for bedrooms in testingX['bedrooms']]

In [25]:
#contribution of the bathrms feature (w2*x2) for every test sample
listw2 = [w[2]*bathrms for bathrms in testingX['bathrms']]

In [26]:
#collecting the actual prices of the test samples in a plain list
actualY = list(testingY)

In [27]:
#building [actual, predicted] pairs for every test sample, where
#predicted = w0*x0 + w1*x1 + w2*x2 and the actual price is truncated to an int
predictionList = [
    [int(actualY[idx]), listw0[idx]+listw1[idx]+listw2[idx]]
    for idx in range(len(actualY))
]

In [28]:
#per-sample accuracy in percent => (1-abs(predictedY - actualY)/actualY)*100
meanList = [
    (1-(abs(predicted-actual)/actual))*100
    for actual, predicted in predictionList
]

In [29]:
# calculating accuracy of our predictor: the mean of the per-sample accuracy percentages
# NOTE: despite its name, error1 holds an accuracy percentage (higher is better), not an error
error1 = mean(meanList)
error1

76.79825707950803

In [30]:
#Our predictor is 76.79% accurate

In [31]:
#NORMAL EQUATION WITH REGULARIZATION

In [32]:
#penalty matrix for the regularized normal equation: a 3x3 identity (one row per feature)
#whose first diagonal entry is zeroed, so the first weight (w0) is not penalised
#NOTE(review): w0 multiplies lotsize here (the features contain no intercept column) -
#confirm that exempting it from regularization is intended
identityMatrix = np.eye(3)
identityMatrix[0, 0] = 0


In [33]:
#printing identity matrix
identityMatrix

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [34]:
#regularization strength: scales the penalty matrix that is added to X^T X below
hyperParameter = 2000 #declaring hyper parameter (lambda) as 2000

In [35]:
var = np.dot(trainingX.T, trainingX) #X^T X: transposed training features times training features

In [36]:
#regularized matrix: X^T X + lambda * identityMatrix
varNew = var + hyperParameter*identityMatrix #adding lambda*identityMatrix to var

In [37]:
#(X^T X + lambda * identityMatrix)^-1; np.linalg.inv raises LinAlgError if the matrix is singular
varInverse = np.linalg.inv(varNew) #taking inverse of varNew

In [38]:
temp = np.dot(trainingX.T, trainingY) #X^T y: transposed training features times training prices

In [39]:
w = varInverse.dot(temp) #regularized solution: w = (X^T X + lambda*identityMatrix)^-1 X^T y

In [40]:
w

array([  10.06615462, 2894.99679357, 1767.30881103])

In [41]:
#This will give the values of w0, w1 & w2, i.e. the regularized weights for lotsize, bedrooms and bathrms

In [42]:
#testing: manual spot check of the regularized weights on a single sample
#(lotsize = 5300, bedrooms = 4, bathrms = 2 with price 75000 matches row 435 of testingSet)
ansPred = w[0]*5300 + w[1]*4 + w[2]*2
print(ansPred)
print('75000')#actual value of price
print(abs(ansPred-75000)) #absolute error of the prediction for this sample

68465.22429903378
75000
6534.775700966216


In [43]:
#contribution of the lotsize feature (w0*x0) for every test sample
listw0 = [w[0]*lotsize for lotsize in testingX['lotsize']]

In [44]:
#contribution of the bedrooms feature (w1*x1) for every test sample
listw1 = [w[1]*bedrooms for bedrooms in testingX['bedrooms']]

In [45]:
#contribution of the bathrms feature (w2*x2) for every test sample
listw2 = [w[2]*bathrms for bathrms in testingX['bathrms']]

In [46]:
#collecting the actual prices of the test samples in a plain list
actualY = list(testingY)

In [47]:
#building [actual, predicted] pairs for every test sample, where
#predicted = w0*x0 + w1*x1 + w2*x2 and the actual price is truncated to an int
predictionList = [
    [int(actualY[idx]), listw0[idx]+listw1[idx]+listw2[idx]]
    for idx in range(len(actualY))
]

In [48]:
#per-sample accuracy in percent => (1-abs(predictedY - actualY)/actualY)*100
meanList = [
    (1-(abs(predicted-actual)/actual))*100
    for actual, predicted in predictionList
]

In [49]:
# calculating accuracy of our predictor: the mean of the per-sample accuracy percentages
# NOTE: despite its name, error2 holds an accuracy percentage (higher is better), not an error
error2 = mean(meanList)
error2

77.32767202548783

In [50]:
#Our predictor is 77.32 % accurate

In [51]:

#Result: "Accuracy" here is the mean over the test samples of (1 - abs(predicted - actual)/actual)*100
#        The Accuracy of NORMAL EQUATION WITHOUT REGULARIZATION IS 76.79 %
#        The Accuracy of NORMAL EQUATION WITH REGULARIZATION IS 77.32 %
