In [1]:
import pandas as pd
housing = pd.read_csv('Housing.csv')
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [2]:
# converting Yes to 1 and No 0
housing['mainroad'] = housing['mainroad'].map({'yes':1, 'no':0})
housing['guestroom'] = housing['guestroom'].map({'yes':1, 'no':0})
housing['basement'] = housing['basement'].map({'yes':1, 'no':0})
housing['hotwaterheating'] = housing['hotwaterheating'].map({'yes':1, 'no':0})
housing['airconditioning'] = housing['airconditioning'].map({'yes':1, 'no':0})
housing['prefarea'] = housing['prefarea'].map({'yes':1, 'no':0})

In [3]:
# converting furnishing status column to binary column using get_dummies

status = pd.get_dummies(housing['furnishingstatus'], drop_first=True)
housing = pd.concat([housing,status],axis = 1)
housing.drop(['furnishingstatus'],axis=1,inplace=True)

In [4]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0,0


In [5]:
# Normalising the data 

housing = (housing - housing.mean())/housing.std()
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,4.562174,1.045766,1.402131,1.420507,1.376952,0.405251,-0.464888,-0.733865,-0.219063,1.471267,1.516299,1.803284,-0.844113,-0.69579
1,4.000809,1.755397,1.402131,5.400847,2.5297,0.405251,-0.464888,-0.733865,-0.219063,1.471267,2.67695,-0.553526,-0.844113,-0.69579
2,4.000809,2.216196,0.047235,1.420507,0.224204,0.405251,-0.464888,1.360148,-0.219063,-0.678439,1.516299,1.803284,1.182502,-0.69579
3,3.982096,1.08263,1.402131,1.420507,0.224204,0.405251,-0.464888,1.360148,-0.219063,1.471267,2.67695,1.803284,-0.844113,-0.69579
4,3.551716,1.045766,1.402131,-0.569663,0.224204,0.405251,2.14711,1.360148,-0.219063,1.471267,1.516299,-0.553526,-0.844113,-0.69579


In [6]:
# multi linear regression impl
X = housing[['area','bedrooms']]

# Assign response variable to Y
y = housing['price']

In [7]:
# Add a columns of ls as an intercept to X
# The intercept column is needed for convinent matrix representation of colums
X['intercept']=1
X = X.reindex_axis(['intercept','area','bedrooms'],axis=1)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Unnamed: 0,intercept,area,bedrooms
0,1,1.045766,1.402131
1,1,1.755397,1.402131
2,1,2.216196,0.047235
3,1,1.08263,1.402131
4,1,1.045766,1.402131


In [8]:
# Apply gradient descent function from scratch we need our X and Y variables
# as numpy arrays.
import numpy as np
X = np.array(X)
y = np.array(y)

In [9]:
# Theta is the vector representation coefficients(intercept,area,bedrooms)
theta = np.matrix(np.array([0,0,0]))
alpha = 0.01
iterations = 1000

In [10]:
# define cost function
# takes in theta (current values of coefficients b0,b1,b2), x and y
# returns total cost at current b0,b1,b2
# need undestanding of defferntial calculus

# Learn numpy matmul

def compute_cost(X,y,theta):
    return np.sum(np.square(np.matmul(X,theta)-y))/(2*len(y))

In [11]:
def gradient_descent_multi(X, y, theta,alpha, iterations):
    theta = np.zeros(X.shape[1])
    m = len(X)
    gdm_df = pd.DataFrame( columns = ['Bets','cost'])
    for i in range(iterations):
        gradient = (1/m) * np.matmul(X.T,np.matmul(X,theta)-y)
        cost = compute_cost(X,y, theta)
        gdm_df.loc[i] = [theta,cost]
    return gdm_df
    

In [12]:
# print cost with various values of coefficients b0,b1,b2
gradient_descent_multi(X,y, theta, alpha, iterations)

Unnamed: 0,Bets,cost
0,"[0.0, 0.0, 0.0]",0.499083
1,"[0.0, 0.0, 0.0]",0.499083
2,"[0.0, 0.0, 0.0]",0.499083
3,"[0.0, 0.0, 0.0]",0.499083
4,"[0.0, 0.0, 0.0]",0.499083
5,"[0.0, 0.0, 0.0]",0.499083
6,"[0.0, 0.0, 0.0]",0.499083
7,"[0.0, 0.0, 0.0]",0.499083
8,"[0.0, 0.0, 0.0]",0.499083
9,"[0.0, 0.0, 0.0]",0.499083
