# Applying various regression algorithm on housing datasets

## Import all necessary packages

In [1]:


# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
import pandas as pd
# To plot pretty figures
%matplotlib inline
import time
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12


## Fetching data from disk

In [20]:
data=pd.read_csv('../../datasets/Assignment_data/Data_Q1/housing.csv')


## Preprocessing on  datasets

### Displaying Data

In [21]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Replacing null value by mean

In [22]:
#cleaning

columns=['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households'
         ,'median_income','median_house_value','ocean_proximity']

for i in columns:
    if(np.where(data.isnull()[i]==True)[0].shape!=(0,)):
        data[i]=data[i].fillna(data[i].mean())
data['intercept']=1

### Convert category column into one hot vectors

In [23]:
#Category

data['ocean_proximity']=data['ocean_proximity'].astype('category').cat.codes
data=pd.get_dummies(data,columns=['ocean_proximity'])
for i in range(5):
    s='ocean_proximity_'+str(i)
    data[s]=data[s].astype('float64')
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,intercept,ocean_proximity_0,ocean_proximity_1,ocean_proximity_2,ocean_proximity_3,ocean_proximity_4
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,1,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,1,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,1,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,1,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,1,0.0,0.0,0.0,1.0,0.0


### Split data into train and test 

In [24]:
#train-test split

train=data.sample(frac=0.8,random_state=200)
test=data.drop(train.index)

trainy=train['median_house_value'].as_matrix()
trainy=trainy.reshape(trainy.shape[0],1)
trainx=train.drop(['median_house_value'],axis=1).as_matrix()

testy=test['median_house_value'].as_matrix()
testx=test.drop(['median_house_value'],axis=1).as_matrix()

## Regression algorithm by various method

### Closed form

In [26]:
#closed form
a=time.clock()
W=np.dot(np.linalg.inv(np.dot(X.T,X)),np.dot(X.T,Y))
b=time.clock()
pred=np.dot(X,W)
print("train=",np.square(pred-Y).sum())
predtest=np.dot(testx,W)
print("test=",np.square(predtest-testy).sum())

print("training Time=",b-a)

train= 1.57415829209e+16
test= 1.52516183813e+19
training Time= 0.06035785454514553


Using general closed from equation (X^T.X)^-1.X^T.Y

### Gradient descent

In [31]:
#gradient descent
W=np.random.rand(X.shape[1],Y.shape[1])
alpha=0.5
n=X.shape[0]
reg=0.1
a=time.clock()
for i in range(10):
    grad=2*(np.dot(X.T,np.dot(X,W)-Y))/n
    W-=alpha*grad

b=time.clock()
pred=np.dot(X,W)

print("train=",np.square(pred-Y).sum())
predtest=np.dot(testx,W)
print("test=",np.square(predtest-testy).sum())
print("training Time=",b-a)

train= 3.02272568048e+158
test= 3.16741406167e+161
training Time= 0.002078392299154075


Gradient descent is iteratively used to update weight for best result.Equation used is 2/n*(X^T*w-y)

### Newtons Method

In [38]:
#Hessian

W=np.random.rand(X.shape[1],Y.shape[1])
alpha=0.5
n=X.shape[0]
a=time.clock()
H=np.linalg.inv(np.dot(X.T,X))
print(H.shape)
grad=np.dot(X.T,np.dot(X,W)-Y)/n
W=W-np.dot(H,grad)
b=time.clock()
pred=np.dot(X,W)
print("train=",np.square(pred-Y).sum())
predtest=np.dot(testx,W)
print("test=",np.square(predtest-testy).sum())
print("training Time=",b-a)

(14, 14)
train= 9.10382273855e+14
test= 9.32637413611e+17
training Time= 0.0014957318944652798


Newton Method is being applied calculating the hasseien matrix and then computing gradient and training time.

### Ridge Method

In [46]:
#Ridge

alpha=[0.1,0.5,1,0.01]
for i in alpha:
    print("alpha=",i)
    a=time.clock()

    W=np.dot(np.linalg.inv(np.dot(X.T,X)+i*np.identity(14)),np.dot(X.T,Y))
    b=time.clock()
    pred=np.dot(X,W)
    print("train=",np.square(pred-Y).sum())
    predtest=np.dot(testx,W)

    print("test=",np.square(predtest-testy).sum())

    print("training Time=",b-a)
    print("")

alpha= 0.1
train= 7.89484435137e+13
test= 3.64859034523e+17
training Time= 0.0007437210042553488

alpha= 0.5
train= 7.93937428289e+13
test= 3.63841079002e+17
training Time= 0.0005984506606182549

alpha= 1
train= 7.98332569699e+13
test= 3.63463250036e+17
training Time= 0.0006671382689091843

alpha= 0.01
train= 7.89000399487e+13
test= 3.65398696861e+17
training Time= 0.0007062191507429816



### Lasso Method

In [43]:
#Lasso

from sklearn import linear_model
alpha=[0.1,0.5,1,0.01]
for i in alpha:
    print("alpha=",i)
    clf=linear_model.Lasso(alpha=i)
    a=time.clock()
    clf.fit(X,Y)
    b=time.clock()
    pred=clf.predict(X)
    print("train=",np.square(pred-Y).sum())
    predtest=clf.predict(testx)
    print("test=",np.square(predtest-testy).sum())
    print("training Time=",b-a)
    print("")

alpha= 0.1




train= 6.00764607905e+18
test= 1.85532912672e+13
training Time= 0.16465761990366445

alpha= 0.5
train= 6.00758315837e+18
test= 1.85537269995e+13
training Time= 0.19145802476487006

alpha= 1
train= 6.00750501873e+18
test= 1.85542794166e+13
training Time= 0.19038862975321535

alpha= 0.01
train= 6.0076602863e+18
test= 1.85531939871e+13
training Time= 0.16646797262728796



Ridge and lasso help in solving the problem of overfitting.Ridge has been solved explicitly without using libraries while lasso is solved using library of scikit learn.