In [1]:
import numpy as np
data = np.array([map(float, l.split()) for l in open('data.txt')])
y = data[:,-1]
print 'Data points', len(y)
print y.T

Data points 28
[ 25.9  29.5  27.9  25.9  29.9  29.9  30.9  28.9  84.9  82.9  35.9  31.5
  31.   30.9  30.   28.9  36.9  41.9  40.5  43.9  37.5  37.9  44.5  37.9
  38.9  36.9  45.8  41. ]


In [3]:
# First define some handy shortcuts 
dot = np.dot
inv = np.linalg.inv
# Dummy features
X = np.ones((28,1))
# Fitting the parameters: theta = (X'*X)^-1*X'*y
theta = dot(dot(inv(dot(X.T, X)), X.T), y)
print theta

[ 38.15714286]


In [4]:
# MSE = (1/N)*sum((y-X*theta)^2) 
print sum((y-dot(X, theta))**2) / len(y)

193.256734694


In [2]:
# Shell escape: print lines 22-33 of x26.txt (the first 33 lines, then the
# last 12 of those), which hold the column descriptions A1..A11 and B.
!head -n 33 x26.txt|tail -n 12

#      A1,  the local selling prices, in hundreds of dollars;
#      A2,  the number of bathrooms;
#      A3,  the area of the site in thousands of square feet;
#      A4,  the size of the living space in thousands of square feet;
#      A5,  the number of garages;
#      A6,  the number of rooms;
#      A7,  the number of bedrooms;
#      A8,  the age in years;
#      A9,  1 = brick, 2 = brick/wood, 3 = aluminum/wood, 4 = wood.
#      A10, 1 = two story, 2 = split level, 3 = ranch
#      A11, number of fire places.
#      B,   the selling price.


In [5]:
# Let's add a continuous variable, like the area of the house
X = np.hstack((X, data[:,2].reshape(len(y),1)))
print X.shape

(28, 2)


In [6]:
from scipy.linalg import lstsq
theta = lstsq(X,y)[0]
print sum((y-dot(X, theta))**2) / len(y)

107.387964132


In [7]:
# Let's add the number of garages as anoher variable
X = np.hstack((X, data[:,4].reshape(len(y),1)))
theta = lstsq(X,y)[0]
print sum((y-dot(X, theta))**2) / len(y)

83.6998330565


In [8]:
#To add a categorical variable, like "construction type" we have to decide how to represent it. Two options: 
# 1) Giving a value to every category choice (1, 2, 3, ...).
# 2) Using an indicator vector, with a zero for each dimension, except for the active that is one.
X1 = np.hstack((X, data[:,8].reshape(len(y),1)))
ind = np.zeros((len(y), max(data[:,8]).astype(int)))
ind[range(0,len(y)), (data[:,8]-1).astype(int)]=1
X2 = np.hstack((X, ind))
print 'Representation 1,2,3,4,...', '\n', X1
print 'Representation [0,1,0,0]', '\n', X2

Representation 1,2,3,4,... 
[[  1.       3.472    1.       3.    ]
 [  1.       3.531    2.       1.    ]
 [  1.       2.275    1.       2.    ]
 [  1.       4.05     1.       4.    ]
 [  1.       4.455    1.       3.    ]
 [  1.       4.455    1.       2.    ]
 [  1.       5.85     1.       2.    ]
 [  1.       9.52     0.       1.    ]
 [  1.       9.8      2.       2.    ]
 [  1.      12.8      2.       4.    ]
 [  1.       6.435    2.       1.    ]
 [  1.       4.9883   1.       1.    ]
 [  1.       5.52     1.       1.    ]
 [  1.       6.666    2.       2.    ]
 [  1.       5.       0.       4.    ]
 [  1.       9.52     0.       1.    ]
 [  1.       5.15     2.       4.    ]
 [  1.       6.902    1.5      1.    ]
 [  1.       7.102    1.       2.    ]
 [  1.       7.8      1.5      3.    ]
 [  1.       5.52     2.       4.    ]
 [  1.       4.       1.       1.    ]
 [  1.       9.89     2.       1.    ]
 [  1.       6.7265   1.       4.    ]
 [  1.       9.15     2.       1.   

In [9]:
theta = lstsq(X1,y)[0]
print 'Representation 1,2,3,...', sum((y-dot(X1, theta))**2) / len(y)
theta = lstsq(X2,y)[0]
print 'Representation [0,1,0,0]', sum((y-dot(X2, theta))**2) / len(y)
# Just one word of caution with indicator vectors: if we use more dimensions 
# than we should, we may end up with a low-rank matrix that we can't invert!

Representation 1,2,3,... 73.5050669967
Representation [0,1,0,0] 66.4356761119


In [None]:
# Marker cell: only prints a reminder message; it computes nothing.
print 'Back to the slides...'

In [12]:
R = np.random.random((len(y), 30))
theta = lstsq(R,y)[0]
print 'Random', sum((y-dot(R, theta))**2) / len(y)

Random 2.54193522545e-27


In [15]:
n = len(y) / 2
R = np.random.random((n, 500))
theta = lstsq(R,y[:n])[0]
print 'Random', sum((y[n:]-dot(R, theta))**2) / n

Random 343.104285714
