# Pattern Recognition: Linear Regression Exercises

In [1]:
print('These are the header titles')
!head -n 47 ../data_housing.names|tail -n 17

These are the header titles
    1. CRIM      per capita crime rate by town
    2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
    3. INDUS     proportion of non-retail business acres per town
    4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
    5. NOX       nitric oxides concentration (parts per 10 million)
    6. RM        average number of rooms per dwelling
    7. AGE       proportion of owner-occupied units built prior to 1940
    8. DIS       weighted distances to five Boston employment centres
    9. RAD       index of accessibility to radial highways
    10. TAX      full-value property-tax rate per $10,000
    11. PTRATIO  pupil-teacher ratio by town
    12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
    13. LSTAT    % lower status of the population
    14. MEDV     Median value of owner-occupied homes in $1000's

> Q1) Get the Housing Data Set form the repository [Names, Data] (it is different from the one used in the notebook!).

> Q2) Load the data (using numpy.loadtxt) and separate the last column (target value, MEDV). Compute the average of the target value and the MSE obtained using it as a constant prediction.

In [2]:
# Import the needed library
import numpy as np

# Get the data
data = np.loadtxt('../housing.data')

# Get the real data
real_data = np.array([d[:-1] for d in data])

# Get the last column values
target_value = np.array([d[-1] for d in data])

# Average of this values
target_value_avg = np.average(target_value)

# Array length of target_value
target_value_length = len(target_value)

# First define some handy shortcuts 
dot = np.dot
inv = np.linalg.inv

# Dummy features
X = np.ones((target_value_length, 1))

# theta = ((X.T * X)^-1) * X.T * y
theta = dot(dot(inv(dot(X.T, X)), X.T), target_value)

# MSE = (1/N)*sum((y-X*theta)^2)
mse = sum((target_value-dot(X, theta))**2) / target_value_length

print ('QUESTION 1:')
print ('- Data got and loaded')
print ('- Last column separated, Median value of owner-occupied homes in $1000\'s (MEDV)')
print ('- Avg of MEDV')
print ('- MSE of MEDV')
print ('')
print ('QUESTION 1:')
print ('-------------------')
print ('Number of target values:', target_value_length)
print ('Avg:', target_value_avg)
print ('Minimum Squared Error (MSE):', mse)

QUESTION 1:
- Data got and loaded
- Last column separated, Median value of owner-occupied homes in $1000's (MEDV)
- Avg of MEDV
- MSE of MEDV

QUESTION 1:
-------------------
Number of target values: 506
Avg: 22.532806324110677
Minimum Squared Error (MSE): 84.41955615616563


> Q3) Split the data in two parts (50%-50%) for training and testing (first half for training, second half for testing). Train a linear regressor model for each variable individually (plus a bias term) and compute the MSE on the training and the testing set. Which variable is the most informative? which one makes the model generalize better? and worse? Compute the coefficient of determination (R^2) for the test set.

In [3]:
training_data  = []
testing_data = []
real_data_length = len(real_data)
half_real_data_length = real_data_length/2

for idx, d in enumerate(real_data):
    if idx <= half_real_data_length:
        training_data.append(d)
    else:
        testing_data.append(d)

training_data = np.array(training_data)
testing_data = np.array(testing_data)

print('Training data\n')
print(training_data)

print('\nTesting data\n')
print(testing_data)

Training data

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [2.1409e-01 2.2000e+01 5.8600e+00 ... 1.9100e+01 3.7707e+02 3.5900e+00]
 [8.2210e-02 2.2000e+01 5.8600e+00 ... 1.9100e+01 3.8609e+02 3.5300e+00]
 [3.6894e-01 2.2000e+01 5.8600e+00 ... 1.9100e+01 3.9690e+02 3.5400e+00]]

Testing data

[[4.8190e-02 8.0000e+01 3.6400e+00 ... 1.6400e+01 3.9289e+02 6.5700e+00]
 [3.5480e-02 8.0000e+01 3.6400e+00 ... 1.6400e+01 3.9518e+02 9.2500e+00]
 [1.5380e-02 9.0000e+01 3.7500e+00 ... 1.5900e+01 3.8634e+02 3.1100e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]


<br /><br /><br /><br /><hr />
# So far, are the exercises fine?
<hr /><br /><br /><br /><br />

In [None]:
# Let's add a continuous variable, like the area of the house
X = np.hstack((X, data[:,2].reshape(target_value_length,1)))
print (X.shape)

In [None]:
from scipy.linalg import lstsq
theta = lstsq(X, target_value)[0]
print (sum((target_value-dot(X, theta))**2) / target_value_length)

In [None]:
# Let's add the number of garages as anoher variable
X = np.hstack((X, data[:,4].reshape(target_value_length,1)))
theta = lstsq(X, target_value)[0]
print (sum((target_value-dot(X, theta))**2) / target_value_length)

In [None]:
#To add a categorical variable, like "construction type" we have to decide how to represent it. Two options: 
# 1) Giving a value to every category choice (1, 2, 3, ...).
# 2) Using an indicator vector, with a zero for each dimension, except for the active that is one.
X1 = np.hstack((X, data[:,8].reshape(target_value_length, 1)))
ind = np.zeros((target_value_length, max(data[:,8]).astype(int)))
ind[range(0, target_value_length), (data[:,8]-1).astype(int)]=1
X2 = np.hstack((X, ind))
print ('Representation 1,2,3,4,...', '\n', X1)
print ('Representation [0,1,0,0]', '\n', X2)

In [None]:
theta = lstsq(X1, target_value)[0]
print ('Representation 1,2,3,...', sum((target_value-dot(X1, theta))**2) / target_value_length)
theta = lstsq(X2, target_value)[0]
print ('Representation [0,1,0,0]', sum((target_value-dot(X2, theta))**2) / target_value_length)
# Just one word of caution with indicator vectors: if we use more dimensions 
# than we should, we may end up with a low-rank matrix that we can't invert!

In [None]:
R = np.random.random((target_value_length, 30))
theta = lstsq(R, target_value)[0]
print ('Random', sum((target_value-dot(R, theta))**2) / target_value_length)

In [None]:
n = int(target_value_length / 2)
R = np.random.random((n, 500))
theta = lstsq(R, target_value[:n])[0]
print ('Random', sum((target_value[n:]-dot(R, theta))**2) / n)