In this part, you will implement linear regression with multiple variables to predict the prices of houses. Suppose you are selling your house and you want to know what a good market price would be. One way to do this is to first collect information on recent houses sold and make a model of housing prices.

The file ex1data2.txt contains a training set of housing prices in Portland, Oregon. The first column is the size of the house (in square feet), the second column is the number of bedrooms, and the third column is the price of the house.

In [55]:
# 2.1 Feature Normalization
from statistics import mean
import numpy as np

# Read the Data
# Each row of ex1data2.txt is "size,bedrooms,price" (all integers).
# The context manager closes the file handle even if parsing fails below.
with open("ex1data2.txt", "r") as dataset:
    data = dataset.readlines()

size = []
bedrooms = []
price = []
for line in data:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of crashing on int('').
        continue
    split = line.split(',')

    size.append(int(split[0]))
    bedrooms.append(int(split[1]))
    price.append(int(split[2]))

# Normalize Features (z-score: subtract the mean, then divide by the std)
mean_s = mean(size)
mean_b = mean(bedrooms)

size = [s - mean_s for s in size]           # Subtract Mean
bedrooms = [b - mean_b for b in bedrooms]

s_array = np.array(size)
b_array = np.array(bedrooms)

std_s = np.std(s_array)                     # Divide by STD
std_b = np.std(b_array)

# The division results must be assigned back: a bare `s_array / std_s`
# expression is evaluated and then discarded, which would leave the arrays
# mean-centred but unscaled for the regression cell that follows.
s_array = s_array / std_s
b_array = b_array / std_b

# Last expression of the cell: displays the normalized bedroom feature.
b_array

array([-0.22609337, -0.22609337, -0.22609337, -1.5543919 ,  1.10220517,
        1.10220517, -0.22609337, -0.22609337, -0.22609337, -0.22609337,
        1.10220517, -0.22609337, -0.22609337,  2.4305037 , -0.22609337,
        1.10220517, -1.5543919 , -0.22609337,  1.10220517,  1.10220517,
       -0.22609337, -1.5543919 , -0.22609337,  1.10220517, -0.22609337,
       -0.22609337, -0.22609337, -0.22609337, -0.22609337, -0.22609337,
       -1.5543919 , -2.88269044,  1.10220517, -0.22609337,  1.10220517,
       -0.22609337, -0.22609337,  1.10220517,  1.10220517,  1.10220517,
       -1.5543919 , -0.22609337,  1.10220517, -0.22609337, -1.5543919 ,
        1.10220517, -0.22609337])

In [56]:
#2.2 Multivariable Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Build the design matrix: turn each feature into a column vector and place
# the two columns side by side; the target becomes a column vector as well.
s_array = s_array.reshape(-1, 1)
b_array = b_array.reshape(-1, 1)
features = np.hstack((s_array, b_array))
p_array = np.array(price).reshape(-1, 1)

# Hold out 30% of the rows for evaluation. No random_state is passed, so the
# split (and therefore every number printed below) varies from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    p_array,
    test_size=0.3,
)

# Fit ordinary least squares on the training portion.
model = LinearRegression().fit(x_train, y_train)

# Predict on both partitions so train and test error can be compared.
y_pred_train = model.predict(x_train)
y_pred = model.predict(x_test)

# NOTE: for a regressor, score() is the coefficient of determination (R^2)
# on the held-out data, not a classification accuracy; the printed label is
# kept unchanged.
accuracy = model.score(x_test, y_test)

# Mean squared error on each partition.
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred)

# Report the metrics.
print("Accuracy: ", accuracy)
print("MSE Test: ", mse_test)
print("MSE Train: ", mse_train)


Accuracy:  0.7084365660884471
MSE Test:  3989801968.927936
MSE Train:  4366692356.773712
