# Part 5 - Multiple Linear Regression

In the previous notebook we trained a Simple Linear Regressor with a single feature (`sqft`).  
In this notebook we aim to extend the capability of our model by using multiple features for our independent variable `X`.  
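Our equation has the same form; however, `X` is now a matrix with one column per feature and `m` is a vector holding one coefficient per feature. The equation therefore no longer describes a line but a hyperplane over the **N** features in `X` (an N-dimensional surface in the (N+1)-dimensional space formed by the features and the target `y`).
  
$$
y = Xm + b
$$ 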
  


In [None]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline

## Let's load the data and remind ourselves of the contents

In [None]:
# Load the listings CSV (cleaned and feature-engineered, judging by its file name);
# the path is relative to the notebook's working directory.
df = pd.read_csv('./data/rew_van_jan12_clean_engineered.csv')
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

## Prepare our training and validation data

In [None]:
# Show the column names: 'price' is used as the target below, every other column as a feature.
df.columns

In [None]:
# Every column except the target ('price') is used as a model input.
features = [column for column in df.columns if column != 'price']
X, y = df[features], df['price']

# NumPy copies of the data for scikit-learn; the target is turned into a
# single-column 2-D array of shape (n_samples, 1).
X_np = X.values
y_np = y.values.reshape(-1, 1)

In [None]:
# random_state is fixed so the same rows end up in each split on every run.
X_train, X_val, y_train, y_val = train_test_split(X_np, y_np, test_size=0.30, random_state=123) # split 70% train, 30% validation

In [None]:
# Ordinary least-squares regressor.
# The `normalize` keyword was deprecated in scikit-learn 1.0 and removed in 1.2,
# so passing it raises a TypeError on current releases. Plain least squares has
# no penalty term, so rescaling the columns should not change the predictions of
# a well-conditioned fit; add an explicit scaler step (e.g. in a Pipeline) if
# feature scaling is required.
regressor = LinearRegression()

In [None]:
# Fit on the training split only; the validation split stays unseen for evaluation.
# NOTE: scikit-learn's `fit` returns the estimator itself, so `model` and
# `regressor` refer to the same object.
model = regressor.fit(X_train, y_train)

In [None]:
def evaluate_model(model, X, y):
    """Print the mean squared error and the R^2 score of `model` on (X, y).

    Parameters
    ----------
    model : fitted regressor exposing `predict(X)` and `score(X, y)`.
    X : feature matrix to predict from.
    y : true target values corresponding to the rows of `X`.

    Returns
    -------
    None. The two metrics are printed.
    """
    y_pred = model.predict(X) # predict y values from input X
    mse = mean_squared_error(y_true=y, y_pred=y_pred)
    # For a regressor, `score` is the coefficient of determination (R^2), not a
    # classification accuracy: 1.0 is a perfect fit and the value can be negative.
    r2 = model.score(X, y)
    print("Mean Squared Error: {}".format(mse))
    print("R^2 score: {}".format(r2))
# Report the metrics on the held-out validation split (rows the model was not fitted on).
evaluate_model(model, X_val, y_val)

In [None]:
# Try the model on a brand-new listing that is described by hand below.
actual_price = '$5,688,000'
sqft = 3790
bed = 4
bath = 4
sub_area = "Quilchena"
area = "Vancouver West"
property_type = "House"
strata_type = "Freehold NonStrata"

# Numeric features are given directly; each categorical value switches on the
# dummy column named '<prefix>_<value>'.
new_data = {'sqft': sqft,
            'bed': bed,
            'bath': bath,
            'sub_area_{}'.format(sub_area): 1,
            'area_{}'.format(area): 1,
            'property_type_{}'.format(property_type): 1,
            'strata_type_{}'.format(strata_type): 1
           }

# A key that is not a training column (e.g. a category value that is absent from
# the dataset or spelled differently) is dropped by the alignment below, so
# report it instead of ignoring it silently.
unknown_columns = [column for column in new_data if column not in X.columns]
if unknown_columns:
    print("warning: ignoring columns unknown to the model: {}".format(unknown_columns))

# Align the single row with the training columns (same names, same order);
# every column that was not set explicitly is filled with 0.
new_df = pd.DataFrame(data=[new_data]).reindex(columns=X.columns, fill_value=0)
print(X.shape)

# The model was fitted on NumPy arrays, so predict on the underlying array too.
predicted_price = model.predict(new_df.values)
# The target was reshaped to a column before fitting, so the prediction comes
# back as a 2-D array; flatten it so a number is printed rather than "[...]".
print("predicted price: ${}M".format(np.ravel(predicted_price)[0]/1e6))
print("actual price: {}".format(actual_price))

## Retrain on entire dataset and save model to disk

In [None]:
import os
import pickle

# Refit on every row now that evaluation is done. `fit` returns the estimator
# itself, so this retrains `regressor` in place and `model` is that same object.
# Unlike the earlier fit, this one receives the DataFrame `X` and the 1-D Series
# `y` rather than the NumPy arrays, so the saved model's predictions are 1-D.
model = regressor.fit(X, y)

# Create the output directory if it is missing; otherwise `open` raises
# FileNotFoundError after the model has already been retrained.
model_path = './models/multiple_linear.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(model, f)