# House Prices Prediction with Linear Regression and Regularization

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#importing-dataset-using-panda" data-toc-modified-id="importing-dataset-using-panda-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>importing dataset using panda</a></span></li><li><span><a href="#EDA-&amp;-Data-Preprocessing:" data-toc-modified-id="EDA-&amp;-Data-Preprocessing:-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>EDA &amp; Data Preprocessing:</a></span><ul class="toc-item"><li><span><a href="#Checking-for-Missing-Values" data-toc-modified-id="Checking-for-Missing-Values-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Checking for Missing Values</a></span></li><li><span><a href="#Checking-for-Categorical-Data" data-toc-modified-id="Checking-for-Categorical-Data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Checking for Categorical Data</a></span></li><li><span><a href="#Dropping-Columns" data-toc-modified-id="Dropping-Columns-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Dropping Columns</a></span></li><li><span><a href="#Understanding-Data-Distribution" data-toc-modified-id="Understanding-Data-Distribution-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Understanding Data Distribution</a></span></li><li><span><a href="#Correlation-Between-Columns" data-toc-modified-id="Correlation-Between-Columns-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Correlation Between Columns</a></span></li><li><span><a href="#Outliers" data-toc-modified-id="Outliers-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Outliers</a></span></li><li><span><a href="#Normalizing-Data" data-toc-modified-id="Normalizing-Data-2.7"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Normalizing Data</a></span></li></ul></li><li><span><a href="#Multiple-Linear-Regression" data-toc-modified-id="Multiple-Linear-Regression-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Multiple Linear Regression</a></span><ul class="toc-item"><li><span><a href="#Separating-Independent-and-Dependent-Variable" 
data-toc-modified-id="Separating-Independent-and-Dependent-Variable-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Separating Independent and Dependent Variable</a></span></li><li><span><a href="#Splitting-Dataset-into-Training-and-Testing-Dataset" data-toc-modified-id="Splitting-Dataset-into-Training-and-Testing-Dataset-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Splitting Dataset into Training and Testing Dataset</a></span></li><li><span><a href="#Fit-Linear-Regression-Model" data-toc-modified-id="Fit-Linear-Regression-Model-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Fit Linear Regression Model</a></span></li><li><span><a href="#Predicting-the-Test-set-Results" data-toc-modified-id="Predicting-the-Test-set-Results-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Predicting the Test set Results</a></span></li><li><span><a href="#Model-Evaluation" data-toc-modified-id="Model-Evaluation-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Model Evaluation</a></span></li><li><span><a href="#Predictions-from-our-Model" data-toc-modified-id="Predictions-from-our-Model-3.6"><span class="toc-item-num">3.6&nbsp;&nbsp;</span>Predictions from our Model</a></span></li><li><span><a href="#Residual-Histogram" data-toc-modified-id="Residual-Histogram-3.7"><span class="toc-item-num">3.7&nbsp;&nbsp;</span>Residual Histogram</a></span></li><li><span><a href="#Regression-Evaluation-Metrics" data-toc-modified-id="Regression-Evaluation-Metrics-3.8"><span class="toc-item-num">3.8&nbsp;&nbsp;</span>Regression Evaluation Metrics</a></span></li></ul></li><li><span><a href="#Backward-Elimination" data-toc-modified-id="Backward-Elimination-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Backward Elimination</a></span></li><li><span><a href="#Ridge-and-LASSO-Regression" data-toc-modified-id="Ridge-and-LASSO-Regression-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Ridge and LASSO Regression</a></span><ul class="toc-item"><li><span><a 
href="#Ridge-Regression-(L2-Regularization)" data-toc-modified-id="Ridge-Regression-(L2-Regularization)-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Ridge Regression (L2-Regularization)</a></span></li><li><span><a href="#Lasso-Regression-(L1-Regularization)" data-toc-modified-id="Lasso-Regression-(L1-Regularization)-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Lasso Regression (L1-Regularization)</a></span></li></ul></li></ul></div>

In [None]:
# installing the required libraries this way:
#!pip install <library>

#uncomment if needed:
#!pip install statsmodels

In [None]:
#importing all the libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import seaborn as sns

%matplotlib inline

## importing dataset using panda

In [None]:
# Load the raw house-sales table; the path is relative to the notebook's working directory.
dataset = pd.read_csv('house_data.csv')
# Preview the first rows to see which columns the dataset contains.
dataset.head()

## EDA & Data Preprocessing:

### Checking for Missing Values

In [None]:
# Single boolean: True if any cell anywhere in the frame is null.
dataset.isnull().any().any()

In [None]:
# Per-column flag: True for every column that holds at least one null.
print(dataset.isnull().any())

### Checking for Categorical Data

In [None]:
# List each column's dtype; columns reported as 'object' are the non-numeric ones.
print(dataset.dtypes)

### Dropping Columns

In [None]:
# Discard the 'id' and 'date' columns before modelling.
# NOTE: not idempotent -- re-running this cell raises KeyError because the columns are already gone.
dataset = dataset.drop(columns=['id', 'date'])

### Understanding Data Distribution

Let's create some simple plots to check out the data!
Seaborn's pairplot will automatically create not just histograms of all the columns but also correlation scatter plots.

In [None]:
# Pairwise distributions of the main size/price features, coloured by bedroom count.
pair_cols = ['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']
with sns.plotting_context("notebook", font_scale=2.5):
    g = sns.pairplot(dataset[pair_cols], hue='bedrooms', palette='tab20', height=6)
# Hide the x tick labels on every panel; the trailing ';' suppresses the cell's repr output.
g.set(xticklabels=[]);

### Correlation Between Columns

In [None]:
# Pairwise correlation matrix of the columns (pandas default method).
dataset.corr()

In [None]:
# Heat map of the correlation matrix between each pair of columns.
sns.heatmap(dataset.corr())

In [None]:
# Same heat map, with the correlation value written inside each cell and thin cell borders.
sns.heatmap(dataset.corr(), linewidths=.1, annot=True)

In [None]:
# Restrict the heat map to five columns so the plot stays readable.
sns.heatmap(dataset[['sqft_lot','sqft_above','price','sqft_living','bedrooms']].corr())

In [None]:
# Five-column correlation heat map drawn on an explicitly sized axes, with annotated cells.
corr_subset = dataset[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']].corr()
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr_subset, ax=ax, annot=True, linewidths=.5)

### Outliers

We must be careful when choosing to drop outliers because of the risk of losing valuable information, but in the plot below we see 2 clear outliers toward the bottom right, representing "bad" deals for sellers (low price for a large area).

In [None]:
# Price against living area on a square canvas, to spot isolated points.
rcParams['figure.figsize'] = (6.0, 6.0)
sns.scatterplot(data=dataset, x='sqft_living', y='price')
plt.show()

### Normalizing Data

By plotting the distribution of our target feature, we quickly notice that the distribution appears to be right-skewed.

In [None]:
# Histogram + density estimate of the raw price column.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 (sns.histplot(..., kde=True) is the
# replacement) -- confirm the installed seaborn version before switching.
sns.distplot(dataset["price"])

Let's use the skew function from scipy.stats to determine the "skewness" of the Price Feature better.

In [None]:
from scipy.stats import skew

# Histogram of "price", with its sample skewness (pandas Series.skew) shown in the legend.
rcParams['figure.figsize'] = (12.0, 6.0)
price_skew_label = "Skewness: {:.2f}".format(dataset["price"].skew())
g = sns.distplot(dataset["price"], label=price_skew_label)
g = g.legend(loc="best")
plt.show()

Typically, our regression models will perform best with normally distributed data. Thus for best results, let's attempt to normalize the feature with a log transform. (For rightly skewed data, a log transform has the effect of shifting the distribution to appear more "normal", while for leftly skewed data, a log transform will only make the distribution even more leftly skewed.)

In [None]:
# Preview only: log1p(price) is kept in a separate variable, the dataset itself is untouched here.
normalizedSalePrice = np.log1p(dataset["price"])

# Histogram of the log-transformed "price", with its skewness in the legend.
rcParams['figure.figsize'] = (12.0, 6.0)
log_skew_label = "Skewness: {:.2f}".format(normalizedSalePrice.skew())
g = sns.distplot(normalizedSalePrice, label=log_skew_label)
g = g.legend(loc="best")
plt.show()

Now, we see our log transform did surprisingly well, and had the intended effect - the new distribution looks much more "normal". Let's go ahead and apply this log transformation of "price" to our dataset.

In [None]:
# Apply the log1p transform to the target column in place.
# WARNING: not idempotent -- running this cell twice log-transforms "price" twice.
# From here on, "price" (and every metric computed on it) is on the log1p scale.
dataset["price"] = np.log1p(dataset["price"])

As we'll see, several of the non-target numerical features are also heavily skewed, both to the right and to the left. For each of these, this time we'll choose to use a blanket "yeo-johnson" power transform to attempt to "normalize" each of them, since this transform "normalizes" both right- and left-skewed data. (Here we consider all features with a "skewness" magnitude above 0.75 as "heavily" skewed.)

In [None]:
# determine features that are heavily skewed
def get_skewed_features(df=None, threshold=0.75):
    """Return the names of the numeric columns that are heavily skewed.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``dataset`` is used,
        which keeps the original zero-argument call working.
    threshold : float, default 0.75
        A column is reported when the absolute value of its skewness
        (``scipy.stats.skew`` computed on the non-null values) is strictly
        greater than this number.

    Returns
    -------
    pandas.Index
        Labels of the selected columns, in their original column order.
    """
    frame = dataset if df is None else df
    # Keep genuinely numeric columns only; skew() is undefined for strings, dates, categories.
    numeric = frame.select_dtypes(include="number")
    if numeric.shape[1] == 0:
        # Nothing to measure: return an empty column index instead of letting apply() change shape.
        return numeric.columns
    skewness = numeric.apply(lambda col: skew(col.dropna()))
    return skewness[skewness.abs() > threshold].index

In [None]:
from sklearn.preprocessing import power_transform

# Columns flagged as heavily skewed by get_skewed_features().
skewed_feats = get_skewed_features()
print("%d heavily skewed features." % len(skewed_feats))

# Replace every flagged column with its Yeo-Johnson transformed values.
# NOTE(review): the transform is fitted on the full table, i.e. before the train/test split
# performed later in the notebook, so held-out rows influence the fitted parameters.
skewed_values = dataset[skewed_feats]
dataset[skewed_feats] = power_transform(skewed_values, method='yeo-johnson')
print("Applied power transform.")

**Q. Why are we "normalizing" the numerical features?**

In general, standardized or normally distributed data is nice to have, and provides various benefits in different situations. All the specific benefits and situations goes beyond the scope of this notebook, but typically normalizing your data is a good idea in the absence of any other information against the case. In our situation where we plan to use regularization methods, the more extreme observation values in the highly skewed features create a bias that can cause different explanatory variables to be treated not so equally by the regularization penalty term. By normalizing these skewed distributions, it is believed the regularization penalty will then treat different explanatory variables on a more equal footing. Ideally, we want all observations and variables to be treated perfectly equally by our models.

## Multiple Linear Regression

The typical Ordinary Least Squares Linear Regression model aims to minimize the residual sum of squares (RSS), which is defined as:

![Residual Sum of Squares](https://wikimedia.org/api/rest_v1/media/math/render/svg/2f6526aa487b4dc460792bf1eeee79b2bba77709)

To analyze how well this model performs for this data set, we will fit the model using the training data, and then estimate the model's average root mean square error (RMSE).

### Separating Independent and Dependent Variable

In [None]:
# Features: every column except the first one (selected by position).
# NOTE(review): assumes the target "price" is the first column after dropping id/date -- confirm.
X = dataset.iloc[:,1:]

# Alternative if a plain NumPy array is needed instead of a DataFrame:
#X = dataset.iloc[:,1:].values

In [None]:
# Target variable to predict: the first column (selected by position).
# NOTE(review): presumably "price"; verify the column order of the CSV.
y = dataset.iloc[:,0]

### Splitting Dataset into Training and Testing Dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out one third of the rows for testing; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0) # test_size = 0.33 would be a close alternative

### Fit Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares fit on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

### Predicting the Test set Results

In [None]:
# Predictions for the held-out rows.
# NOTE(review): the later cells of this section use `y_predict` (computed the same way), not `y_pred`.
y_pred = regressor.predict(X_test)

### Model Evaluation

In [None]:
# Print the fitted intercept term of the linear model.
print(regressor.intercept_)

The next thing you can check out are the coefficients that are related to each feature in our dataset.

In [None]:
# Fitted coefficients, one per feature column, in the column order of X_train.
regressor.coef_

In [None]:
# Feature names, to pair with the coefficient array shown above.
X.columns

In [None]:
# One-column table of the fitted coefficients, indexed by feature name.
coeff_df = pd.DataFrame(
    data=regressor.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

# <span style="color:red">Discussion Question 1) Choose 5 Coefficients from the above table and Interpret them.</span>

-

In [None]:
# Coefficients labelled by feature name and ranked by signed value, so the most negative
# and most positive ones sit at the two ends.
coef_1 = pd.Series(regressor.coef_, index = X_train.columns)
coef_sorted = coef_1.sort_values()
# Keep the 10 lowest and 10 highest coefficients. With 20 or fewer features the two slices
# would overlap and repeat entries, so in that case keep every coefficient exactly once.
if len(coef_sorted) <= 20:
    imp_coef = coef_sorted
else:
    imp_coef = pd.concat([coef_sorted.head(10), coef_sorted.tail(10)])

In [None]:
rcParams['figure.figsize'] = (8.0, 10.0) # define size of figure
# Plot the ranked selection (imp_coef) rather than every raw coefficient, so the chart matches its title.
imp_coef.plot(kind = "barh")
plt.title("Most Important Coefficients Selected by Linear Regression")
plt.xlabel("Coefficient value")
plt.show()

Apart from the less relevant "lat" (latitude) coefficient, we don't see any really high coefficient values chosen here because we did a fairly good job preprocessing our data. Had we not normalized the skewed numerical features, for example, there would have been higher variance and a high chance of the model picking some noticeably high coefficient values in comparison to these. (Note that the outliers identified earlier were only inspected, not removed.)

### Predictions from our Model

In [None]:
# For prediction we pass only the features of the unseen (held-out) rows.
y_predict = regressor.predict(X_test)

In [None]:
# Predicted target values for the held-out houses.
y_predict

In [None]:
# y_test holds the observed target values for the same held-out houses.
y_test

- Now, we want to know how far off are the predictions from the tests prices that are the actual prices.
- By checking the scatterplot we can visually compare y_test versus the predictions you just made.

In [None]:
# Observed values on x, model output on y; points on the diagonal are perfect predictions.
plt.scatter(y_test,y_predict)
plt.xlabel("Actual value (y_test)")
plt.ylabel("Predicted value (y_predict)")
plt.title("Predicted vs. actual values on the test set")
plt.show()

### Residual Histogram

- Residuals are the difference between the actual values y_test and the predicted values.

- If the residuals are normally distributed, it is a good sign: it means your model was a correct choice for the data.

In [None]:
# Distribution of the residuals (observed minus predicted) in 50 bins.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; sns.histplot(..., kde=True) is the
# replacement -- confirm the installed seaborn version before switching.
sns.distplot((y_test-y_predict),bins=50);

### Regression Evaluation Metrics


Here are three common evaluation metrics for regression problems:

**Mean Absolute Error** (MAE) is the mean of the absolute value of the errors:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE) is the mean of the squared errors:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE) is the square root of the mean of the squared errors:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

Comparing these metrics:

- **MAE** is the easiest to understand, because it's the average error.
- **MSE** is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
- **RMSE** is even more popular than MSE, because RMSE is interpretable in the "y" units.

All of these are **loss functions**, because we want to minimize them.

In [None]:
#we can calculate all these metrics
from sklearn import metrics

In [None]:
# Mean squared error between observed and predicted test-set values.
metrics.mean_squared_error(y_test, y_predict)

In [None]:
# Root mean squared error: the square root of the MSE, back in the units of y.
np.sqrt((metrics.mean_squared_error(y_test, y_predict)))

In [None]:
# Compute each error measure once, then report RMSE, MSE and MAE in that order.
mse_value = metrics.mean_squared_error(y_test, y_predict)
mae_value = metrics.mean_absolute_error(y_test, y_predict)
print('RMSE:', np.sqrt(mse_value))
print('MSE:', mse_value)
print('MAE:', mae_value)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n observations and k predictors of the hold-out set feed the adjusted-R2 correction.
n, k = X_test.shape

MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
# RMSE is reported rounded to three decimals.
RMSE = float('%.3f' % np.sqrt(mean_squared_error(y_test, y_predict)))
r2 = r2_score(y_test, y_predict)
# Adjusted R2 penalises R2 for the number of predictors used.
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

# <span style="color:red">Discussion Question 2) Interpret the above 3 Regression Evaluation Metrics.</span>

- 

## Backward Elimination

In [None]:
# Start again from the raw CSV: this section works on untransformed data.
df = pd.read_csv('house_data.csv')
# Missing-value check; the result is not displayed because this is not the cell's last expression.
df.isnull().any().any()
df = df.drop(columns=['id', 'date'])

In [None]:
# Features as a NumPy array: every column except the first.
features = df.iloc[:,1:].values
# Target as a NumPy array: the first column.
# NOTE(review): presumably "price"; verify the CSV column order. This rebinds the notebook-level `y`.
y = df.iloc[:,0].values

In [None]:
# (rows, columns) of the reloaded frame after dropping id/date.
df.shape

In [None]:
import statsmodels.api as sm
def backwardElimination(x, SL, target=None):
    """Prune regressors by backward elimination on OLS p-values.

    The model ``sm.OLS(target, x)`` is refitted repeatedly. While the largest
    p-value exceeds ``SL`` the corresponding column is dropped -- unless
    dropping it does not raise the adjusted R-squared, in which case the
    column is kept and the search stops. The summary of the final model is
    printed before returning.

    Parameters
    ----------
    x : 2-D array-like, shape (n_samples, n_features)
        Candidate regressors. No constant column is added here; include one
        in ``x`` if the model should have an intercept.
    SL : float
        Significance level; a column is a removal candidate when its p-value
        is greater than this.
    target : 1-D array-like, optional
        Response values. Defaults to the notebook-level ``y`` so existing
        two-argument calls keep working.

    Returns
    -------
    numpy.ndarray
        The retained columns of ``x``, in their original order and dtype.
    """
    response = y if target is None else target
    x = np.asarray(x)
    while True:
        regressor_OLS = sm.OLS(response, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Stop when every remaining regressor is significant (or only one is left).
        # A NaN p-value also stops the search because the comparison is False.
        if x.shape[1] <= 1 or not pvalues[worst] > SL:
            break
        reduced = np.delete(x, worst, axis=1)
        reduced_fit = sm.OLS(response, reduced).fit()
        # Removing the column must improve adjusted R2; otherwise keep the current
        # matrix untouched (this is the rollback) and stop.
        if regressor_OLS.rsquared_adj >= reduced_fit.rsquared_adj:
            break
        x = reduced
    # regressor_OLS is always the fit of the matrix being returned.
    print(regressor_OLS.summary())
    return x
 
SL = 0.05  # significance level (alpha) used as the p-value cut-off
# Select the first 18 feature columns by explicit index list.
X_opt = features[:, list(range(18))]
X_Modeled = backwardElimination(X_opt, SL)

# <span style="color:red">Discussion Question 3) According to the above Summary Table which Variable/s are not significant after doing the Backward Elimination?</span>

-

## Ridge and LASSO Regression

Both L1 and L2 regularization aim to minimize the residual sum of squares (RSS) plus a regularization term. For ridge regression (L2), this regularization term is the **sum of the squared coefficients** times a non-negative scaling factor lambda (or alpha in our sklearn model). For lasso regression (L1), it is the **sum of the absolute values of the coefficients** times the same kind of scaling factor.

### Ridge Regression (L2-Regularization) 

In [None]:
# Reload the raw CSV so the regularized models start from untransformed data.
df = pd.read_csv('house_data.csv')
# Missing-value check; the result is not displayed because this is not the cell's last expression.
df.isnull().any().any()
df = df.drop(columns=['id', 'date'])

In [None]:
# Column names, non-null counts and dtypes of the reloaded frame.
df.info()

In [None]:
# Features: every column except the first (kept as a DataFrame, not converted to an array).
X = df.iloc[:,1:]
# Target: the first column.
# NOTE(review): presumably "price", here on its raw (untransformed) scale -- verify the CSV column order.
y = df.iloc[:,0]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows; note this differs from the 1/3 split used for the plain linear model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) # 30% test split, fixed seed

In [None]:
from sklearn.linear_model import Lasso, Ridge
# Ridge (L2-penalised) regression with penalty strength alpha = 50.
# NOTE(review): the features come straight from the reloaded CSV (no scaling step in this section);
# the L2 penalty depends on feature scale, so consider standardizing first.
regressor_ridge = Ridge(alpha = 50)
regressor_ridge.fit(X_train, y_train)
print('Linear Model Coefficient (m): ', regressor_ridge.coef_)
print('Linear Model Coefficient (b): ', regressor_ridge.intercept_)

# Predictions for the held-out rows; this rebinds the notebook-level `y_predict`.
y_predict = regressor_ridge.predict( X_test)
y_predict

In [None]:
# x carries the observed values (y_test) and y the model output (y_predict),
# so the axis labels below follow that order.
plt.plot(y_test, y_predict, "o", color = 'r')
plt.xlim(0, 3000000)
plt.ylim(0, 3000000)

plt.xlabel("True Value (ground Truth)")
plt.ylabel("Model Predictions")
plt.title('Ridge Regression Predictions')
plt.show()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n observations and k predictors of the hold-out set feed the adjusted-R2 correction.
n, k = X_test.shape

MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
# RMSE is reported rounded to three decimals.
RMSE = float('%.3f' % np.sqrt(mean_squared_error(y_test, y_predict)))
r2 = r2_score(y_test, y_predict)
# Adjusted R2 penalises R2 for the number of predictors used.
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

### Lasso Regression (L1-Regularization)

In [None]:
from sklearn.linear_model import Lasso
# Lasso (L1-penalised) regression with penalty strength alpha = 50, fitted on the same split as the ridge model.
regressor_lasso = Lasso(alpha = 50)
regressor_lasso.fit(X_train,y_train)
print('Linear Model Coefficient (m): ', regressor_lasso.coef_)
print('Linear Model Coefficient (b): ', regressor_lasso.intercept_)

# Predictions for the held-out rows; this rebinds `y_predict`, replacing the ridge predictions.
y_predict = regressor_lasso.predict( X_test)
y_predict

In [None]:
# x carries the observed values (y_test) and y the model output (y_predict),
# so the axis labels below follow that order.
plt.plot(y_test, y_predict, ".", color = 'r')
plt.xlim(0, 3000000)
plt.ylim(0, 3000000)

plt.xlabel("True Value (ground Truth)")
plt.ylabel("Model Predictions")
plt.title('Lasso Regression Predictions')
plt.show()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

# n and k are reused from the ridge metrics cell (same X_test).
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
# RMSE is reported rounded to three decimals.
RMSE = float('%.3f' % np.sqrt(mean_squared_error(y_test, y_predict)))
r2 = r2_score(y_test, y_predict)
# Adjusted R2 penalises R2 for the number of predictors used.
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)



# <span style="color:red">Discussion Question 4) Compare Adjusted R2 for the Multiple Linear, Backward Elimination, Ridge, and Lasso Regression Models. Which Model Performs better, Why?</span>

-