# A real estate company wants to build homes at different locations in Boston. They have data for historical prices but haven’t decided the actual prices yet. They want to price the homes so that they are affordable to the general public.
Objective:

• Import the Boston data from sklearn and read the description using DESCR.

• Analyze the data and predict the approximate prices for the houses.

In [93]:
#For this project we will use Linear Regression to predict the prices for boston houses because the target is numerical.

#Importing required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import matplotlib as plt

%matplotlib inline

In [94]:
#Loading the Boston housing dataset that ships with scikit-learn
Boston = load_boston()
#Printing the dataset's built-in description (DESCR)
print(Boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [95]:
#Listing the keys available on the loaded dataset object
Boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [96]:
#Putting the feature matrix (Boston.data) into a dataframe
df = pd.DataFrame(Boston.data)
#Viewing the first rows of the dataset
df.head()
#Notice the columns are labelled 0-12 instead of carrying feature names. Let's fix that next.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [97]:
#Giving the dataframe columns their feature names
df.columns = Boston.feature_names
#Viewing the dataset again, now with named columns
df.head()
#Null values are checked in a later cell, after the target column has been appended

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [98]:
#Appending the target values (Boston.target) to the dataframe as the 'Price' column
df['Price'] = Boston.target
#Viewing the dataset with the new 'Price' column
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [99]:
#Checking for null values.
#An assertion is used because a notebook cell only displays its last expression:
#a bare df.isna().any() placed before df.shape would be computed and silently
#discarded, so the "no null values" claim would never actually be verified.
assert not df.isna().any().any(), "df contains null values"
#Getting the dimensions of the data as (rows, columns)
df.shape

(506, 14)

In [102]:
#The data looks fine, so we can move on to predicting Boston house prices.
#Separate the target column ('Price') from the predictors (every other column).
y_target = df['Price']
x_features = df.drop(columns=['Price'])

In [103]:
#Checking x_features: it should hold every column except 'Price'
x_features.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [104]:
#Checking y_target: it should hold only the 'Price' values
y_target.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: Price, dtype: float64

In [105]:
#Baseline: fit a linear regression on the full dataset (the train/test split is done in a later cell)
#Importing LinearRegression to create and fit the model
from sklearn.linear_model import LinearRegression
LinReg = LinearRegression()
LinReg.fit(x_features, y_target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [106]:
#Printing the intercept and the coefficients (one per feature column) of the model fitted on the full dataset
print("The Intercept is: ", LinReg.intercept_)
print("The Coefficient is: ", LinReg.coef_)

The Intercept is:  36.45948838509015
The Coefficient is:  [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]


In [109]:
#Splitting the data: 20% is held out for testing and 80% is used for training.
#random_state is fixed so the split is reproducible.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_features,y_target, test_size = 0.2, random_state = 42)

In [110]:
#Checking the shape of the training features to confirm the split
xtrain.shape

(404, 13)

In [111]:
#Checking the shape of the test features to confirm the split
xtest.shape
#The data is split as expected

(102, 13)

In [112]:
#Let's refit the same LinReg estimator, this time on the training split only
LinReg.fit(xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [114]:
#Printing the intercept and coefficients of the model fitted on the training split
print('The intercept of the train model is: ', LinReg.intercept_)
print('The coefficient of the train model is: ', LinReg.coef_)

The intercept of the train model is:  30.24675099392408
The coefficient of the train model is:  [-1.13055924e-01  3.01104641e-02  4.03807204e-02  2.78443820e+00
 -1.72026334e+01  4.43883520e+00 -6.29636221e-03 -1.44786537e+00
  2.62429736e-01 -1.06467863e-02 -9.15456240e-01  1.23513347e-02
 -5.08571424e-01]


In [123]:
#Importing sqrt plus the R2 and mean-squared-error metrics
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score

#Predicting on the held-out test split and reporting the root mean squared error
pred = LinReg.predict(xtest)
test_mse = mean_squared_error(ytest, pred)
print('The RMSE is: ', sqrt(test_mse))

The RMSE is:  4.928602182665355


In [124]:
#Importing the Ridge model from sklearn
from sklearn.linear_model import Ridge
#Initializing the ridge estimator with alpha set to 0.001 and normalize enabled
# NOTE(review): the `normalize` parameter was removed from scikit-learn's linear models in 1.2 — confirm the pinned version
ridge = Ridge(alpha = 0.001, normalize = True)

In [127]:
#Fitting the ridge model on the training split
ridge.fit(xtrain, ytrain)
#Root mean squared error on the training and test splits
ridge_train_rmse = sqrt(mean_squared_error(ytrain, ridge.predict(xtrain)))
ridge_test_rmse = sqrt(mean_squared_error(ytest, ridge.predict(xtest)))
print('Train Model: ', ridge_train_rmse)
print('Test Model: ', ridge_test_rmse)
#R2 score on the test split
print('R2 Value: ', ridge.score(xtest, ytest))

Train Model:  4.6520574267448405
Test Model:  4.929494222699342
R2 Value:  0.6686395785895635


In [128]:
#Importing the Lasso model from sklearn
from sklearn.linear_model import Lasso
#Initializing the lasso estimator with alpha set to 0.001 and normalize enabled
# NOTE(review): the `normalize` parameter was removed from scikit-learn's linear models in 1.2 — confirm the pinned version
lasso = Lasso(alpha = 0.001, normalize = True)

In [129]:
#Fitting the lasso model on the training split
lasso.fit(xtrain, ytrain)
#Root mean squared error: the first value printed is for the training split, the second for the test split
lasso_train_rmse = sqrt(mean_squared_error(ytrain, lasso.predict(xtrain)))
lasso_test_rmse = sqrt(mean_squared_error(ytest, lasso.predict(xtest)))
print(lasso_train_rmse)
print(lasso_test_rmse)
#R2 score on the test split
print('R2 Value: ', lasso.score(xtest, ytest))

4.655116805850516
4.939124750554196
R2 Value:  0.6673435864040371


In [130]:
#Importing the ElasticNet model from sklearn
from sklearn.linear_model import ElasticNet
#Initializing the elastic-net estimator with alpha set to 0.001 and normalize enabled
# NOTE(review): the `normalize` parameter was removed from scikit-learn's linear models in 1.2 — confirm the pinned version
elastic = ElasticNet(alpha = 0.001, normalize = True)

In [131]:
#Fitting the elastic-net model on the training split
elastic.fit(xtrain, ytrain)
#Root mean squared error: the first value printed is for the training split, the second for the test split
elastic_train_rmse = sqrt(mean_squared_error(ytrain, elastic.predict(xtrain)))
elastic_test_rmse = sqrt(mean_squared_error(ytest, elastic.predict(xtest)))
print(elastic_train_rmse)
print(elastic_test_rmse)
#R2 score on the test split
print('R2 Value: ', elastic.score(xtest, ytest))

4.830012769168519
5.0277905046737175
R2 Value:  0.6552928777229082
