In [1]:
import pandas as pd 
import numpy as np 
from google.colab import drive
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
drive.mount('drive', force_remount=True)

Mounted at drive


In [3]:
df = pd.read_csv('drive/My Drive/MetroVan.csv')

In [4]:
# Converting Town as a category attribute to numbers
df.Town = pd.Categorical(df.Town)
df['Towncode'] = df.Town.cat.codes

#print(df.describe())
print(df.dtypes)

Unnamed: 0       int64
Address         object
Town          category
ZipCode         object
Area             int64
Price            int64
Beds             int64
Baths            int64
Broker          object
latitude       float64
longitude      float64
Towncode          int8
dtype: object


In [5]:
# Absolute value of correlation
cor = df.corr()
cor_target = abs(cor['Price']) 
# Highly correlated attributes
relevant_features = cor_target[cor_target>0.2] 
# Getting the names of the attributes
names = [index for index, value in relevant_features.iteritems()] 
# Erase the target attribute
names.remove('Price') 
# printing the correlated attributes
print(names) 
print(len(names))

['Area', 'Beds', 'Baths', 'longitude', 'Towncode']
5


In [6]:
from sklearn.model_selection import train_test_split

# Droping rows contain NaN and unrequired columns
df.dropna(inplace=True, axis=0)

x = df.drop(['Price', 'Address', 'Town', 'ZipCode', 'Broker'], axis=1)
y = df['Price']

# Dividing the data into training and test set
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

#Print Num of rows and Columns of test and training tet
print("X-Train data set :", x_train.shape)
print("X-Test data set :", x_test.shape)
print("Y-Train data set :", y_train.shape)
print("Y-Test data set :", y_test.shape)

X-Train data set : (2328, 7)
X-Test data set : (583, 7)
Y-Train data set : (2328,)
Y-Test data set : (583,)


In [18]:
from sklearn.linear_model import LinearRegression # Ordinary Least Squares(OLS) algorithm
from sklearn.linear_model import Ridge # Ridge algorithm
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # Elastic Net algorithm
from sklearn.svm import SVR # Support Vector Machine algorithm

# OLS Algorithm
ols = LinearRegression()
ols.fit(x_train, y_train)
ols_yhat = ols.predict(x_test)
print(f'Score of OLS regression : {ols.score(x, y):,.4f}')
print(f'Average value of the house by OLS regression: CA$ {y_test.mean():,.2f}')  # Present Price
print(f'Model predicted average value by OLS regression: CA$ {ols_yhat.mean():,.2f}') # Predicted Price
print('----------------------------------------------------')
# Ridge Algorithm
ridge = Ridge(alpha = 0.5).fit(x_train, y_train)
ridge_yhat = ridge.predict(x_test)

# Lasso Algorithm
lasso = Lasso(alpha = 0.01).fit(x_train, y_train)
lasso_yhat = lasso.predict(x_test)

# Bayesian Algorithm
bayesian = BayesianRidge().fit(x_train, y_train)
bayesian_yhat = bayesian.predict(x_test)

# Elastic Net Algorithm
en = ElasticNet(alpha = 0.01).fit(x_train, y_train)
en_yhat = en.predict(x_test)

# SVR Algorithm
svm = SVR(C=100000, gamma = 150000).fit(x_train, y_train)
svm_yhat = svm.predict(x_test) #Predictions of House Prices
print(f'Average value of the house by SVM: CA$ {y_test.mean():,.2f}')  # Present Price
print(f'Model predicted average value by: CA$ {svm_yhat.mean():,.2f}') # Predicted Price

Score of OLS regression : 0.5189
Average value of the house by OLS regression: CA$ 3,082,483.79
Model predicted average value by OLS regression: CA$ 3,157,120.92
----------------------------------------------------
Average value of the house by SVM: CA$ 3,082,483.79
Model predicted average value by: CA$ 2,747,125.32


In [21]:
 # evaluation metric : Explained Variance Score
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_squared_error

print('EXPLAINED VARIANCE SCORE values:')
print('Explained Variance Score of OLS model is: {:.4f}'.format(evs(y_test, ols_yhat)))
print('Mean squared error of OLS is: {:,.2f}'.format(mean_squared_error(y_test, ols_yhat,squared = False)))
print('------------------------------------------------------')
print('Explained Variance Score of Ridge model is: {:.4f}'.format(evs(y_test, ridge_yhat)))
print('Mean squared error of Ridge is: {:,.2f}'.format(mean_squared_error(y_test, ridge_yhat,squared = False)))
print('------------------------------------------------------')
print('Explained Variance Score of Lasso model is: {:.4f}'.format(evs(y_test, lasso_yhat)))
print('Mean squared error of Lasso is: {:,.2f}'.format(mean_squared_error(y_test, lasso_yhat,squared = False)))
print('------------------------------------------------------')
print('Explained Variance Score of Bayesian model is: {:.4f}'.format(evs(y_test, bayesian_yhat)))
print('Mean squared error of Bayesian model is: {:,.2f}'.format(mean_squared_error(y_test, bayesian_yhat,squared = False)))
print('------------------------------------------------------')
print('Explained Variance Score of Elastic Net is: {:.4f}'.format(evs(y_test, en_yhat)))
print('Mean squared error of Elastic Net is: {:,.2f}'.format(mean_squared_error(y_test, en_yhat,squared = False)))
print('------------------------------------------------------')
print('Explained Variance Score of SVM is: {:.4f}'.format(evs(y_test, svm_yhat)))
print('Mean squared error of SVM is: {:,.4f}'.format(mean_squared_error(y_test, svm_yhat,squared = False)))

EXPLAINED VARIANCE SCORE values:
Explained Variance Score of OLS model is: 0.6214
Mean squared error of OLS is: 2,097,939.20
------------------------------------------------------
Explained Variance Score of Ridge model is: 0.6214
Mean squared error of Ridge is: 2,097,955.65
------------------------------------------------------
Explained Variance Score of Lasso model is: 0.6214
Mean squared error of Lasso is: 2,097,939.20
------------------------------------------------------
Explained Variance Score of Bayesian model is: 0.6211
Mean squared error of Bayesian model is: 2,098,799.21
------------------------------------------------------
Explained Variance Score of Elastic Net is: 0.6213
Mean squared error of Elastic Net is: 2,098,213.15
------------------------------------------------------
Explained Variance Score of SVM is: 0.0000
Mean squared error of SVM is: 3,423,809.3367


In [15]:
# evaluation metric : R-squared

from sklearn.metrics import r2_score as r2 

print('R-SQUARED values:')
print('Explained Variance Score of OLS model is: {:.4f}'.format(r2(y_test, ols_yhat)))
print('Explained Variance Score of Ridge model is: {:.4f}'.format(r2(y_test, ridge_yhat)))
print('Explained Variance Score of Lasso model is: {:.4f}'.format(r2(y_test, lasso_yhat)))
print('Explained Variance Score of Bayesian model is: {:.4f}'.format(r2(y_test, bayesian_yhat)))
print('Explained Variance Score of Elastic Net is: {:.4f}'.format(r2(y_test, en_yhat)))
print('Explained Variance Score of SVM is: {:.4f}'.format(r2(y_test, svm_yhat)))

R-SQUARED values:
Explained Variance Score of OLS model is: 0.6209
Explained Variance Score of Ridge model is: 0.6209
Explained Variance Score of Lasso model is: 0.6209
Explained Variance Score of Bayesian model is: 0.6206
Explained Variance Score of Elastic Net is: 0.6208
Explained Variance Score of SVM is: -0.0090
