In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean

In [2]:
# Loading the data
data = pd.read_csv('./data/kc_house_data.csv')
 
# Drop the non-numerical variables and those with missing values
dropColumns = ['id', 'date', 'sqft_above', 'zipcode']
data = data.drop(dropColumns, axis = 1)

In [3]:
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,0,1987,0,47.6168,-122.045,1800,7503


In [4]:
 
# Determine the dependent and independent variables
y = data['price']
X = data.drop('price', axis = 1)
 
# Divide the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
 
# Develop a Linear Regression model
linearModel = LinearRegression()
linearModel.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
 
# Evaluate the Linear Regression model
print(linearModel.score(X_test, y_test))

0.6869491056935153


In [5]:
# Develop Ridge(L2) Regression Model:
# Estimate different values for lamda
alpha = []
cross_val_scores_ridge = []
 
# Loop to compute the different scores
for i in range(1, 9):
 ridgeModel = Ridge(alpha = i * 0.25)
 ridgeModel.fit(X_train, y_train)
 scores = cross_val_score(ridgeModel, X, y, cv = 10)
 avg_cross_val_score = mean(scores)
 cross_val_scores_ridge.append(avg_cross_val_score)
 alpha.append(i * 0.25)

In [6]:
# Loop to print the different scores
for i in range(0, len(alpha)):
 print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))

0.25 : 0.6908949708276657
0.5 : 0.690896736853784
0.75 : 0.6908983055353468
1.0 : 0.6908996788999442
1.25 : 0.6909008589519248
1.5 : 0.6909018476727099
1.75 : 0.6909026470210976
2.0 : 0.6909032589335603


In [7]:
# the best value of lamda for the data is 2
# Build the Ridge Regression model for the best lamda
ridgeModelChosen = Ridge(alpha = 2)
ridgeModelChosen.fit(X_train, y_train)
# Evaluate the Ridge Regression model
print(ridgeModelChosen.score(X_test, y_test))

0.6869849244964364


In [8]:
# Develop Lasso(L1) Regression Model:
# Estimate different values for lamda
lamda = []
cross_val_scores_lasso = []
# Loop to compute the different scores
for i in range(1, 9):
 lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
 lassoModel.fit(X_train, y_train)
 scores = cross_val_score(lassoModel, X, y, cv = 10)
 avg_cross_val_score = mean(scores)
 cross_val_scores_lasso.append(avg_cross_val_score)
 lamda.append(i * 0.25)

In [9]:
# Loop to print the different scores
for i in range(0, len(alpha)):
 print(str(alpha[i])+' : '+str(cross_val_scores_lasso[i]))

0.25 : 0.6908930606738561
0.5 : 0.6908931157130693
0.75 : 0.6908931704585158
1.0 : 0.6908932249597732
1.25 : 0.6908932791842862
1.5 : 0.6908933331350492
1.75 : 0.690893386827507
2.0 : 0.6908934402400364


In [10]:
# the best value of lamda for the data is 2
# Build the Lasso Regression model for the best lamda
lassoModelChosen = Lasso(alpha = 2, tol = 0.0925)
lassoModelChosen.fit(X_train, y_train)
# Evaluate the Lasso Regression model
print(lassoModelChosen.score(X_test, y_test))

0.6869499646998982
