# Experiment 10: Implement Lasso and Ridge Regression by using Melbourne Dataset

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
dataset = pd.read_csv('melb_data.csv')
dataset.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [11]:
dataset.nunique() # Finding unique values in a particular column

Suburb             314
Address          13378
Rooms                9
Type                 3
Price             2204
Method               5
SellerG            268
Date                58
Distance           202
Postcode           198
Bedroom2            12
Bathroom             9
Car                 11
Landsize          1448
BuildingArea       602
YearBuilt          144
CouncilArea         33
Lattitude         6503
Longtitude        7063
Regionname           8
Propertycount      311
dtype: int64

In [12]:
dataset.shape

(13580, 21)

In [13]:
col_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']

In [14]:
dataset = dataset[col_to_use]

In [15]:
dataset.shape

(13580, 15)

In [16]:
dataset.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,2.0,1.0,1.0,202.0,,1480000.0
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,2.0,1.0,0.0,156.0,79.0,1035000.0
2,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,3.0,2.0,0.0,134.0,150.0,1465000.0
3,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,Yarra,3.0,2.0,1.0,94.0,,850000.0
4,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,Yarra,3.0,1.0,2.0,120.0,142.0,1600000.0


In [17]:
dataset.shape

(13580, 15)

In [18]:
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          0
Propertycount       0
Distance            0
CouncilArea      1369
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
Price               0
dtype: int64

In [19]:
cols_to_fill_zero = ['Car']
dataset[cols_to_fill_zero] = dataset[cols_to_fill_zero].fillna(0)
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          0
Propertycount       0
Distance            0
CouncilArea      1369
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea     6450
Price               0
dtype: int64

In [20]:
dataset['Landsize'] = dataset['Landsize'].fillna(dataset.Landsize.mean())
dataset['BuildingArea'] = dataset['BuildingArea'].fillna(dataset.Landsize.mean())

In [21]:
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Regionname          0
Propertycount       0
Distance            0
CouncilArea      1369
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
Price               0
dtype: int64

In [22]:
dataset.dropna(inplace=True)
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

In [23]:
dataset = pd.get_dummies(dataset, drop_first = True)
dataset.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moreland,CouncilArea_Nillumbik,CouncilArea_Port Phillip,CouncilArea_Stonnington,CouncilArea_Unavailable,CouncilArea_Whitehorse,CouncilArea_Whittlesea,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges
0,2,4019.0,2.5,2.0,1.0,1.0,202.0,558.416127,1480000.0,False,...,False,False,False,False,False,False,False,False,True,False
1,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,False,...,False,False,False,False,False,False,False,False,True,False
2,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,False,...,False,False,False,False,False,False,False,False,True,False
3,3,4019.0,2.5,3.0,2.0,1.0,94.0,558.416127,850000.0,False,...,False,False,False,False,False,False,False,False,True,False
4,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,False,...,False,False,False,False,False,False,False,False,True,False


In [24]:
# from sklearn.preprocessing import StandardScaler
# from pandas import DataFrame

In [25]:
#std = StandardScaler()
#data = std.fit_transform(x)

#dataset = DataFrame(data)
#dataset.head()

In [26]:
x = dataset.drop('Price', axis = 1)
y = dataset['Price']
#x

In [46]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)

In [48]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [50]:
model.fit(x_train, y_train)

In [52]:
model.score(x_test, y_test)

-258533644277.60342

In [54]:
model.score(x_train, y_train)

0.7101777405358054

In [56]:
from sklearn.linear_model import Lasso
lasso_model = Lasso()

In [58]:
lasso_model.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [60]:
lasso_model.score(x_test, y_test)

0.6525752784636165

In [62]:
lasso_model.score(x_train, y_train)

0.7101608461738054

In [64]:
from sklearn.linear_model import Ridge
ridge_model = Ridge()

In [70]:
ridge_model.fit(x_train, y_train)

In [72]:
ridge_model.score(x_test, y_test)

0.6622828936615058

In [74]:
ridge_model.score(x_train, y_train)

0.7081160032985205