In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

COLOR = 'white'
plt.rcParams['text.color'] = COLOR
plt.rcParams['axes.labelcolor'] = COLOR
plt.rcParams['xtick.color'] = COLOR
plt.rcParams['ytick.color'] = COLOR

pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.float_format = '{:,.2f}'.format

![](fitting.png)

In [85]:
df = pd.read_csv('Melbourne_housing_FULL.csv')
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.50,3067.00,...,1.00,1.00,126.00,,,Yarra City Council,-37.80,145.00,Northern Metropolitan,4019.00
1,Abbotsford,85 Turner St,2,h,1480000.00,S,Biggin,3/12/2016,2.50,3067.00,...,1.00,1.00,202.00,,,Yarra City Council,-37.80,145.00,Northern Metropolitan,4019.00
2,Abbotsford,25 Bloomburg St,2,h,1035000.00,S,Biggin,4/02/2016,2.50,3067.00,...,1.00,0.00,156.00,79.00,1900.00,Yarra City Council,-37.81,144.99,Northern Metropolitan,4019.00
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.50,3067.00,...,2.00,1.00,0.00,,,Yarra City Council,-37.81,145.01,Northern Metropolitan,4019.00
4,Abbotsford,5 Charles St,3,h,1465000.00,SP,Biggin,4/03/2017,2.50,3067.00,...,2.00,0.00,134.00,150.00,1900.00,Yarra City Council,-37.81,144.99,Northern Metropolitan,4019.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.00,PI,Jas,24/02/2018,6.30,3013.00,...,1.00,3.00,593.00,,,Maribyrnong City Council,-37.81,144.88,Western Metropolitan,6543.00
34853,Yarraville,29A Murray St,2,h,888000.00,SP,Sweeney,24/02/2018,6.30,3013.00,...,2.00,1.00,98.00,104.00,2018.00,Maribyrnong City Council,-37.82,144.89,Western Metropolitan,6543.00
34854,Yarraville,147A Severn St,2,t,705000.00,S,Jas,24/02/2018,6.30,3013.00,...,1.00,2.00,220.00,120.00,2000.00,Maribyrnong City Council,-37.82,144.88,Western Metropolitan,6543.00
34855,Yarraville,12/37 Stephen St,3,h,1140000.00,SP,hockingstuart,24/02/2018,6.30,3013.00,...,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.00


In [86]:
df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [87]:
df.shape

(34857, 21)

In [88]:
# let's use limited columns which makes more sense for serving our purpose
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']
df = df[cols_to_use]


In [89]:
df

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,126.00,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,202.00,,1480000.00
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,0.00,156.00,79.00,1035000.00
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,1.00,0.00,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,0.00,134.00,150.00,1465000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,PI,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,4.00,1.00,3.00,593.00,,1480000.00
34853,Yarraville,2,h,SP,Sweeney,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,2.00,1.00,98.00,104.00,888000.00
34854,Yarraville,2,t,S,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,1.00,2.00,220.00,120.00,705000.00
34855,Yarraville,3,h,SP,hockingstuart,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,,,,,,1140000.00


### Checking for Nan values

In [90]:
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [91]:
# Some feature's missing values can be treated as zero (another class for NA values or absence of that feature)
# like 0 for Propertycount, Bedroom2 will refer to other class of NA values
# like 0 for Car feature will mean that there's no car parking feature with house
cols_to_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
df[cols_to_fill_zero] = df[cols_to_fill_zero].fillna(0)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,126.00,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,202.00,,1480000.00
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,0.00,156.00,79.00,1035000.00
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,1.00,0.00,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,0.00,134.00,150.00,1465000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,PI,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,4.00,1.00,3.00,593.00,,1480000.00
34853,Yarraville,2,h,SP,Sweeney,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,2.00,1.00,98.00,104.00,888000.00
34854,Yarraville,2,t,S,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,1.00,2.00,220.00,120.00,705000.00
34855,Yarraville,3,h,SP,hockingstuart,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,0.00,0.00,0.00,,,1140000.00


In [92]:
# other continuous features can be imputed with mean for faster results since our focus is on Reducing overfitting
# using Lasso and Ridge Regression
df['Landsize'] = df['Landsize'].fillna(df.Landsize.mean())
df['BuildingArea'] = df['BuildingArea'].fillna(df.BuildingArea.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Landsize'] = df['Landsize'].fillna(df.Landsize.mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BuildingArea'] = df['BuildingArea'].fillna(df.BuildingArea.mean())


In [93]:
df

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,126.00,160.26,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,1.00,202.00,160.26,1480000.00
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,2.00,1.00,0.00,156.00,79.00,1035000.00
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,1.00,0.00,160.26,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.00,2.50,Yarra City Council,3.00,2.00,0.00,134.00,150.00,1465000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,PI,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,4.00,1.00,3.00,593.00,160.26,1480000.00
34853,Yarraville,2,h,SP,Sweeney,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,2.00,1.00,98.00,104.00,888000.00
34854,Yarraville,2,t,S,Jas,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,2.00,1.00,2.00,220.00,120.00,705000.00
34855,Yarraville,3,h,SP,hockingstuart,Western Metropolitan,6543.00,6.30,Maribyrnong City Council,0.00,0.00,0.00,593.60,160.26,1140000.00


In [94]:
# Since the price column is our target variable, we can not just fill it with mean or median, we need and accurate model, so we drop all of them.
df.dropna(inplace=True)
df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(27244, 15)

In [95]:
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.00,2.50,2.00,1.00,1.00,202.00,160.26,1480000.00,0,...,0,0,0,0,0,0,0,0,1,0
2,2,4019.00,2.50,2.00,1.00,0.00,156.00,79.00,1035000.00,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.00,2.50,3.00,2.00,0.00,134.00,150.00,1465000.00,0,...,0,0,0,0,0,0,0,0,1,0
5,3,4019.00,2.50,3.00,2.00,1.00,94.00,160.26,850000.00,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.00,2.50,3.00,1.00,2.00,120.00,142.00,1600000.00,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,4,6543.00,6.30,4.00,1.00,3.00,593.00,160.26,1480000.00,0,...,0,0,0,0,0,0,0,0,0,0
34853,2,6543.00,6.30,2.00,2.00,1.00,98.00,104.00,888000.00,0,...,0,0,0,0,0,0,0,0,0,0
34854,2,6543.00,6.30,2.00,1.00,2.00,220.00,120.00,705000.00,0,...,0,0,0,0,0,0,0,0,0,0
34855,3,6543.00,6.30,0.00,0.00,0.00,593.60,160.26,1140000.00,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
X = df.drop('Price', axis='columns')
y = df.Price

1       1,480,000.00
2       1,035,000.00
4       1,465,000.00
5         850,000.00
6       1,600,000.00
            ...     
34852   1,480,000.00
34853     888,000.00
34854     705,000.00
34855   1,140,000.00
34856   1,020,000.00
Name: Price, Length: 27244, dtype: float64

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test