In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [2]:
# Load the Melbourne housing CSV into a DataFrame and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer='Melbourne_housing_FULL.csv')
dataset.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [3]:
# Show the available column labels before choosing which ones to keep.
dataset.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# Columns kept for modelling; 'Price' is the value the regressions predict.
cols_to_use = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount',
    'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price',
]
# .copy() makes the selection an independent frame, so the cleaning steps that
# follow modify it directly and do not trigger SettingWithCopyWarning.
dataset = dataset[cols_to_use].copy()

In [5]:
# List the columns that contain at least one missing value.
# The per-column null mask is built once and then used to filter itself,
# instead of evaluating dataset.isnull().any() twice and comparing to True.
nan_column = dataset.isnull().any().loc[lambda has_nan: has_nan]
print(nan_column)

Regionname       True
Propertycount    True
Distance         True
CouncilArea      True
Bedroom2         True
Bathroom         True
Car              True
Landsize         True
BuildingArea     True
Price            True
dtype: bool


In [6]:
# These columns are treated as essential (key features plus the 'Price'
# target), so rows missing any of them are dropped rather than imputed.
# The result is rebound to `dataset` instead of using inplace=True, which
# avoids mutating a frame that was selected from the raw data.
dataset = dataset.dropna(subset=['Bedroom2', 'Bathroom', 'Landsize', 'BuildingArea', 'Price'])

In [7]:
# Per-column flag: True where the column still holds at least one missing value.
dataset.isna().any()

Suburb           False
Rooms            False
Type             False
Method           False
SellerG          False
Regionname       False
Propertycount    False
Distance         False
CouncilArea      False
Bedroom2         False
Bathroom         False
Car               True
Landsize         False
BuildingArea     False
Price            False
dtype: bool

In [8]:
# Impute missing 'Car' values with the column mean (Series.mean skips NaN).
# assign() returns a new frame with 'Car' replaced in its original position.
# The previous dataset['Car'].fillna(..., inplace=True) acted on a temporary
# Series: it warns on recent pandas and does not update `dataset` at all
# under Copy-on-Write.
# NOTE(review): the mean is computed before the train/test split, so test rows
# contribute to the imputed value — confirm this is acceptable.
dataset = dataset.assign(Car=dataset['Car'].fillna(dataset['Car'].mean()))

In [9]:
# Re-check every column for missing values after the imputation step.
dataset.isna().any()

Suburb           False
Rooms            False
Type             False
Method           False
SellerG          False
Regionname       False
Propertycount    False
Distance         False
CouncilArea      False
Bedroom2         False
Bathroom         False
Car              False
Landsize         False
BuildingArea     False
Price            False
dtype: bool

In [10]:
# One-hot encode the non-numeric columns. With drop_first=True the first level
# of each categorical is omitted, giving k-1 indicator columns for k levels.
dummies = pd.get_dummies(data=dataset, drop_first=True)
dummies.head(n=5)

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,0,...,0,0,0,0,0,0,0,0,1,0
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,0,...,0,0,0,0,0,0,0,0,1,0
11,3,4019.0,2.5,4.0,2.0,0.0,245.0,210.0,1876000.0,0,...,0,0,0,0,0,0,0,0,1,0
14,2,4019.0,2.5,2.0,1.0,2.0,256.0,107.0,1636000.0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Separate the encoded frame into the feature matrix X and the target y.
# 'Price' is read by indexing rather than dummies.pop('Price'): pop removes the
# column from `dummies` in place, so running this cell a second time would
# raise a KeyError on the drop below.
X = dummies.drop(columns='Price')
y = dummies['Price']

In [12]:
# Hold out 10% of the rows for evaluation; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
)

In [13]:
# Baseline model: unregularised linear regression on all encoded features.
linreg = LinearRegression()
linreg.fit(x_train, y_train)
print('test score:', linreg.score(x_test, y_test), 'train score:', linreg.score(x_train, y_train))
# The train score is clearly higher than the test score: overfitting occurred.

test score: 0.6408086479888482 train score: 0.7369315202043156


In [14]:
# Lasso (L1-penalised) regression; fit() returns the estimator, so it is chained.
lassoreg = Lasso(alpha=50, max_iter=10000, tol=1e-3).fit(x_train, y_train)
print('test score:', lassoreg.score(x_test, y_test), 'train score:', lassoreg.score(x_train, y_train))
# Lasso still shows an overfitting problem (train score above test score).

test score: 0.6564511763724217 train score: 0.7325009118748937


In [15]:
# Ridge (L2-penalised) regression with the same alpha as the Lasso run.
ridgereg = Ridge(
    alpha=50,
    max_iter=10000,
    tol=1e-3,
).fit(x_train, y_train)
print('test score:', ridgereg.score(x_test, y_test), 'train score:', ridgereg.score(x_train, y_train))
# Ridge gives an acceptable outcome: the train and test scores are close to each other.

test score: 0.6872015683997681 train score: 0.6907383913804152
