In [1]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
%matplotlib inline

# Melbourne Housing Market Analysis

Following dataset consists of real estate on the market, its characteristics, and its asking price.

### GOAL:
Using the real estate value as the outcome variable, determine which of these characteristics "creates value in a house."

In [100]:
# importing data

housingdata = pd.read_csv('C:/Users/jafon/Documents/PythonMaterials/Data/Melbourne Housing Prices/Melbourne_housing_FULL.csv')
housingdata.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [101]:
#looking at the 21 columns
list(housingdata.columns)

['Suburb',
 'Address',
 'Rooms',
 'Type',
 'Price',
 'Method',
 'SellerG',
 'Date',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'CouncilArea',
 'Lattitude',
 'Longtitude',
 'Regionname',
 'Propertycount']

## Explanation of Some Columns

 - __Price__: Price of house (in Australian Dollars)
 
 - __Method__: S - property sold; SP - property sold prior; PI - property passed in; PN - sold prior not disclosed; SN - sold not disclosed; NB - no bid; VB - vendor bid; W - withdrawn prior to auction; SA - sold after auction; SS - sold after auction price not disclosed. N/A - price or highest bid not available.
 
 - __Type__: br - bedroom(s); h - house,cottage,villa, semi,terrace; u - unit, duplex; t - townhouse; dev site - development site; o res - other residential.

 - __SellerG__: Real Estate Agent name

 - __Date__: Date property was sold

 - __Distance__: Distance from Melbourne's Central Business District (in kilometers)

 - __Regionname__: General Region (West, Northwest, North, Northeast ... etc.)

 - __Propertycount__: Number of properties that exist in the suburb (as identified under "Suburb").

 - __Bedroom2__: Scraped # of Bedrooms (from different source)

 - __Bathroom__: Number of Bathrooms

 - __Car__: Number of carspots available

 - __Landsize__: size of lot (in square meters)

 - __BuildingArea__: Building Size (in square meters)

 - __YearBuilt__: Year the house was built

 - __CouncilArea__: Governing council for the area

In [102]:
# checking datatypes
for column in housingdata.columns:
    print("The object type for the column {} is {}.".format(column, type(housingdata[column][1])))

The object type for the column Suburb is <class 'str'>.
The object type for the column Address is <class 'str'>.
The object type for the column Rooms is <class 'numpy.int64'>.
The object type for the column Type is <class 'str'>.
The object type for the column Price is <class 'numpy.float64'>.
The object type for the column Method is <class 'str'>.
The object type for the column SellerG is <class 'str'>.
The object type for the column Date is <class 'str'>.
The object type for the column Distance is <class 'numpy.float64'>.
The object type for the column Postcode is <class 'numpy.float64'>.
The object type for the column Bedroom2 is <class 'numpy.float64'>.
The object type for the column Bathroom is <class 'numpy.float64'>.
The object type for the column Car is <class 'numpy.float64'>.
The object type for the column Landsize is <class 'numpy.float64'>.
The object type for the column BuildingArea is <class 'numpy.float64'>.
The object type for the column YearBuilt is <class 'numpy.float

In [88]:
housingdata.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [103]:
# Now for the tough thing - what to do about the NaN's?
for column in housingdata.columns:
    print("The number of NaN's in the column {} is {}.".format(column, housingdata[column].isna().sum()))

The number of NaN's in the column Suburb is 0.
The number of NaN's in the column Address is 0.
The number of NaN's in the column Rooms is 0.
The number of NaN's in the column Type is 0.
The number of NaN's in the column Price is 7610.
The number of NaN's in the column Method is 0.
The number of NaN's in the column SellerG is 0.
The number of NaN's in the column Date is 0.
The number of NaN's in the column Distance is 1.
The number of NaN's in the column Postcode is 1.
The number of NaN's in the column Bedroom2 is 8217.
The number of NaN's in the column Bathroom is 8226.
The number of NaN's in the column Car is 8728.
The number of NaN's in the column Landsize is 11810.
The number of NaN's in the column BuildingArea is 21115.
The number of NaN's in the column YearBuilt is 19306.
The number of NaN's in the column CouncilArea is 3.
The number of NaN's in the column Lattitude is 7976.
The number of NaN's in the column Longtitude is 7976.
The number of NaN's in the column Regionname is 3.
Th

### Self-Note on how to deal with the NaN's
1. Impute price (by grouping by suburb).
2. Drop all rows that have threshold of 3 or more NaN's in it.
3. Drop all rows where landsize == 0.
4. Check NaN's again.

In [104]:
# 1. Imputing Price

# using different dataframe so as to not screw with original dataset
df = housingdata

df['Price'] = df.groupby("Suburb")['Price'].transform(lambda x: x.fillna(x.mean()))
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.0,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,1033549.0,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [105]:
# fixing numbers for aesthetics
df['Price'] = df['Price'].apply(lambda x: '%.2f' % x).values.tolist()
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,1033549.02,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [106]:
df.Price.isna().sum() # says it's 6 - it should be zero!
dfcheck = df[df.Price.isnull() == True]
dfcheck.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount


In [107]:
# I'm going to have to drop those if there is nothing to compare it to
dfdrop = df.dropna(subset=['Price'])
dfdrop.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,1033549.02,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [108]:
#2 Threshhold
df = dfdrop # reset df
dfdrop2 = df
dfdrop2.dropna(thresh=19, axis=1, inplace=True)
dfdrop2.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,1033549.02,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [122]:
df = dfdrop2

In [123]:
#3 no landsize == 0
dfland = df
dfland = dfland[dfland['Landsize'] != 0]
dfland.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0


In [124]:
df = dfland

In [125]:
#4 check NaN's
for column in df.columns:
    print("The number of NaN's in the column {} is {}.".format(column, housingdata[column].isna().sum()))

The number of NaN's in the column Suburb is 0.
The number of NaN's in the column Address is 0.
The number of NaN's in the column Rooms is 0.
The number of NaN's in the column Type is 0.
The number of NaN's in the column Price is 0.
The number of NaN's in the column Method is 0.
The number of NaN's in the column SellerG is 0.
The number of NaN's in the column Date is 0.
The number of NaN's in the column Distance is 1.
The number of NaN's in the column Postcode is 1.
The number of NaN's in the column Bedroom2 is 8217.
The number of NaN's in the column Bathroom is 8226.
The number of NaN's in the column Car is 8728.
The number of NaN's in the column Landsize is 11810.
The number of NaN's in the column BuildingArea is 21115.
The number of NaN's in the column YearBuilt is 19306.
The number of NaN's in the column CouncilArea is 3.
The number of NaN's in the column Lattitude is 7976.
The number of NaN's in the column Longtitude is 7976.
The number of NaN's in the column Regionname is 3.
The n

In [126]:
# Finally going to drop the miscellaneous ones:
dfdrop = df
dfdrop.dropna(subset=['Distance', 'Postcode', 'CouncilArea', 'Regionname', 'Propertycount'], inplace=True) 
dfdrop.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0


In [127]:
df = dfdrop

In [128]:
# Now I am going to check linearity between our X features
# ignoring with the ones with NaN's for now - 
# may incorporate them later depending on how well model does
X = df
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019.0


In [129]:
X.drop(columns=['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'], 
       inplace=True)
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,CouncilArea,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,1033549.02,SS,Jellis,3/09/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0


In [130]:
Y = X['Price']

In [131]:
X.drop(['Price'], axis=1, inplace=True)
X.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,CouncilArea,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,SS,Jellis,3/09/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0
5,Abbotsford,40 Federation La,3,h,PI,Biggin,4/03/2017,2.5,3067.0,Yarra City Council,Northern Metropolitan,4019.0


In [133]:
#going to have to get dummies for most things
Xdummy = pd.get_dummies(X, columns=['Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname'])
Xdummy.head()

Unnamed: 0,Address,Rooms,Date,Distance,Postcode,Propertycount,Suburb_Abbotsford,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,68 Studley St,2,3/09/2016,2.5,3067.0,4019.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
1,85 Turner St,2,3/12/2016,2.5,3067.0,4019.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,25 Bloomburg St,2,4/02/2016,2.5,3067.0,4019.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,5 Charles St,3,4/03/2017,2.5,3067.0,4019.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
5,40 Federation La,3,4/03/2017,2.5,3067.0,4019.0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [139]:
# checking for linearity via correlation table
Xheat = Xdummy
Xheat.drop(['Address', 'Date', 'Postcode', 'Propertycount'], axis=1, inplace=True)

In [137]:
Xheat.corr()

Unnamed: 0,Rooms,Distance,Postcode,Propertycount,Suburb_Abbotsford,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
Rooms,1.000000,0.230219,0.088733,-0.061755,-0.039757,0.008170,-0.008637,-0.000846,-0.014142,-0.016780,...,-0.118680,0.023910,0.125283,0.038601,-0.140274,0.033996,0.045898,-0.005433,0.013831,0.011802
Distance,0.230219,1.000000,0.493356,-0.003800,-0.079595,-0.022599,0.002680,0.007448,-0.076263,0.003220,...,-0.236492,0.135909,0.179410,0.276553,-0.182803,0.256047,0.440284,-0.252446,-0.033856,0.169024
Postcode,0.088733,0.493356,1.000000,0.015027,-0.027385,-0.032790,-0.046722,-0.017786,0.049835,-0.039932,...,-0.035516,0.057929,0.001181,0.296873,-0.169558,0.237475,0.226415,0.244965,-0.352039,0.118205
Propertycount,-0.061755,-0.003800,0.015027,1.000000,-0.048642,-0.064333,-0.063894,-0.026292,-0.059376,-0.055449,...,0.061730,-0.014915,-0.110397,0.028052,0.279187,-0.054378,-0.030218,-0.024276,-0.160027,-0.045719
Suburb_Abbotsford,-0.039757,-0.079595,-0.027385,-0.048642,1.000000,-0.002952,-0.004319,-0.001283,-0.003840,-0.002850,...,0.343796,-0.003467,-0.024206,-0.005183,0.100757,-0.004899,-0.014627,-0.042707,-0.030862,-0.003682
Suburb_Aberfeldie,0.008170,-0.022599,-0.032790,-0.064333,-0.002952,1.000000,-0.003348,-0.000994,-0.002976,-0.002209,...,-0.008587,-0.002687,-0.018761,-0.004017,-0.029298,-0.003797,-0.011336,-0.033101,0.095651,-0.002854
Suburb_Airport West,-0.008637,0.002680,-0.046722,-0.063894,-0.004319,-0.003348,1.000000,-0.001455,-0.004354,-0.003232,...,-0.012563,-0.003932,-0.027449,-0.005877,-0.042867,-0.005556,-0.016586,-0.048430,0.139948,-0.004176
Suburb_Albanvale,-0.000846,0.007448,-0.017786,-0.026292,-0.001283,-0.000994,-0.001455,1.000000,-0.001293,-0.000960,...,-0.003731,-0.001168,-0.008153,-0.001746,-0.012732,-0.001650,-0.004926,-0.014384,0.041566,-0.001240
Suburb_Albert Park,-0.014142,-0.076263,0.049835,-0.059376,-0.003840,-0.002976,-0.004354,-0.001293,1.000000,-0.002873,...,-0.011169,-0.003495,-0.024403,-0.005225,-0.038109,-0.004939,-0.014745,0.089907,-0.031113,-0.003712
Suburb_Albion,-0.016780,0.003220,-0.039932,-0.055449,-0.002850,-0.002209,-0.003232,-0.000960,-0.002873,1.000000,...,-0.008291,-0.002595,-0.018115,-0.003878,-0.028289,-0.003666,-0.010946,-0.031960,0.092356,-0.002756


In [151]:
# judging by Xcorr, no stark co-linearities (highest co-linearity seen was <30%)
# moving on to actual modeling
from sklearn import linear_model
regr = linear_model.LinearRegression()

regr.fit(Xheat, Y) 
# Xdummy giving me an 'inf/NaN' error even though there aren't any NaN's
# so using Xheat instead

print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(Xheat, Y))

ValueError: array must not contain infs or NaNs

In [149]:
columnlist = list(Xdummy.columns)
for column in columnlist:
    if Xdummy[column].isna().sum() > 0:
        print('**********************{} has the following number of NaN\'s: {}'.format(column, Xdummy[column].isna().sum()))
    else:
        print('Column labeled {} does not have any NaN\'s.'.format(column))

Column labeled Rooms does not have any NaN's.
Column labeled Distance does not have any NaN's.
Column labeled Suburb_Abbotsford does not have any NaN's.
Column labeled Suburb_Aberfeldie does not have any NaN's.
Column labeled Suburb_Airport West does not have any NaN's.
Column labeled Suburb_Albanvale does not have any NaN's.
Column labeled Suburb_Albert Park does not have any NaN's.
Column labeled Suburb_Albion does not have any NaN's.
Column labeled Suburb_Alphington does not have any NaN's.
Column labeled Suburb_Altona does not have any NaN's.
Column labeled Suburb_Altona Meadows does not have any NaN's.
Column labeled Suburb_Altona North does not have any NaN's.
Column labeled Suburb_Ardeer does not have any NaN's.
Column labeled Suburb_Armadale does not have any NaN's.
Column labeled Suburb_Ascot Vale does not have any NaN's.
Column labeled Suburb_Ashburton does not have any NaN's.
Column labeled Suburb_Ashwood does not have any NaN's.
Column labeled Suburb_Aspendale does not have

Column labeled Suburb_Rosanna does not have any NaN's.
Column labeled Suburb_Rowville does not have any NaN's.
Column labeled Suburb_Roxburgh Park does not have any NaN's.
Column labeled Suburb_Sandhurst does not have any NaN's.
Column labeled Suburb_Sandringham does not have any NaN's.
Column labeled Suburb_Scoresby does not have any NaN's.
Column labeled Suburb_Seabrook does not have any NaN's.
Column labeled Suburb_Seaford does not have any NaN's.
Column labeled Suburb_Seaholme does not have any NaN's.
Column labeled Suburb_Seddon does not have any NaN's.
Column labeled Suburb_Silvan does not have any NaN's.
Column labeled Suburb_Skye does not have any NaN's.
Column labeled Suburb_South Kingsville does not have any NaN's.
Column labeled Suburb_South Melbourne does not have any NaN's.
Column labeled Suburb_South Morang does not have any NaN's.
Column labeled Suburb_South Yarra does not have any NaN's.
Column labeled Suburb_Southbank does not have any NaN's.
Column labeled Suburb_Spot

Column labeled SellerG_Keatings does not have any NaN's.
Column labeled SellerG_Kelly does not have any NaN's.
Column labeled SellerG_Ken does not have any NaN's.
Column labeled SellerG_Knight does not have any NaN's.
Column labeled SellerG_L does not have any NaN's.
Column labeled SellerG_LITTLE does not have any NaN's.
Column labeled SellerG_LJ does not have any NaN's.
Column labeled SellerG_LJH does not have any NaN's.
Column labeled SellerG_LLC does not have any NaN's.
Column labeled SellerG_Landfield does not have any NaN's.
Column labeled SellerG_Langwell does not have any NaN's.
Column labeled SellerG_Le does not have any NaN's.
Column labeled SellerG_Leaders does not have any NaN's.
Column labeled SellerG_Leading does not have any NaN's.
Column labeled SellerG_Leased does not have any NaN's.
Column labeled SellerG_Leeburn does not have any NaN's.
Column labeled SellerG_Leyton does not have any NaN's.
Column labeled SellerG_Lindellas does not have any NaN's.
Column labeled Selle

Column labeled CouncilArea_Port Phillip City Council does not have any NaN's.
Column labeled CouncilArea_Stonnington City Council does not have any NaN's.
Column labeled CouncilArea_Whitehorse City Council does not have any NaN's.
Column labeled CouncilArea_Whittlesea City Council does not have any NaN's.
Column labeled CouncilArea_Wyndham City Council does not have any NaN's.
Column labeled CouncilArea_Yarra City Council does not have any NaN's.
Column labeled CouncilArea_Yarra Ranges Shire Council does not have any NaN's.
Column labeled Regionname_Eastern Metropolitan does not have any NaN's.
Column labeled Regionname_Eastern Victoria does not have any NaN's.
Column labeled Regionname_Northern Metropolitan does not have any NaN's.
Column labeled Regionname_Northern Victoria does not have any NaN's.
Column labeled Regionname_South-Eastern Metropolitan does not have any NaN's.
Column labeled Regionname_Southern Metropolitan does not have any NaN's.
Column labeled Regionname_Western Met

In [159]:
(Y.isnull() == True).value_counts() # shows that there are no NaN's here either.

False    32417
Name: Price, dtype: int64

In [164]:
df = dfdrop2 # I have zero idea why df keeps changing even though I make sure it doesn't.

In [165]:
############################################################

# going to impute the average to the rest of the variables and work from there

df['Car'] = df.groupby("Suburb")['Car'].transform(lambda x: round(x.fillna(x.mean()))) 
# ^^^^rounding so that I don't have a fraction of a car
df['Bathroom'] = df.groupby("Suburb")['Bathroom'].transform(lambda x: round(x.fillna(x.mean())))
df['Landsize'] = df.groupby("Suburb")['Landsize'].transform(lambda x: x.fillna(x.mean())) 
df['BuildingArea'] = df.groupby("Suburb")['BuildingArea'].transform(lambda x: x.fillna(x.mean())) 
df['YearBuilt'] = df.groupby("Suburb")['YearBuilt'].transform(lambda x: round(x.fillna(x.mean()))) 
df['Bedroom2'] = df.groupby("Suburb")['Bedroom2'].transform(lambda x: round(x.fillna(x.mean())))

In [167]:
list(df.columns)

['Suburb',
 'Address',
 'Rooms',
 'Type',
 'Price',
 'Method',
 'SellerG',
 'Date',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'CouncilArea',
 'Lattitude',
 'Longtitude',
 'Regionname',
 'Propertycount']

In [169]:
df.head()
#re-doing dummies and X
X = df
X.drop(['Address', 'SellerG', 'Postcode', 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'], axis=1, inplace=True)

In [170]:
X.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,Date,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
0,Abbotsford,2,h,1033549.02,SS,3/09/2016,2.5,2.0,1.0,1.0,126.0,103.610169,1943.0,Yarra City Council
1,Abbotsford,2,h,1480000.0,S,3/12/2016,2.5,2.0,1.0,1.0,202.0,103.610169,1943.0,Yarra City Council
2,Abbotsford,2,h,1035000.0,S,4/02/2016,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
3,Abbotsford,3,u,1033549.02,VB,4/02/2016,2.5,3.0,2.0,1.0,0.0,103.610169,1943.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,SP,4/03/2017,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council


In [175]:
X.drop(['CouncilArea'], axis=1, inplace=True)

In [176]:
# trying this again
Xdummy = pd.get_dummies(X, columns=['Suburb', 'Type', 'Method'])

In [179]:
Xdummy = Xdummy[Xdummy.Landsize != 0]
Xdummy.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Suburb_Abbotsford,...,Type_u,Method_PI,Method_PN,Method_S,Method_SA,Method_SN,Method_SP,Method_SS,Method_VB,Method_W
0,2,1033549.02,2.5,2.0,1.0,1.0,126.0,103.610169,1943.0,1,...,0,0,0,0,0,0,0,1,0,0
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,103.610169,1943.0,1,...,0,0,0,1,0,0,0,0,0,0
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,1,...,0,0,0,1,0,0,0,0,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,1,...,0,0,0,0,0,0,1,0,0,0
5,3,850000.0,2.5,3.0,2.0,1.0,94.0,103.610169,1943.0,1,...,0,1,0,0,0,0,0,0,0,0


In [178]:
# judging by Xcorr, no stark co-linearities (highest co-linearity seen was <30%)
# moving on to actual modeling
from sklearn import linear_model
regr = linear_model.LinearRegression()

regr.fit(Xdummy, Y) #NOTE NOTE NOTE that as of now, price is still in X!!!!!!!

print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(Xdummy, Y))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').