In [44]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [45]:
# Load the housing dataset; the relative path means 'melb_housing.csv' must sit
# in the notebook's working directory.
df = pd.read_csv('melb_housing.csv')

# Show (number of rows, number of columns) as the cell output.
df.shape

(13580, 21)

In [46]:
# Preview the first five records to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [47]:
# List each column's dtype to tell numeric columns apart from object-typed ones.
df.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [48]:
# Count the non-missing entries in every column; a column whose count is below
# the total number of rows contains missing values.
df.notna().sum()

Suburb           13580
Address          13580
Rooms            13580
Type             13580
Price            13580
Method           13580
SellerG          13580
Date             13580
Distance         13580
Postcode         13580
Bedroom2         13580
Bathroom         13580
Car              13518
Landsize         13580
BuildingArea      7130
YearBuilt         8205
CouncilArea      12211
Lattitude        13580
Longtitude       13580
Regionname       13580
Propertycount    13580
dtype: int64

Four of the columns — Car, BuildingArea, YearBuilt, CouncilArea — contain missing values.

In [49]:
# Keep only fully populated records: a row is dropped if any of its columns is missing.
# NOTE(review): this also discards rows whose only gaps are in columns that the
# feature list below does not use (e.g. YearBuilt, CouncilArea) — confirm that is intended.
df = df.dropna(how='any')

In [50]:
# Re-check (rows, columns) to see how many records remain after the dropna step.
df.shape

(6196, 21)

## Price Prediction

Train a random forest regression model to predict house prices from a selection of property features.

In [51]:
# Prediction target: the Price column.
Y = df['Price']

# Model inputs: a fixed subset of the numeric columns.
# 'Lattitude' and 'Longtitude' are spelled exactly as the dataset names them.
X = df[[
    'Rooms', 'Bedroom2', 'Bathroom',
    'Landsize', 'BuildingArea',
    'Lattitude', 'Longtitude',
]]

In [52]:
# Split features and target into a training part and a held-out validation part.
# No test_size is given, so the library's default proportions apply; the fixed
# random_state keeps the split identical on every run.
train_X, val_X, train_Y, val_Y = train_test_split(
    X, Y,
    random_state=0,
)

In [53]:
# Random forest regressor with only random_state set; every other
# hyperparameter is left at the library default.
model = RandomForestRegressor(random_state=1)

In [54]:
# Train the model on the training split only; the validation split is not used here.
model.fit(train_X, train_Y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [55]:
# Predict prices for the validation rows with the fitted model.
predicted_prices = model.predict(val_X)

# Validate the model: mean absolute error between actual and predicted prices.
mean_absolute_error(y_true=val_Y, y_pred=predicted_prices)

201757.0562943835

In [56]:
# helper used to compare max_leaf_nodes settings: returns the validation
# mean absolute error obtained with one given setting
def get_mae(max_leaf_nodes, train_X, train_Y, val_X, val_Y):
    """Fit a random forest with the given leaf-node limit and score it.

    A fresh ``RandomForestRegressor`` is created on every call with
    ``random_state=0``, so only ``max_leaf_nodes`` varies between calls.

    Parameters
    ----------
    max_leaf_nodes
        Value passed to ``RandomForestRegressor`` as ``max_leaf_nodes``.
    train_X, train_Y
        Features and target the model is fitted on.
    val_X, val_Y
        Features and target the fitted model is evaluated on.

    Returns
    -------
    The mean absolute error between ``val_Y`` and the predictions for ``val_X``.
    """
    forest = RandomForestRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    forest.fit(train_X, train_Y)
    return mean_absolute_error(val_Y, forest.predict(val_X))

In [57]:
# Coarse sweep: compare the validation MAE across widely spaced max_leaf_nodes values.
# The %d conversion prints the MAE truncated to a whole number.
for max_leaf_node in (10, 500, 1000, 2000):
    my_mae = get_mae(max_leaf_node, train_X, train_Y, val_X, val_Y)
    print("Max Leaf Nodes: %d \t\t Mean Absolute Error: %d " % (max_leaf_node, my_mae))



Max Leaf Nodes: 10 		 Mean Absolute Error: 303112 
Max Leaf Nodes: 500 		 Mean Absolute Error: 198125 
Max Leaf Nodes: 1000 		 Mean Absolute Error: 196516 
Max Leaf Nodes: 2000 		 Mean Absolute Error: 197932 




Of the values tried so far, `max_leaf_nodes=1000` gives the lowest mean absolute error. Next, test further values around 1000 to see whether an even lower error can be found.

In [58]:
# Finer sweep: compare the validation MAE for max_leaf_nodes values close to 1000.
# The %d conversion prints the MAE truncated to a whole number.
for max_leaf_node in (800, 1000, 1200, 1500):
    my_mae = get_mae(max_leaf_node, train_X, train_Y, val_X, val_Y)
    print("Max Leaf Nodes: %d \t\t Mean Absolute Error: %d " % (max_leaf_node, my_mae))



Max Leaf Nodes: 800 		 Mean Absolute Error: 196680 
Max Leaf Nodes: 1000 		 Mean Absolute Error: 196516 




Max Leaf Nodes: 1200 		 Mean Absolute Error: 197126 
Max Leaf Nodes: 1500 		 Mean Absolute Error: 197407 


Conclusion: `max_leaf_nodes=1000` remains the best value found so far, although the mean absolute errors for 800–1500 differ by less than 1,000.