In [13]:
import pandas as pd
# Location of the Melbourne housing CSV, kept in a variable so it can be reused
melbourne_file_path = 'input/melb_data.csv'

# Load the CSV into a DataFrame named melbourne_data
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)

In [14]:
# Show the column labels of the loaded DataFrame
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [15]:
# Drop every row that has at least one missing value.
# NOTE: this rebinds melbourne_data, so the raw frame is no longer available
# after this cell runs.
melbourne_data = melbourne_data.dropna(axis="index")

In [16]:
# Prediction target: the Price column
y = melbourne_data['Price']

# Columns used as model inputs
melbourne_features = [
    'Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
    'YearBuilt', 'Lattitude', 'Longtitude',
]
X = melbourne_data[melbourne_features]

# Summary statistics for the selected features
X.describe()


Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,141.568645,1964.081988,-37.807904,144.990201
std,0.971079,0.711362,897.449881,90.834824,38.105673,0.07585,0.099165
min,1.0,1.0,0.0,0.0,1196.0,-38.16492,144.54237
25%,2.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198
50%,3.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958
75%,4.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527
max,8.0,8.0,37000.0,3112.0,2018.0,-37.45709,145.52635


In [17]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
1,2,1.0,156.0,79.0,1900.0,-37.8079,144.9934
2,3,2.0,134.0,150.0,1900.0,-37.8093,144.9944
4,4,1.0,120.0,142.0,2014.0,-37.8072,144.9941
6,3,2.0,245.0,210.0,1910.0,-37.8024,144.9993
7,2,1.0,256.0,107.0,1890.0,-37.806,144.9954


In [18]:
from sklearn.tree import DecisionTreeRegressor

# Define model. A fixed random_state is passed so the fitted tree is the same
# on every run (the original call omitted it despite the comment asking for it).
melbourne_model = DecisionTreeRegressor(random_state=0)

# Fit model on the full feature matrix and target
melbourne_model.fit(X, y)


DecisionTreeRegressor()

In [19]:
# print('Making predictions for the following 5 houses:')
# print(X.head())
# print("The predictions are")
# print(melbourne_model.predict(X.head()))


In [20]:
from sklearn.metrics import mean_absolute_error
# Predict prices for the same rows the model was fitted on ...
predicted_home_prices = melbourne_model.predict(X)
# ... and report the mean absolute error against the true prices (in-sample score)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)


434.71594577146544

# The Problem with "In-Sample" Scores

The measure we just computed can be called an "in-sample" score. We used a single "sample" of houses for both building the model and evaluating it. Here's why this is bad.

Imagine that, in the large real estate market, door color is unrelated to home price.

However, in the sample of data you used to build the model, all homes with green doors were very expensive. The model's job is to find patterns that predict home prices, so it will see this pattern, and it will always predict high prices for homes with green doors.

Since this pattern was derived from the training data, the model will appear accurate in the training data.

But if this pattern doesn't hold when the model sees new data, the model would be very inaccurate when used in practice.

Since a model's practical value comes from making predictions on new data, we measure performance on data that wasn't used to build the model. The most straightforward way to do this is to exclude some data from the model-building process, and then use that held-out data to test the model's accuracy on data it hasn't seen before. This data is called **validation data**.


In [21]:
from sklearn.model_selection import train_test_split
# Split data into training and validation data, for both features and target.
# The split is based on a random number generator. Supplying a numeric value to
# the random_state argument guarantees we get the same split every time we
# run this script.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define model. The tree is seeded as well: with an unseeded tree the
# validation MAE could differ between runs even though the split is fixed.
melbourne_model = DecisionTreeRegressor(random_state=0)
# Fit model on the training portion only
melbourne_model.fit(train_X, train_y)
# Get predicted prices on validation data and score them
val_predictions = melbourne_model.predict(val_X)
mean_absolute_error(val_y, val_predictions)


259873.9761136217

In [22]:
#utility function to help compare MAE scores from different values for max_leaf_nodes

def get_mae(max_leaf_nodes, train_X, train_y, val_X, val_y):
    """Return the validation mean absolute error for one tree size.

    A DecisionTreeRegressor limited to ``max_leaf_nodes`` leaves (with
    ``random_state=0``) is fitted on ``train_X``/``train_y``; its predictions
    for ``val_X`` are then scored against ``val_y`` with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))


In [23]:
# Compare MAE with differing values of max_leaf_nodes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Fit each candidate tree exactly once and keep its validation MAE.
# (Previously every candidate was fitted twice: once for printing and once
# more to build this dict.)
scores = {
    leaf_count: get_mae(leaf_count, train_X, train_y, val_X, val_y)
    for leaf_count in candidate_max_leaf_nodes
}

# dicts preserve insertion order, so rows print in candidate order
for max_leaf_nodes, my_mae in scores.items():
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" % (max_leaf_nodes, my_mae))

# Tree size with the lowest validation MAE
best_tree_size1 = min(scores, key=scores.get)


Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 25  		 Mean Absolute Error:  271044
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 100  		 Mean Absolute Error:  248734
Max leaf nodes: 250  		 Mean Absolute Error:  247206
Max leaf nodes: 500  		 Mean Absolute Error:  243495


In [24]:
from sklearn.ensemble import RandomForestRegressor

# Build the forest with random_state=1 and train it on the training split
# (fit() returns the estimator, so construction and fitting are chained)
rf_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict prices for the validation features
val_predict = rf_model.predict(val_X)

# Score the predictions with mean absolute error and report it
rf_error = mean_absolute_error(y_true=val_y, y_pred=val_predict)
print(f'Validation of MAE for randomforest {rf_error:.0f}')


Validation of MAE for randomforest 191670
