# Predicting House Prices - Melbourne Data

In [1]:
import pandas as pd

melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path) 
melbourne_data.columns    # showing columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [2]:
# The Melbourne data has some missing values (some houses for which some variables weren't recorded). 
# For now, we're gonna just drop houses from our data.

filtered_melbourne_data = melbourne_data.dropna(axis=0) # axis=0 means rows

In [3]:
y = filtered_melbourne_data.Price # by convention, the prediction target is called y

In [4]:
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea','YearBuilt', 
                      'Lattitude', 'Longtitude'] # choosing the features
print(melbourne_features)

['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']


In [5]:
x = filtered_melbourne_data[melbourne_features] # by convention, the features are called x

In [6]:
x.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,141.568645,1964.081988,-37.807904,144.990201
std,0.971079,0.711362,897.449881,90.834824,38.105673,0.07585,0.099165
min,1.0,1.0,0.0,0.0,1196.0,-38.16492,144.54237
25%,2.0,1.0,152.0,91.0,1940.0,-37.855438,144.926198
50%,3.0,1.0,373.0,124.0,1970.0,-37.80225,144.9958
75%,4.0,2.0,628.0,170.0,2000.0,-37.7582,145.0527
max,8.0,8.0,37000.0,3112.0,2018.0,-37.45709,145.52635


In [7]:
x.head()

Unnamed: 0,Rooms,Bathroom,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude
1,2,1.0,156.0,79.0,1900.0,-37.8079,144.9934
2,3,2.0,134.0,150.0,1900.0,-37.8093,144.9944
4,4,1.0,120.0,142.0,2014.0,-37.8072,144.9941
6,3,2.0,245.0,210.0,1910.0,-37.8024,144.9993
7,2,1.0,256.0,107.0,1890.0,-37.806,144.9954


In [8]:
# We'll use the scikit-learn library to create our models.

# The steps to building and using a model are:

# Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
# Fit: Capture patterns from provided data. This is the heart of modeling.
# Predict: Just what it sounds like.
# Evaluate: Determine how accurate the model's predictions are.

from sklearn.tree import DecisionTreeRegressor

In [9]:
melbourne_model = DecisionTreeRegressor(random_state=1) # defining model

# Many machine learning models allow some randomness in model training. Specifying a number for random_state ensures 
# you get the same results in each run. This is considered a good practice. You use any number, and model quality won't 
# depend meaningfully on exactly what value you choose.

In [10]:
melbourne_model.fit(x, y) # fiting model

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1, splitter='best')

In [25]:
print("Making predictions for the following 5 houses:")
print(x.head(5))
print("The predictions are")
print(melbourne_model.predict(x.head(5)))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  BuildingArea  YearBuilt  Lattitude  Longtitude
1      2       1.0     156.0          79.0     1900.0   -37.8079    144.9934
2      3       2.0     134.0         150.0     1900.0   -37.8093    144.9944
4      4       1.0     120.0         142.0     2014.0   -37.8072    144.9941
6      3       2.0     245.0         210.0     1910.0   -37.8024    144.9993
7      2       1.0     256.0         107.0     1890.0   -37.8060    144.9954
The predictions are
[1035000. 1440000. 1600000. 1876000. 1265000.]


In [24]:
merged=list(zip(y,melbourne_model.predict(x))) # merging the real prices (y) and the predicted prices in a dataframe
df = pd.DataFrame(merged,columns=['y','predictions'])
df.head()

Unnamed: 0,y,predictions
0,1035000.0,1035000.0
1,1465000.0,1440000.0
2,1600000.0,1600000.0
3,1876000.0,1876000.0
4,1636000.0,1265000.0
5,1097000.0,1097000.0
6,1350000.0,1350000.0
7,750000.0,750000.0
8,1310000.0,1250000.0
9,1200000.0,1200000.0


# Model Validation

In [12]:
from sklearn.metrics import mean_absolute_error

# There are many metrics for summarizing model quality, but we'll start with one called Mean Absolute Error (MAE).
# error=actualâˆ’predicted

predicted_home_prices = melbourne_model.predict(x)
mean_absolute_error(y, predicted_home_prices)

434.71594577146544

In [13]:
from sklearn.model_selection import train_test_split

# Split data into training and validation data, for both features and target.
# The split is based on a random number generator. Supplying a numeric value to the random_state argument guarantees 
# we get the same split every time we run this script.

train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 0)

melbourne_model = DecisionTreeRegressor() # defining model

melbourne_model.fit(train_x, train_y) # fiting model

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [26]:
# Get predicted prices on validation data
val_predictions = melbourne_model.predict(val_x)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

259926.89928986444


In [15]:
# Your mean absolute error for the in-sample data was about 500 dollars. Out-of-sample it is more than 250,000 dollars.

# This is the difference between a model that is almost exactly right, and one that is unusable for most practical purposes. 
# As a point of reference, the average home value in the validation data is 1.1 million dollars. So the error in new data is 
# about a quarter of the average home value.

# There are many ways to improve this model, such as experimenting to find better features or different model types.

# Underfitting and Overfitting

In [16]:
# Overfitting: capturing spurious patterns that won't recur in the future, leading to less accurate predictions, or
# Underfitting: failing to capture relevant patterns, again leading to less accurate predictions.

In [17]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_x, train_y)
    preds_val = model.predict(val_x)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [18]:
# Compare MAE with differing values of max_leaf_nodes

for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_x, val_x, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  347380
Max leaf nodes: 50  		 Mean Absolute Error:  258171
Max leaf nodes: 500  		 Mean Absolute Error:  243495
Max leaf nodes: 5000  		 Mean Absolute Error:  254983


In [19]:
#  The max_leaf_nodes argument provides a very sensible way to control overfitting vs underfitting. The more leaves we 
# allow the model to make, the more we move from the underfitting area in the above graph to the overfitting area.