In [10]:
# Start with decision tree regression

# We will predict a price based on a chosen set of features.

#

In [11]:
import pandas as pd
melbourne_file_path = "melb_data.csv"
melbourne_data = pd.read_csv(melbourne_file_path)
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [33]:
# dropna drops missing values (think of na as "not available")
melbourne_data = melbourne_data.dropna(axis=0)
melbourne_data.columns



Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [13]:
y = melbourne_data.Price

In [14]:
# Choose my features
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

In [58]:
# Select independent variables
X = melbourne_data[melbourne_features]

df = pd.DataFrame(X)

In [59]:
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [61]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_X = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_X, columns=df.columns)

df_scaled.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,6.421949e-17,1.3761320000000002e-17,-2.2935530000000003e-17,-5.792139e-14,3.862802e-14
std,1.000081,1.000081,1.000081,1.000081,1.000081
min,-1.98909,-0.8102572,-0.5248705,-4.70722,-4.5164
25%,-0.9592244,-0.8102572,-0.355488,-0.6267223,-0.6454799
50%,0.0706412,-0.8102572,-0.1092148,0.07454976,0.05646482
75%,1.100507,0.5956105,0.1749465,0.6553448,0.6303042
max,5.219969,9.030817,40.70639,4.625451,5.407088


In [62]:
X = df_scaled

X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
0,-0.959224,-0.810257,-0.351031,5.5e-05,0.032261
1,0.070641,0.595611,-0.375546,-0.018404,0.042346
2,1.100507,-0.810257,-0.391147,0.009284,0.03932
3,0.070641,0.595611,-0.251853,0.072572,0.091763
4,-0.959224,-0.810257,-0.239595,0.025106,0.052431


In [63]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [64]:
from sklearn.tree import DecisionTreeRegressor

# Define model
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit model
melbourne_model.fit(train_X, train_y)


In [65]:
print("We will make predictions for the following 5 houses")
print(val_X.head())
print("We predict the selling price for these houses will be:")
print(melbourne_model.predict(val_X.head()))

We will make predictions for the following 5 houses
         Rooms  Bathroom  Landsize  Lattitude  Longtitude
2669 -0.959224 -0.810257 -0.417892  -0.556348    0.051422
1281 -0.959224 -0.810257 -0.524870  -1.085063    0.005031
5154 -0.959224 -0.810257 -0.373318  -0.626492    0.055557
2006  0.070641  0.595611 -0.296427   0.113445    0.368093
2701 -0.959224 -0.810257 -0.079127   0.958598   -0.050437
We predict the selling price for these houses will be:
[ 900000.  526000. 1120000. 1590000.  630000.]


In [66]:
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(val_X)
mean_absolute_error(val_y, predicted_home_prices)

273495.1007101356

In [67]:
# By running the cell above, we see that the mean absolute error for housing prices, is above $250,000, so the mode did not do great.
# The cost of the average house was $1M

In [68]:
# Model overfit
# A good way to control overfitting vs underfitting for decision tree regressor is the max_leaf_nodes

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y,  preds_val)
    return mae

In [69]:
for max_leaf_nodes in [5,50,500,5000]:
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, mae))

Max leaf nodes: 5 		 Mean Absolute Error: 385696
Max leaf nodes: 50 		 Mean Absolute Error: 279794
Max leaf nodes: 500 		 Mean Absolute Error: 261718
Max leaf nodes: 5000 		 Mean Absolute Error: 272061


In [70]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

207251.77903921632


In [71]:
# Above we see some improvement of using random forrest regressor (averages predictions of many trees) over single decision tree regression.


In [72]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [73]:
# Delete columns with missing data --> can evaluate on

cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()] 

# Drop columns in training and validation data
reduced_X_train = train_X.drop(cols_with_missing, axis=1)
reduced_X_valid = val_X.drop(cols_with_missing, axis=1)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, train_y, val_y))

MAE from Approach 1 (Drop columns with missing values):
215435.9418979987


In [74]:
## Dealing more holistically with missing values (imputation instead of row deletion)
