In [2]:
# Objective:
#     The objective of this project is to build a house price predictor using two models, Decision Tree Regressor and Random 
# Forest Regressor, and compare the mean absolute error of the predictions of both the models.

# Data set:
#     The data set used here is the House Price Prediction dataset taken from "https://www.kaggle.com/shree1992/housedata". It
# consists of house prices and other features, described as covering cities like Melbourne, Sydney, etc., and presents an interesting
# opportunity to analyze and predict where property prices are heading. (Note: the sample rows displayed further down are all from Washington State, USA.)

# Features:
#     The given dataset consists of the following columns:
#         1) date
#         2)price
#         3)bedrooms
#         4)bathrooms
#         5)sqft_living
#         6)sqft_lot
#         7)floors
#         8)waterfront
#         9)view
#         10)condition
#         11)sqft_above
#         12)sqft_basement
#         13)yr_built
#         14)yr_renovated
#         15)street
#         16)city
#         17)statezip
#         18)country

In [3]:
#Importing necessary libraries.

import pandas as pd                                     #to read .csv files into DataFrames 
from sklearn.tree import DecisionTreeRegressor          #importing the Decision Tree Regressor from sklearn
from sklearn.ensemble import RandomForestRegressor      #importing Random Forest Regressor from sklearn 
from sklearn.model_selection import train_test_split    #importing train_test_split for splitting the data into training and validation data
from sklearn.metrics import mean_absolute_error         #importing mean_absolute_error to find the absolute error

In [4]:
# Load the dataset from a CSV file into the pandas DataFrame housePrices.
# The path is relative (presumably to the notebook's working directory -- adjust if needed).
# Forward slashes keep it portable across operating systems and avoid back-slash
# escape sequences inside the string literal.

filePath = 'Documents/documents/data.csv'

housePrices = pd.read_csv(filePath)

In [5]:
# Show the column labels of the loaded DataFrame to confirm which fields are available.
housePrices.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [6]:
# Preview the first rows of the DataFrame to get a feel for the values in each column.
housePrices.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): the reported minimum price is 0.0 -- such rows are probably missing or
# invalid sale prices and may distort the error metrics; confirm and consider filtering them.
housePrices.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522,1970.786304,808.608261
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228,29.731848,979.414536
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0,1900.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0,1951.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0,1976.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0,1997.0,1999.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0,2014.0,2014.0


In [8]:
# Target (y): the price of each house, i.e. the value the models have to predict.
# Predictors (X): only the bedrooms, bathrooms, house area and lot area columns are used.

features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']
X = housePrices[features]
y = housePrices['price']

In [9]:
# Preview the first rows of the feature matrix to check that only the selected columns are present.
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot
0,3.0,1.5,1340,7912
1,5.0,2.5,3650,9050
2,3.0,2.0,1930,11947
3,3.0,2.25,2000,8030
4,4.0,2.5,1940,10500


In [10]:
# Define the baseline Decision Tree Regressor with default settings, i.e. without
# limiting the number of leaf nodes. random_state is fixed so results are repeatable.
# The model is not fitted in this cell: it is trained in a later cell on the training
# split only, so fitting it on every row here would be redundant work that also touches
# the rows reserved for validation.

dt_model = DecisionTreeRegressor(random_state=1)
dt_model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [11]:
# Divide the features and target into a training part and a validation part.
# random_state is fixed so that the same split is produced on every run.

train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

In [12]:
# Fit the decision tree on the training split only, so that the validation rows
# (val_X, val_y) are not used during training and can serve for evaluation.

dt_model.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [13]:
# Predict prices for the validation rows (stored in dt_predictions) and store the
# mean absolute error of those predictions in dt_error_default.
# mean_absolute_error takes its arguments as (y_true, y_pred), so the true prices go first.

dt_predictions = dt_model.predict(val_X)
dt_error_default = mean_absolute_error(val_y, dt_predictions)

In [14]:
# Find the max_leaf_nodes value that minimizes the mean absolute error.
# Every candidate tree is trained on the training split and scored on the validation split;
# one "<max_leaf_nodes>: <error>" line is printed per candidate.

leaf_nodes = [5, 50, 500, 5000]

for max_leaves in leaf_nodes:
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaves, random_state=1)
    model.fit(train_X, train_y)
    val_predictions = model.predict(val_X)
    # Arguments follow the (y_true, y_pred) order of mean_absolute_error.
    mae = mean_absolute_error(val_y, val_predictions)
    print(f"{max_leaves}: {mae}")

5: 186807.31460051288
50: 182662.2768300128
500: 213646.163282489
5000: 239107.80076412915


In [15]:
# Of the candidates tried, a limit of 50 leaf nodes gave the lowest mean absolute error.
# The error for that setting is recomputed here from the training/validation split
# instead of being pasted in as a number, so dt_error_nodes stays correct if the data
# or the library version changes.

best_leaf_nodes = 50

dt_model_tuned = DecisionTreeRegressor(max_leaf_nodes=best_leaf_nodes, random_state=1)
dt_model_tuned.fit(train_X, train_y)
dt_error_nodes = mean_absolute_error(val_y, dt_model_tuned.predict(val_X))

# Refit the chosen configuration on the whole data set to obtain the final decision tree.
dt_model_final = DecisionTreeRegressor(max_leaf_nodes=best_leaf_nodes, random_state=1)
dt_model_final.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=50, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [16]:
#Now we create a Random Forest Regressor and fit the model using the training data which was split before.

In [17]:
# Baseline random forest with default settings, trained on the training split only.
# random_state is fixed so that the result is repeatable.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X=train_X, y=train_y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [24]:
# Predict prices for the validation rows (stored in rf_prediction) and store the
# mean absolute error of those predictions in rf_error_df.
# mean_absolute_error takes its arguments as (y_true, y_pred), so the true prices go first.

rf_prediction = rf_model.predict(val_X)
rf_error_df = mean_absolute_error(val_y, rf_prediction)

In [28]:
# Find the n_estimators value (number of trees) that minimizes the mean absolute error.
# Every candidate forest is trained on the training split and scored on the validation split;
# one "<n_estimators>: <error>" line is printed per candidate so each error can be matched
# to its setting.
# NOTE(review): these forests use random_state=0 while the baseline rf_model uses
# random_state=1, so part of any difference from the baseline may be due to the seed.

no_of_trees = [4, 5, 6, 7]

for n in no_of_trees:
    rf_model_new = RandomForestRegressor(n_estimators=n, random_state=0)
    rf_model_new.fit(train_X, train_y)
    pred = rf_model_new.predict(val_X)
    # Arguments follow the (y_true, y_pred) order of mean_absolute_error.
    print(f"{n}: {mean_absolute_error(val_y, pred)}")

202579.9833988104
199543.6944785675
205941.44517637417
209625.30363276834


In [30]:
# Of the values tried, n_estimators = 5 gave the lowest mean absolute error.
# The error for that setting is recomputed here (same random_state as in the search)
# instead of being pasted in as a number, so rf_error_n stays correct if the data
# or the library version changes.

best_n_estimators = 5

rf_model_tuned = RandomForestRegressor(n_estimators=best_n_estimators, random_state=0)
rf_model_tuned.fit(train_X, train_y)
rf_error_n = mean_absolute_error(val_y, rf_model_tuned.predict(val_X))

In [31]:
# Print the mean absolute error of every model variant, one labeled line per variant.

error_report = [
    ("Mean Absolute Error for Decision Tree Regressor using default values: ", dt_error_default),
    ("Mean Absolute Error for Decision Tree Regressor by setting max_leaf_nodes: ", dt_error_nodes),
    ("Mean Absolute Error for Random Forest Regressor: ", rf_error_df),
    ("Mean Absolute Error for Random Forest Regressor by setting n_estimators: ", rf_error_n),
]

for label, error_value in error_report:
    print(label + str(error_value))

Mean Absolute Error for Decision Tree Regressor using default values: 260120.98067290089
Mean Absolute Error for Decision Tree Regressor by setting max_leaf_nodes: 182662.2768300128
Mean Absolute Error for Random Forest Regressor: 205864.49699757603
Mean Absolute Error for Random Forest Regressor by setting n_estimators: 199543.6944785675


In [32]:
# Conclusion: 
#     In this case the decision tree regressor using default values has the highest mean absolute error,
# followed by the random forest regressor with default values, then the random forest regressor after setting n_estimators. The decision tree regressor after setting max_leaf_nodes has the lowest mean absolute error.