In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

### Import Dataset

In [2]:
# Read the Melbourne housing snapshot CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/melbourne-housing-snapshot/melb_data.csv')
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


**Data Information**

In [3]:
# Print each column's non-null count and dtype to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

**Data Statistics**

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the YearBuilt minimum of 1196 in the output looks like a data-entry error — confirm.
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [5]:
# List the column labels so the feature/target columns can be picked by name.
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

**Features and Target**

In [6]:
# Keep only the numeric predictor columns plus the "Price" target column.
df = df.loc[:, ["Rooms", "Distance", "Bedroom2", "Bathroom", "Car",
                "Landsize", "BuildingArea", "YearBuilt",
                "Lattitude", "Longtitude",
                "Price"]]

In [7]:
# Discard every row that has at least one missing value, then re-check the schema.
df = df.dropna(axis="index", how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6830 entries, 1 to 13579
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rooms         6830 non-null   int64  
 1   Distance      6830 non-null   float64
 2   Bedroom2      6830 non-null   float64
 3   Bathroom      6830 non-null   float64
 4   Car           6830 non-null   float64
 5   Landsize      6830 non-null   float64
 6   BuildingArea  6830 non-null   float64
 7   YearBuilt     6830 non-null   float64
 8   Lattitude     6830 non-null   float64
 9   Longtitude    6830 non-null   float64
 10  Price         6830 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 640.3 KB


In [8]:
# Target: the "Price" column.
y = df["Price"]

# Features: every column except "Price".
X = df.drop(columns="Price")

### Decision Tree Regression

In [9]:
# Fit a decision tree on the full dataset and predict on the very same rows.
# These are in-sample predictions, so they do not show how the model generalises.
dtr = DecisionTreeRegressor(random_state = 6830)

dtr.fit(X, y)

pred_dtr = dtr.predict(X)

# The prediction column is labelled "Predicted", matching the later comparison tables.
# random_state pins the 10-row preview so the displayed table is the same on every run.
pd.DataFrame({"Actual": y,
              "Predicted": pred_dtr}).sample(10, random_state = 0)

Unnamed: 0,Actual,Peredicted
12652,1900000.0,1900000.0
11079,1570000.0,1570000.0
7321,640000.0,640000.0
4072,550000.0,550000.0
7550,1000000.0,1000000.0
11925,640000.0,640000.0
949,1020000.0,1020000.0
878,1690000.0,1690000.0
4645,590000.0,590000.0
131,1830000.0,1830000.0


**Mean Absolute Error (Training Data)**

In [10]:
# Score the tree's predictions for X against the true prices y.
mae = mean_absolute_error(y_true = y, y_pred = pred_dtr)
mae

392.31332357247436

### Model Validation

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 600
)

In [12]:
# Refit the tree on the training split only and predict the held-out test rows.
dtr.fit(X_train, y_train)

pred_dtr_t = dtr.predict(X_test)

# random_state pins the 10-row preview so the displayed table is the same on every run.
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_dtr_t}).sample(10, random_state = 0)

Unnamed: 0,Actual,Predicted
1561,1650000.0,1725000.0
7651,910000.0,910000.0
12098,910000.0,1210000.0
12993,931000.0,709000.0
6621,936000.0,915000.0
10596,386000.0,460000.0
11800,460000.0,360000.0
5219,420000.0,270000.0
712,2435000.0,1620000.0
11876,580000.0,600000.0


**Mean Absolute Error (Validation)**

In [13]:
# Score the decision tree's test-set predictions against the held-out prices.
mae_t = mean_absolute_error(y_true = y_test, y_pred = pred_dtr_t)
mae_t

232676.47218155197

### Underfitting and Overfitting

In [14]:
# Sweep the max_leaf_nodes cap and record the test-set MAE for each setting,
# in the same order as the candidate values.
mln = []

for x in (5, 25, 50, 100, 250, 500):
    # fit() returns the estimator itself, so construction and training can be chained.
    dt = DecisionTreeRegressor(random_state = 420, max_leaf_nodes = x).fit(X_train, y_train)
    pred_dt = dt.predict(X_test)
    mae_ = mean_absolute_error(y_true = y_test, y_pred = pred_dt)
    mln += [mae_]
mln

[327535.47287957754,
 245063.0555114401,
 237252.0108789654,
 225969.12750214303,
 218882.16819337526,
 218934.63550640174]

### Random Forest

In [15]:
# Train a random forest on the training split and predict the held-out test rows.
rf = RandomForestRegressor(random_state = 313)

rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

# random_state pins the 10-row preview so the displayed table is the same on every run.
pd.DataFrame({"Actual": y_test,
              "Predicted": pred_rf}).sample(10, random_state = 0)


Unnamed: 0,Actual,Predicted
10741,373500.0,411955.0
2286,510000.0,561303.4
5025,840000.0,909405.0
6563,580000.0,603485.0
1666,3145000.0,2898065.0
5297,1275000.0,1631252.0
8747,1731000.0,1768410.0
10885,1225000.0,852505.0
5785,616000.0,1026865.0
10563,595000.0,919765.0


**Mean Absolute Error (Random Forest)**

In [16]:
# Score the random forest's test-set predictions against the held-out prices.
mae_rf = mean_absolute_error(y_true = y_test, y_pred = pred_rf)
mae_rf

159333.66556857

**On the test set, the mean absolute error of the DECISION TREE is about 233 thousand, while the RANDOM FOREST reduces it to approximately 159 thousand.**