In [86]:
#Initializing
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn


from sklearn import linear_model
from sklearn import neighbors
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Silence FutureWarning messages for the rest of the notebook so they do not
# clutter the cell outputs.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Data source:
# https://www.kaggle.com/anthonypino/melbourne-housing-market

# Load the raw Melbourne housing listings into `df`.
# NOTE: this is an absolute, machine-specific path; adjust it to run elsewhere.
DATA_PATH = '/Users/Kevin/Files/Thinkful/Data Files/Melbourne_housing_FULL.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Description and codes

- Suburb: Suburb

- Address: Address 

- Rooms: Number of rooms

- Type: 
    - br - bedroom(s);
    - h - house,cottage,villa, semi,terrace;
    - u - unit, duplex;
    - t - townhouse;
    - dev site - development site;
    - o res - other residential.
    
- Price: Price in Australian dollars

- Method: 
    - S - property sold;
    - SP - property sold prior;
    - PI - property passed in;
    - PN - sold prior not disclosed;
    - SN - sold not disclosed;
    - NB - no bid;
    - VB - vendor bid;
    - W - withdrawn prior to auction;
    - SA - sold after auction;
    - SS - sold after auction price not disclosed.
    - N/A - price or highest bid not available.
    
- SellerG: Real Estate Agent

- Date: Date sold

- Distance: Distance from CBD in Kilometres

- Regionname: General Region (West, North West, North, North east ...etc)

- Propertycount: Number of properties that exist in the suburb.

- Bedroom2 : Scraped # of Bedrooms (from different source)

- Bathroom: Number of Bathrooms

- Car: Number of carspots

- Landsize: Land size in square metres

- BuildingArea: Building size in square metres

- YearBuilt: Year the house was built

- CouncilArea: Governing council for the area

- Lattitude: Latitude (the column name is misspelled in the dataset)

- Longtitude: Longitude (the column name is misspelled in the dataset)


In [87]:
# Summarize each column's dtype and non-null count to spot missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
Suburb           34857 non-null object
Address          34857 non-null object
Rooms            34857 non-null int64
Type             34857 non-null object
Price            27247 non-null float64
Method           34857 non-null object
SellerG          34857 non-null object
Date             34857 non-null object
Distance         34856 non-null float64
Postcode         34856 non-null float64
Bedroom2         26640 non-null float64
Bathroom         26631 non-null float64
Car              26129 non-null float64
Landsize         23047 non-null float64
BuildingArea     13742 non-null float64
YearBuilt        15551 non-null float64
CouncilArea      34854 non-null object
Lattitude        26881 non-null float64
Longtitude       26881 non-null float64
Regionname       34854 non-null object
Propertycount    34854 non-null float64
dtypes: float64(12), int64(1), object(8)
memory usage: 5.6+ M

In [88]:
# Count the unique values in each text column to judge whether one-hot
# encoding with `get_dummies` is practical for this data.
categorical = df.select_dtypes(include='object')
for name, values in categorical.items():
    # Column name on one line, its distinct-value count on the next
    print(name, values.nunique(), sep='\n')

Suburb
351
Address
34009
Type
3
Method
9
SellerG
388
Date
78
CouncilArea
33
Regionname
8


In [89]:
# Missing values per column, showing only the columns that have any
null_count = df.isna().sum()
null_count.loc[null_count > 0]

Price             7610
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64

In [90]:
# Columns that have no missing values at all
null_count[null_count==0]

Suburb     0
Address    0
Rooms      0
Type       0
Method     0
SellerG    0
Date       0
dtype: int64

In [91]:
# Drop every row that has a missing value in any column.
# BuildingArea alone is non-null in only 13,742 of the 34,857 rows, so at most
# ~39% of the listings survive this step.
df1 = df.dropna(axis=0)

In [92]:
#replacing NaN values with 0s
#df1 = df.fillna(0)

In [93]:
# Sanity check: no column of the cleaned frame should still contain nulls
# (an empty Series is the expected result)
null_count = df1.isna().sum()
null_count.loc[null_count.gt(0)]

Series([], dtype: int64)

In [94]:
# Build the feature matrix X and the target Y (sale price).
# Address is dropped: it is nearly unique per row (34,009 distinct values in
# 34,857 rows), so one-hot encoding it would add roughly one column per
# listing without giving the model anything to generalize from.
# `columns=` is used instead of a positional axis argument (`drop(labels, 1)`),
# which pandas deprecated in 1.3 and rejects from 2.0 onwards.
X = df1.drop(columns=['Price', 'Address'])
Y = df1['Price']


In [95]:
# One-hot encode the remaining text columns (Suburb, Type, Method, SellerG,
# Date, CouncilArea, Regionname) so the model receives only numeric inputs.
X = pd.get_dummies(X)

# Random Forest Model

In [96]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [97]:
# Split dataset into training set and test set
from sklearn.model_selection import train_test_split

# 70% training and 30% test. The split is seeded so that re-running the
# notebook reproduces the same partition.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [98]:
# Blind (baseline) model: a default random forest on all the features,
# scored with 10-fold cross-validation. With no `scoring` argument the
# regressor's own score (R^2) is reported for each fold.
rfr = ensemble.RandomForestRegressor()
# Score the estimator created above; `rfc` is not defined anywhere in this
# notebook and raises NameError on a fresh kernel.
cross_val_score(rfr, X, Y, cv=10)

array([0.78997921, 0.81636605, 0.68903546, 0.76945006, 0.77109153,
       0.83133097, 0.67383469, 0.7744019 , 0.84180653, 0.80391116])

In [105]:
# Fit the forest on the full dataset so its feature importances can be read
rfr.fit(X,Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [100]:
# Table of per-feature importances from the fitted forest, labelled by
# feature name and ordered from most to least important.
importance_table = pd.DataFrame(
    {'importance': rfr.feature_importances_},
    index=X.columns,
)
feature_importances = importance_table.sort_values(by='importance', ascending=False)

In [102]:
# Show the 20 most important features
print(feature_importances.head(20))

                                  importance
BuildingArea                        0.347193
Regionname_Southern Metropolitan    0.170340
YearBuilt                           0.116454
Distance                            0.072319
Landsize                            0.055417
Lattitude                           0.044593
Longtitude                          0.037282
SellerG_Hall                        0.013116
Bathroom                            0.012109
Postcode                            0.010659
Rooms                               0.009259
Propertycount                       0.008945
Type_u                              0.007326
Type_h                              0.007058
Bedroom2                            0.006949
Car                                 0.005882
Suburb_Middle Park                  0.004951
Regionname_Eastern Metropolitan     0.003072
CouncilArea_Yarra City Council      0.002222
Method_S                            0.002010


After creating a random forest model, the most important features within the model were:
    - Building Area
    - Region name: Southern Metropolitan 
    - Year built
    - Distance 
    - Land size 
    
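Together, the five features above account for roughly 76% of the model's total feature importance (0.347 + 0.170 + 0.116 + 0.072 + 0.055 ≈ 0.76). Feature importance measures how much a feature contributes to the predictions, not the direction of its effect, so the preferences below are hypotheses to confirm (e.g. with partial dependence plots) rather than firm conclusions. If we were a real estate developer, we would want:
    - A large/spacious house (Building Area)
    - In the Southern Metropolitan area
    - Newly built (Year Built)
    - Close proximity to the CBD (Distance)
    - Large land (Land size)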
    

In [77]:
# Grid search over tree depth and forest size, using 5-fold CV with
# negative mean squared error as the selection metric, on all cores.
param_grid = {
    'max_depth': range(3, 7),
    'n_estimators': (10, 50, 100, 1000),
}
gsc = GridSearchCV(
    RandomForestRegressor(),
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=0,
)

# `fit` returns the fitted search object, so `grid_result` is `gsc` itself
grid_result = gsc.fit(X, Y)
best_params = grid_result.best_params_

In [78]:
# Show the hyper-parameter combination selected by the grid search
print(best_params)

{'max_depth': 6, 'n_estimators': 1000}


In [79]:
# Rebuild the forest with the best parameters found by the grid search.
# random_state=0 fixes the seed; verbose=0 keeps fitting silent.
tuned_settings = {
    'max_depth': best_params["max_depth"],
    'n_estimators': best_params["n_estimators"],
}
rfr = RandomForestRegressor(random_state=0, verbose=0, **tuned_settings)

# Score the tuned model with 2-fold cross-validation
scores = cross_val_score(rfr, X, Y, cv=2)

print(scores)

[0.73966082 0.66123871]
