In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

import joblib
from joblib import dump, load

In [2]:
df = pd.read_csv("ml_house_data_set.csv")

In [3]:
df.columns

Index(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms',
       'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_type',
       'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool',
       'has_central_heating', 'has_central_cooling', 'house_number',
       'street_name', 'unit_number', 'city', 'zip_code', 'sale_price'],
      dtype='object')

In [4]:
# Removing the fields not needed
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

In [5]:
df.columns

Index(['year_built', 'stories', 'num_bedrooms', 'full_bathrooms',
       'half_bathrooms', 'livable_sqft', 'total_sqft', 'garage_type',
       'garage_sqft', 'carport_sqft', 'has_fireplace', 'has_pool',
       'has_central_heating', 'has_central_cooling', 'city', 'sale_price'],
      dtype='object')

In [6]:
# Replacing Categorical data with one-hot encoded data
features_df = pd.get_dummies(df, columns=['garage_type', 'city'])

In [7]:
# Making an matrix of X and y
X = features_df.values
y = features_df['sale_price'].values

In [8]:
y

array([ 270897.,  302404., 2519996., ...,   98280.,   98278.,  186480.])

In [9]:
## Split the data into training set (70%) and test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [10]:
# Fit the regression model (Value Prediction)
model = ensemble.GradientBoostingRegressor(
        n_estimators= 1000, # How many decision trees to build
        learning_rate=0.1, # How much each descision tree influence prediction 
        max_depth=6, # How many layers deep 
        min_samples_leaf=9, # At least 9 houses should exhibit same properties or how many times a value should appear (this is helpful in eliminating outliers)
        max_features=0.1, # % of features we randomly choose to build model
        loss='huber', # calucluating model's error rate or cost function 
        random_state = 0 
)

model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=6, max_features=0.1,
                          min_samples_leaf=9, n_estimators=1000,
                          random_state=0)

In [11]:
# Saving the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

['trained_house_classifier_model.pkl']

In [12]:
# Find the error rate on the training set (On average how wrong is our prediction model)
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

Training Set Mean Absolute Error: 7212.7038


In [13]:
# Find the error rate on the test set 
mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error:  %.4f" % mse)

Test Set Mean Absolute Error:  10272.3201
