In [32]:
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone joblib package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [33]:
# Read the housing data set from the CSV file next to the notebook
data_path = "ml_house_data_set.csv"
df = pd.read_csv(data_path)

In [34]:
# Preview the loaded frame; the notebook elides the middle rows in the rendered table.
df

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,house_number,street_name,unit_number,city,zip_code,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,42670,Lopez Crossing,,Hallfort,10907,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,5194,Gardner Park,,Hallfort,10907,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,4366,Harding Islands,,Lake Christinaport,11203,2519996.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,3302,Michelle Highway,,Lake Christinaport,11203,197193.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,582,Jacob Cape,,Lake Christinaport,11203,207897.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42698,1982,1,1,1,0,591,627,none,0,200,False,False,True,True,562,Gregory Ford,253.0,Lake Jack,10825,88197.0
42699,1983,1,1,1,0,592,624,none,0,204,False,False,True,True,62028,Gregory Ford,3560.0,Lake Jack,10825,102690.0
42700,1983,1,1,1,0,594,618,none,0,197,False,False,True,True,62028,Gregory Ford,3931.0,Lake Jack,10825,98280.0
42701,1981,1,3,2,0,1398,1401,attached,401,0,False,False,True,True,7456,Garcia View,,Lake Jack,10825,98278.0


In [36]:
# Inspect the house_number column before deciding whether to keep it as a feature
df.loc[:, 'house_number']

0        42670
1         5194
2         4366
3         3302
4          582
         ...  
42698      562
42699    62028
42700    62028
42701     7456
42702     7456
Name: house_number, Length: 42703, dtype: int64

In [20]:
# Remove the fields from the data set that we don't want to include in our model.
# A single drop() that rebinds `df` replaces the four in-place `del` statements:
# with errors='ignore' the cell can be re-run without raising KeyError once the
# columns are already gone.
df = df.drop(
    columns=['house_number', 'unit_number', 'street_name', 'zip_code'],
    errors='ignore',
)

In [21]:
# Expand each categorical column into one indicator column per category
categorical_columns = ['garage_type', 'city']
features_df = pd.get_dummies(df, columns=categorical_columns)

In [22]:
# Remove the sale price (the prediction target) from the feature data.
# drop(..., errors='ignore') rebinds features_df instead of deleting in place,
# so re-running this cell no longer raises KeyError after the column is gone.
features_df = features_df.drop(columns=['sale_price'], errors='ignore')

In [23]:
# Create the X (features) and y (target) arrays.
# as_matrix() is deprecated and was removed in pandas 1.0; to_numpy() is the
# supported way to get the underlying NumPy array.
X = features_df.to_numpy()
y = df['sale_price'].to_numpy()

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
# Hold out 30% of the rows as a test set and train on the remaining 70%.
# random_state=0 fixes the seed so the split is the same on every run.
splits = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = splits

In [25]:
# Configure the gradient-boosted regression model (it is only constructed here;
# fitting is a separate step). Hyperparameters are gathered in one mapping.
gbr_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_samples_leaf=9,
    max_features=0.1,
    loss='huber',
    random_state=0,
)
model = ensemble.GradientBoostingRegressor(**gbr_params)

In [26]:
# Train on the training split; fit() returns the estimator, which the cell echoes
model.fit(X=X_train, y=y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=6,
                          max_features=0.1, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=9, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto', random_state=0,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

In [27]:
# Persist the trained model to disk so other programs can load it.
# NOTE(review): the file name says "classifier" although the model is a
# GradientBoostingRegressor; the name is kept as-is because other programs may
# load this exact path.
model_path = 'trained_house_classifier_model.pkl'
joblib.dump(model, model_path)

['trained_house_classifier_model.pkl']

In [28]:
# Mean absolute error on the rows the model was trained on.
# NOTE(review): the value is stored as `mse` even though it is an MAE, not a
# mean squared error; the name is kept because the reporting cell reads it.
train_predictions = model.predict(X_train)
mse = mean_absolute_error(y_train, train_predictions)

In [29]:
# Report the training-set MAE (held in `mse`) to four decimal places
training_report = "Training Set Mean Absolute Error: %.4f" % mse
print(training_report)

Training Set Mean Absolute Error: 48727.0015


In [30]:
# Mean absolute error on the held-out test rows.
# NOTE(review): this overwrites the same `mse` name used for the training error
# (and the value is an MAE, not a mean squared error); the name is kept because
# the reporting cell reads it.
test_predictions = model.predict(X_test)
mse = mean_absolute_error(y_test, test_predictions)

In [31]:
# Report the test-set MAE (held in `mse`) to four decimal places
test_report = "Test Set Mean Absolute Error: %.4f" % mse
print(test_report)

Test Set Mean Absolute Error: 59225.1333
