In [1]:
import pandas
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the data set
df = pandas.read_csv("data//ml_house_data_set.csv")

In [3]:
# Remove the fields from the data set that we don't want to include in our model
del df['house_number']
del df['unit_number']
del df['street_name']
del df['zip_code']

In [4]:
# Replace categorical data with one-hot encoded data
features_df = pandas.get_dummies(df, columns=['garage_type', 'city'])
del features_df['sale_price']

In [5]:
X = features_df.values
y = df['sale_price'].values

In [6]:
# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
# Create the model
model = ensemble.GradientBoostingRegressor()

In [8]:
# Parameters we want to try
param_grid = {
    'n_estimators': [500, 1000, 3000],
    'max_depth': [4, 6],
    'min_samples_leaf': [3, 5, 9, 17],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_features': [1.0, 0.3, 0.1],
    'loss': ['ls', 'lad', 'huber']
}

In [11]:
# Define the grid search we want to run. Run it with four cpus in parallel.
gs_cv = GridSearchCV(model, param_grid, n_jobs=4, verbose=100)

In [None]:
# Run the grid search - on only the training data!
gs_cv.fit(X_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
Pickling array (shape=(29892, 63), dtype=object).
Pickling array (shape=(29892,), dtype=float64).
Pickling array (shape=(19928,), dtype=int64).
Pickling array (shape=(9964,), dtype=int64).
Pickling array (shape=(29892, 63), dtype=object).
Pickling array (shape=(29892,), dtype=float64).
Pickling array (shape=(19928,), dtype=int64).
Pickling array (shape=(9964,), dtype=int64).
Pickling array (shape=(29892, 63), dtype=object).
Pickling array (shape=(29892,), dtype=float64).
Pickling array (shape=(19928,), dtype=int64).
Pickling array (shape=(9964,), dtype=int64).
Pickling array (shape=(29892, 63), dtype=object).
Pickling array (shape=(29892,), dtype=float64).
Pickling array (shape=(19928,), dtype=int64).
Pickling array (shape=(9964,), dtype=int64).
Pickling array (shape=(29892, 63), dtype=object).
Pickling array (shape=(29892,), dtype=float64).
Picklin

In [None]:
# Print the parameters that gave us the best result!
print(gs_cv.best_params_)

In [None]:
# Find the error rate on the training set using the best parameters
mse = mean_absolute_error(y_train, gs_cv.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

In [None]:
# Find the error rate on the test set using the best parameters
mse = mean_absolute_error(y_test, gs_cv.predict(X_test))
print("Test Set Mean Absolute Error: %.4f" % mse)