In [1]:
'''
Based on: https://github.com/fmfn/BayesianOptimization/blob/master/examples/xgboost_example.py
Computes the best parameters for XGB model optimization
'''

# imports
import pandas as pd
import numpy as np
import os
import gc
import datetime as dt

from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

from slacker import Slacker
import json
import requests
from cloudinary.uploader import upload
from cloudinary.utils import cloudinary_url
from cloudinary.api import delete_resources_by_tag, resources_by_tag

import os
# slack secrets (in your ~/.bashrc)
webhook_url = os.environ.get('SLACK_URL')
slacker = Slacker(os.environ.get('SLACK_TOKEN'))

%load_ext sql
# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd
limit = 100000

In [2]:
def XGBcv(max_depth, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, alpha):
    folds = 5
    paramt = {
        'eta': 0.05,
        'verbose_eval': 1,
        'silent': 0,
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'eval_metric': 'error',
        'updater': 'grow_gpu',
#         'eta': max(eta, 0),
        'max_depth': int(max_depth),
        'alpha': max(alpha, 0),
        'gamma': max(gamma, 0),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'min_child_weight': int(min_child_weight),
        'max_delta_step': int(max_delta_step)
    }
    
    out = xgb.cv(paramt,
           dtrain,
           num_boost_round=3000,
           folds=tscv.split(dtrain),
           callbacks=[xgb.callback.early_stop(50)])
    
    output = out['test-error-mean'].values[-1]
    del out
    gc.collect()
    
    return output

def sanitize(data, zipcode_list = None):
    # abort if the city has no top zipcodes
    if data.empty:
        return 0    
    
    data.drop(['abnormal', 'bookmarked', 'created_at', 'ignore', 'is_latest', 'id', 'closed_diff_id', 'listed_diff_id',
                     'notes', 'source', 'updated_at', 'home_type', 'sfh', 'description', 
                   'event_name', 'neighborhood'], axis=1, inplace=True)
    
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))

    if(data.transaction_type.iloc[0] == 'sales'):
        data = data[ data.price > 50000 ]
    else:
        data = data [ data.price > 500 ]
    
    if(zipcode_list is not None):
        data = data[data.zipcode.isin(zipcode_list)]
        
#     data = data [ (data.price_closed - data.price_listed).abs() < 50000 ]

    slack("Entries after filter: %i" % len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: 0 if x == None else (x - dt.date(2000, 1, 1)).days)
    data['date_closed'] = data['date_closed'].astype(int)
    
    # convert the area name into dummy variables
    dm = pd.get_dummies(data[['area_name', 'zipcode']], prefix=['area_name','zipcode'])
    data = pd.concat([data, dm], axis=1)
    del dm
    
    return data

def slack(text, url = None, title = None):
    print("Slacking: " + text)
    
    if url == None:
        data=json.dumps({"text": text})
    else:
        data = json.dumps( { "text": text, "attachments": [ { "fallback": "Model MAE"
                                           , "title": title
                                           , "image_url": url } ] } )
    
    response = requests.post(webhook_url, data , headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        raise ValueError('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text))


In [3]:
query = %sql (\
    SELECT \
    area_name_zipcodes.area_name, \
    properties.*, \
    property_transaction_logs.id as 'transaction_id', \
    property_transaction_logs.* \
    FROM  \
    property_transaction_logs \
    LEFT JOIN \
    properties on properties.id = property_transaction_logs.`property_id`  \
    LEFT JOIN \
    area_name_zipcodes on properties.zipcode = area_name_zipcodes.zipcode \
    where \
    home_type = 'sfh' AND \
    properties.sqft between 1 and 10000 and \
    ( abnormal = false OR abnormal IS NULL OR abnormal = 0 ) and \
    property_transaction_logs.price between 500 and 400000 and \
    properties.bedrooms <= 6 and \
    properties.bathrooms <= 6 and \
    transaction_type = 'sales' and  \
    date_closed is not null and \
    price_closed is not null and \
    days_on_market is not null and \
    transaction_status = 'closed' \
    ORDER BY property_transaction_logs.date_closed DESC \
    LIMIT :limit )


df = query.DataFrame()

100000 rows affected.


In [4]:
df.set_index('property_id', inplace=True)
df.index.name = 'property_id'

In [5]:
init_values = {'target': [ 0.05,  0.05], 'alpha': [ 8.97,  9.99], 'colsample_bytree': [ 0.35,  0.26], 'gamma': [ 9.37,  6.42], 'max_delta_step': [ 0.09,  2.86], 'max_depth': [ 14.6,  10. ], 'min_child_weight': [ 19.96,   6.34], 'subsample': [ 0.8,  0.8]}

In [6]:
print(df.columns)
df = sanitize(df)

Index(['area_name', 'id', 'address', 'neighborhood', 'bedrooms', 'bathrooms',
       'sqft', 'source', 'origin_url', 'created_at', 'updated_at', 'latitude',
       'longitude', 'elevation', 'lookup_address', 'luxurious', 'garage',
       'year_built', 'level', 'dist_to_park', 'sfh', 'dist_to_golf_course',
       'zipcode', 'near_golf_course', 'home_type', 'has_pool', 'bookmarked',
       'notes', 'hoa_fees', 'lot', 'zestimate_rent', 'zestimate_sale', 'saves',
       'event_name', 'construction', 'adult', 'description', 'rooms',
       'stories', 'images', 'transaction_id', 'id', 'price',
       'transaction_status', 'date_listed', 'date_closed', 'days_on_market',
       'created_at', 'updated_at', 'transaction_type', 'is_latest', 'abnormal',
       'ignore', 'closed_diff_id', 'listed_diff_id', 'price_listed',
       'price_closed', 'date_transacted_latest'],
      dtype='object')
Entries before filter:  100000
Slacking: Entries after filter: 92087


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [7]:
df.columns.values

array(['area_name', 'address', 'bedrooms', 'bathrooms', 'sqft',
       'origin_url', 'latitude', 'longitude', 'elevation',
       'lookup_address', 'luxurious', 'garage', 'year_built', 'level',
       'dist_to_park', 'dist_to_golf_course', 'zipcode',
       'near_golf_course', 'has_pool', 'hoa_fees', 'lot', 'zestimate_rent',
       'zestimate_sale', 'saves', 'construction', 'adult', 'rooms',
       'stories', 'images', 'transaction_id', 'price',
       'transaction_status', 'date_listed', 'date_closed',
       'days_on_market', 'transaction_type', 'price_listed',
       'price_closed', 'date_transacted_latest', 'area_name_BAY_AREA',
       'area_name_DENVER', 'area_name_DETROIT', 'area_name_HOUSTON',
       'area_name_PH', 'area_name_PORTLAND', 'area_name_SEATTLE',
       'area_name_SF', 'area_name_ST_LOUIS', 'area_name_TUSCON',
       'area_name_VEGAS', 'zipcode_07924', 'zipcode_15768',
       'zipcode_31406', 'zipcode_32055', 'zipcode_33186', 'zipcode_35077',
       'zipcode_36541', 

In [8]:
params = {       'max_depth': (5, 15),
                 'gamma': (0.0, 20.0),
                 'min_child_weight': (1, 40),
                 'max_delta_step': (0, 10),
                 'subsample': (0.1, 5.0),
                 'colsample_bytree' :(0.01, 1.0),
                 'alpha': (0, 20)
               }

XGB_BOpt = BayesianOptimization(XGBcv, params)
XGB_BOpt.initialize(init_values)

discount = .10
df['good_sell'] = (df.price_closed >= (df.price_listed * (1 - discount )))

In [None]:
cols = df.columns
# ind2remove = ['Unnamed: 0', 'address', 'area_name', 'date_listed', 'id', 'listed_diff_id', 'lookup_address',
#               'origin_url', 'neighborhood', 'zipcode', 'luxurious', 'transaction_status', 'transaction_type',
#               'zestimate_sale']
ind2remove = ['area_name', 'address', 'origin_url', 'lookup_address', 'zipcode', 'zestimate_rent', 'date_listed', 'date_closed', 
              'zestimate_sale', 'construction', 'stories', 'transaction_id', 'transaction_status', 'good_sell', 
              'transaction_type', 'price_closed', 'date_transacted_latest', 'updated_at','notes','price']

factors = np.setdiff1d(cols, ind2remove)
print(factors)

['adult' 'area_name_BAY_AREA' 'area_name_DENVER' 'area_name_DETROIT'
 'area_name_HOUSTON' 'area_name_PH' 'area_name_PORTLAND'
 'area_name_SEATTLE' 'area_name_SF' 'area_name_ST_LOUIS' 'area_name_TUSCON'
 'area_name_VEGAS' 'bathrooms' 'bedrooms' 'days_on_market'
 'dist_to_golf_course' 'dist_to_park' 'elevation' 'garage' 'has_pool'
 'hoa_fees' 'images' 'latitude' 'level' 'longitude' 'lot' 'luxurious'
 'near_golf_course' 'price_listed' 'rooms' 'saves' 'sqft' 'year_built'
 'zipcode_07924' 'zipcode_15768' 'zipcode_31406' 'zipcode_32055'
 'zipcode_33186' 'zipcode_35077' 'zipcode_36541' 'zipcode_36582'
 'zipcode_36693' 'zipcode_36695' 'zipcode_47371' 'zipcode_48021'
 'zipcode_48030' 'zipcode_48075' 'zipcode_48089' 'zipcode_48101'
 'zipcode_48122' 'zipcode_48124' 'zipcode_48126' 'zipcode_48127'
 'zipcode_48128' 'zipcode_48146' 'zipcode_48152' 'zipcode_48201'
 'zipcode_48202' 'zipcode_48203' 'zipcode_48204' 'zipcode_48205'
 'zipcode_48206' 'zipcode_48207' 'zipcode_48208' 'zipcode_48209'
 'zipcod

In [None]:
dtrain = xgb.DMatrix(df[factors].values, label=df.good_sell, feature_names=factors)
tscv = TimeSeriesSplit(n_splits=5)

# per link below i need to use Upper Confidence Bound and add some alpha (square of stdev), otherwise it starts to loop
# https://github.com/fmfn/BayesianOptimization/issues/10 
XGB_BOpt.maximize(init_points=5, n_iter=100, acq='ucb', kappa=50)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_delta_step |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	train-error:0.0603233+7.54247e-06	test-error:0.0603237+1.46135e-05

    1 | 00m18s | [35m   0.06032[0m | [32m  14.4479[0m | [32m            0.0850[0m | [32m   5.2991[0m | [32m          6.2809[0m | [32m     5.2320[0m | [32m           33.7491[0m | [32m     2.8862[0m | 
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 50 rounds.
Stopping. Best iteration:
[227]	train-error:0.048992+6.79412e-05	test-error:

In [None]:
# not used - reset the variable
#new_init = { 'target': [], 'alpha': [], 'colsample_bytree': [], 'gamma': [], 'max_delta_step': [], 'max_depth': [], 'min_child_weight': [], 'subsample': [] }
new_init = init_values

# store resulting values to help seed the next run. make sure not to overwrite but add incrementally
# copy paste the print out of init_values into the cell above
for i in range(len(XGB_BOpt.res['all']['values'])):
    new_init['target'].append(XGB_BOpt.res['all']['values'][i])
    for k,v in XGB_BOpt.res['all']['params'][i].items():
        new_init[k].append(np.round(v,decimals=2).values)
    
print (new_init)

In [None]:
%matplotlib inline
print(XGB_BOpt.res['max'])
(pd.DataFrame(XGB_BOpt.res['all']['values'])*-1.0).plot()

In [None]:
import json
import requests
from cloudinary.uploader import upload
from cloudinary.utils import cloudinary_url
from cloudinary.api import delete_resources_by_tag, resources_by_tag

def plot_rounds(plot):
    # uploads the graph to the web and returns the URL
    
    fig = plot.get_figure()
    fig.savefig('temp_plot.png')
    
    response = upload("temp_plot.png")
    url, options = cloudinary_url(response['public_id'],
        format = response['format'],
        crop = "fill")
    return url

def slack(text, url = None):
    print("Slacking: " + text)
    
    if url == None:
        data=json.dumps({"text": text})
    else:
        data = json.dumps( { "text": text, "attachments": [ { "fallback": "Model MAE"
                                           , "title": "Model Mean Average Error by Iteration ($)"
                                           , "image_url": url } ] } )
    
    response = requests.post(webhook_url, data , headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        raise ValueError('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text))

In [None]:
result = pd.DataFrame(XGB_BOpt.res['all']['params'])
error = pd.Series(XGB_BOpt.res['all']['values']) * -1
error.name = 'test-error'
result = pd.concat([error, result], axis=1)
result.head(25)

url = plot_rounds(error.plot())
slack("Bayesian Search: Max params %s" % XGB_BOpt.res['max'], url)

file = 'ALL-bayesian-parameters.csv'
result.to_csv(file)
slacker.files.upload(file, channels='#progress')