In [155]:
%load_ext sql
%sql mysql://prod:nerd@52.2.153.189/rental_nerd

# imports
import pandas as pd
import numpy as np
import datetime as dt
import timeit  # for timing models
import contextlib
from sklearn import metrics
from sklearn import cross_validation as cv
import matplotlib.pyplot as plt

import os
import gc

from slacker import Slacker
import json
import requests
from cloudinary.uploader import upload
from cloudinary.utils import cloudinary_url
from cloudinary.api import delete_resources_by_tag, resources_by_tag


# this allows plots to appear directly in the notebook
%matplotlib inline

import xgboost as xgb

# today's date for output filenames
today = dt.date.today()

# where to save the xgb models - they get huge so keep them out of any git path
model_path = '/home/ilya/rentalnerd-models/'


# booster parameters
param = {'verbose': 0,
         'silent': 0,
         'objective':'binary:logistic',
         'booster': 'gbtree',
         'eval_metric':'error', 
         'updater': 'grow_gpu',
         'eta': 0.1, # not tuned, learning rate with default of 0.3
         'max_depth': 10  # all of the following parameters are __tuned__ so do not change them
#          'alpha': 2.6456,
#          'gamma': 6.4589, 
#          'subsample': 0.9893,
#          'colsample_bytree': 0.6759,
#          'min_child_weight': 16,
#          'max_delta_step': 0
        }

num_round = 5000 # pick a high number - XGB will abort as soon as accuracy drops in the testing set

import os
# slack secrets (in your ~/.bashrc)
webhook_url = os.environ.get('SLACK_URL')
slacker = Slacker(os.environ.get('SLACK_TOKEN'))

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [156]:
def sanitize(data, zipcode_list = None):
    # abort if the city has no top zipcodes
    if data.empty:
        return 0    
    
    data.drop(['abnormal', 'bookmarked', 'created_at', 'ignore', 'is_latest', 'closed_diff_id', 'id', 'listed_diff_id',
                      'notes', 'source', 'updated_at', 'home_type', 'sfh', 'description', 
                    'event_name', 'neighborhood'], axis=1, inplace=True)
    
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))

    if(data.transaction_type.iloc[0] == 'sales'):
        data = data[ data.price > 50000 ]
    else:
        data = data [ data.price > 500 ]
    
    if(zipcode_list is not None):
        data = data[data.zipcode.isin(zipcode_list)]
        
#     data = data [ (data.price_closed - data.price_listed).abs() < 50000 ]

    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: 0 if x == None else (x - dt.date(2000, 1, 1)).days)
    data['date_closed'] = data['date_closed'].astype(int)
    
    # convert the area name into dummy variables
    dm = pd.get_dummies(data[['area_name', 'zipcode']], prefix=['area_name','zipcode'])
    data = pd.concat([data, dm], axis=1)
    del dm
    
    return data

 
    
@contextlib.contextmanager
def capture():
    import sys
    from io import StringIO
    olderr, oldout = sys.stderr, sys.stdout
    try:
        out = [StringIO(), StringIO()]
        sys.stderr, sys.stdout = out
        yield out
    finally:
        sys.stderr, sys.stdout = olderr, oldout
        out[0] = out[0].getvalue().splitlines()
        out[1] = out[1].getvalue().splitlines()

def parse_rounds(result):
    import re
    pattern = re.compile(r'^\[(?P<round>\d+)\]\t*(?P<a>\D+):(?P<tmae>\-?\d+.\d+)\t*(?P<b>\D+):(?P<vmae>\-?\d+.\d+)')
    xgb_list = []
    once = True
    for line in (result):
        match = pattern.match(line)
        if match:
            idx = int(match.group("round"))
            tmae = float(match.group("tmae"))
            vmae = float(match.group("vmae"))
            xgb_list.append([idx, tmae, vmae])
            # grab the column names: we'd like to do this only once
            if once:
                a = str(match.group("a"))
                b = str(match.group("b"))
                once = False
        else:
            pass

    learning_curve = pd.DataFrame(xgb_list)
    learning_curve.columns = ['round',a,b]
    learning_curve.set_index('round',inplace=True)
    return learning_curve

def plot_rounds(plot):
    # uploads the graph to the web and returns the URL
    
    fig = plot.get_figure()
    fig.savefig('temp_plot.png')
    
    response = upload("temp_plot.png")
    url, options = cloudinary_url(response['public_id'],
        format = response['format'],
        crop = "fill")
    return url

def slack(text, url = None, title = None):
    print("Slacking: " + text)
    
    if url == None:
        data=json.dumps({"text": text})
    else:
        data = json.dumps( { "text": text, "attachments": [ { "fallback": "Model MAE"
                                           , "title": title
                                           , "image_url": url } ] } )
    
    response = requests.post(webhook_url, data , headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        raise ValueError('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text))

        
def output_model_metrics( x, ypred, y_known, t ):
    #Print model report:
    mae = metrics.mean_absolute_error(y_known, ypred)
    r2 = metrics.explained_variance_score(y_known, ypred)
  
    slack("%s: Model Report:\t%s \t n:\t%i \t\t MAE Score:\t%f \t\t R^2:\t%f" % (city, t, len(y_known), mae, r2))



In [174]:
limit = 100000

In [157]:
query = %sql (\
    SELECT \
    area_name_zipcodes.area_name, \
    properties.*, \
    property_transaction_logs.* \
    FROM  \
    property_transaction_logs \
    LEFT JOIN \
    properties on properties.id = property_transaction_logs.`property_id`  \
    LEFT JOIN \
    area_name_zipcodes on properties.zipcode = area_name_zipcodes.zipcode \
    where \
    date_listed IS NOT NULL AND \
    date_closed IS NOT NULL AND \
    property_transaction_logs.price_closed > 0 AND \
    property_transaction_logs.price_listed > 0 AND \
    abnormal = false AND \
    transaction_type = 'sales' AND \
    transaction_status = 'closed' AND \
    property_transaction_logs.date_closed < "2017-12-02 13:35:36" AND \
    property_transaction_logs.days_on_market < 150 \
    ORDER BY property_transaction_logs.date_closed DESC \
    LIMIT :limit )


closed = query.DataFrame()

851 rows affected.


In [201]:
query = %sql (\
    SELECT \
    area_name_zipcodes.area_name, \
    properties.*, \
    property_transaction_logs.* \
    FROM  \
    property_transaction_logs \
    LEFT JOIN \
    properties on properties.id = property_transaction_logs.`property_id`  \
    LEFT JOIN \
    area_name_zipcodes on properties.zipcode = area_name_zipcodes.zipcode \
    where \
    date_listed > "2017-02-02" AND \
    is_latest = 1 AND \
    abnormal = false AND \
    transaction_type = 'sales' AND \
    transaction_status = 'open' AND \
    area_name = 'PH' \
    LIMIT :limit )


for_sale = query.DataFrame()

2007 rows affected.


In [202]:
q = pd.concat([closed,for_sale])

In [203]:
print(q.columns)
q = sanitize(q)

Index(['area_name', 'id', 'address', 'neighborhood', 'bedrooms', 'bathrooms',
       'sqft', 'source', 'origin_url', 'created_at', 'updated_at', 'latitude',
       'longitude', 'elevation', 'lookup_address', 'luxurious', 'garage',
       'year_built', 'level', 'dist_to_park', 'sfh', 'dist_to_golf_course',
       'zipcode', 'near_golf_course', 'home_type', 'has_pool', 'bookmarked',
       'notes', 'hoa_fees', 'lot', 'zestimate_rent', 'zestimate_sale', 'saves',
       'event_name', 'construction', 'adult', 'description', 'rooms',
       'stories', 'images', 'id', 'price', 'transaction_status', 'date_listed',
       'date_closed', 'days_on_market', 'created_at', 'updated_at',
       'property_id', 'transaction_type', 'is_latest', 'abnormal', 'ignore',
       'closed_diff_id', 'listed_diff_id', 'price_listed', 'price_closed'],
      dtype='object')
Entries before filter:  2858
Entries after filter:  2806


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [204]:
for_sale = q[q.transaction_status == 'open']
sold = q[q.transaction_status == 'closed']


In [205]:
sold['good_sell'] = (sold.price_closed >= sold.price_listed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [206]:
sold[['good_sell','price_listed','price_closed','days_on_market']].head(20)

Unnamed: 0,good_sell,price_listed,price_closed,days_on_market
0,False,409999.0,394000,115.0
1,True,295000.0,358118,17.0
2,True,176900.0,176900,56.0
3,True,304425.0,304425,0.0
4,True,1199000.0,1199000,144.0
5,True,649978.0,649978,129.0
6,True,549999.0,549999,106.0
7,True,95000.0,95000,50.0
8,False,199999.0,184100,4.0
9,True,518000.0,518000,0.0


In [207]:
for_sale[['price','price_closed','days_on_market','transaction_status','property_id']].head(20)

Unnamed: 0,price,price_closed,days_on_market,transaction_status,property_id
0,268800,,,open,1111159
1,351990,,,open,7397651
2,419000,,,open,1037777
3,393260,,,open,1040358
4,399900,,,open,975748
5,571990,,,open,7397727
6,435900,,,open,7397740
7,525000,,,open,31555
8,650000,,,open,1847649
9,929900,,,open,7397874


In [208]:
ind2remove = ['Unnamed: 0', 'address', 'area_name', 'date_listed', 'id', 'listed_diff_id', 'lookup_address',
              'origin_url', 'neighborhood', 'zipcode', 'luxurious', 'transaction_status', 'transaction_type',
              'images','zestimate_sale','zestimate_rent', 'price', 'price_closed']
factors = np.setdiff1d(sold.columns, ind2remove).tolist()

In [209]:
sales_train, sales_test = cv.train_test_split(sold, test_size = 0.25) # set aside X% of the dataset for testing

In [210]:
# cap number of homes that fit into VRAM
memory_cap = 250000
city = 'ALL'

# init empty model that we can load into on the second iteration
bst = xgb.Booster()

# first run the price model
label = 'good_sell'
f = factors
f.remove(label) # this happens in place

dtrain = xgb.DMatrix(sales_train[factors].values, label=sales_train[label], feature_names=factors)
dtest = xgb.DMatrix(sales_test[factors].values, label=sales_test[label], feature_names=factors)
watchlist  = [(dtrain,'train'),(dtest,'eval')]

xgb_model = xgb.train( param, dtrain, num_round, evals = watchlist, early_stopping_rounds = 10, verbose_eval = 1 )

if hasattr(xgb_model, 'best_score'):
    slack("Early stopping occured, best_score %f, best_iteration %i" % (xgb_model.best_score, xgb_model.best_iteration))

[0]	train-error:0.189369	eval-error:0.422886
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 10 rounds.
[1]	train-error:0.136213	eval-error:0.41791
[2]	train-error:0.098007	eval-error:0.437811
[3]	train-error:0.086379	eval-error:0.432836
[4]	train-error:0.068106	eval-error:0.39801
[5]	train-error:0.071429	eval-error:0.427861
[6]	train-error:0.05814	eval-error:0.402985
[7]	train-error:0.048173	eval-error:0.422886
[8]	train-error:0.048173	eval-error:0.437811
[9]	train-error:0.041528	eval-error:0.427861
[10]	train-error:0.036545	eval-error:0.40796
[11]	train-error:0.033223	eval-error:0.40796
[12]	train-error:0.028239	eval-error:0.38806
[13]	train-error:0.028239	eval-error:0.402985
[14]	train-error:0.023256	eval-error:0.40796
[15]	train-error:0.018272	eval-error:0.40796
[16]	train-error:0.011628	eval-error:0.393035
[17]	train-error:0.013289	eval-error:0.39801
[18]	train-error:0.009967	eval-error:0.393035


In [211]:
# load for sale properties

target = xgb.DMatrix( for_sale[f].values, feature_names=f)
ypred = xgb_model.predict(target, ntree_limit=(xgb_model.best_iteration if hasattr(xgb_model, 'best_score') else None))


In [221]:
values = np.column_stack((for_sale.property_id.values
                         ,for_sale.address.values
                         ,ypred
                         ,for_sale.price.values
                         ,for_sale['origin_url'].values))
output = pd.DataFrame(values[:,1:],index=values[:,0],columns=['address','ypred',
                                                              'list', 'url'])
output = output.sort_values(by='ypred',ascending=True)
# output = output[output.ypred > 0.50]
file = 'good_sale_target_list.csv'
output.to_csv(file)

In [222]:
output.head(100)

Unnamed: 0,address,ypred,list,url
85004,"19174 E Vallejo St, Queen Creek, AZ 85142",0.187553,699000,https://www.zillow.com/homedetails/19174-E-Val...
1849508,"15212 N Lorma Ln, Fountain Hills, AZ 85268",0.196815,649900,https://www.zillow.com/homedetails/15212-N-Lor...
372247,"4306 E Hashknife Rd, Phoenix, AZ 85050",0.2012,590000,https://www.zillow.com/homedetails/4306-E-Hash...
365785,"22821 N 39th Run, Phoenix, AZ 85050",0.2012,649900,https://www.zillow.com/homedetails/22821-N-39t...
541423,"10457 E. Greythorn Drive, Scottsdale, AZ 85262",0.202394,2699000,https://www.zillow.com/homedetails/10457-E-Gre...
1029959,"2418 E Virgo Pl, Chandler, AZ 85249",0.207311,749900,https://www.zillow.com/homedetails/2418-E-Virg...
1030768,"2535 E Cloud Dr, Chandler, AZ 85249",0.207311,735000,http://www.zillow.com/homedetails/2535-E-Cloud...
1005196,"3004 S Bell Pl, Chandler, AZ 85286",0.207311,450000,https://www.zillow.com/homedetails/3004-S-Bell...
1044903,"5961 S Gemstone Dr, Chandler, AZ 85249",0.207311,889900,https://www.zillow.com/homedetails/5961-S-Gems...
1006819,"3385 E Cardinal Way, Chandler, AZ 85286",0.207311,480000,https://www.zillow.com/homedetails/3385-E-Card...


In [218]:
len(output.index)

723

In [219]:
output.tail(5)

Unnamed: 0,address,ypred,list,url
1912,"1759 W Rovey Ave, Phoenix, AZ 85015",0.501161,310000,https://www.zillow.com/homedetails/1759-W-Rove...
7387014,"20724 W Saguaro Vista Dr, Wittmann, AZ 85361",0.500791,265000,https://www.zillow.com/homedetails/20724-W-Sag...
7457341,"723 E Boca Raton Rd # 25, Phoenix, AZ 85022",0.500737,290100,https://www.zillow.com/homedetails/723-E-Boca-...
7449288,"8336 N 105th Ln, Peoria, AZ 85345",0.500585,99000,https://www.zillow.com/homedetails/8336-N-105t...
289481,"3537 E Coolidge St, Phoenix, AZ 85018",0.500476,799000,https://www.zillow.com/homedetails/3537-E-Cool...
