In [124]:
%load_ext sql

# imports
import pandas as pd
import numpy as np
import datetime as dt

# this allows plots to appear directly in the notebook
%matplotlib inline

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from xgboost import XGBClassifier

# city abbreviation code
city = 'PH'
# limit on number of lines returned from sql queries (for debugging)
limit = 1000000
# where to find the xgb models - they get huge so keep them out of any git path
path = '/home/ivoytov/rentalnerd-models/'



# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: prod@rental_nerd'

In [125]:
# data columns used for the booster
factors = ['property_id', 'bedrooms', 'bathrooms', 'sqft','longitude', 'latitude','zipcode', 'elevation', 'garage'
                          ,'year_built', 'level','dist_to_park','dist_to_golf_course', 'has_pool'
                          , 'date_closed','multifamily']

# booster parameters
param = {'max_depth':6, 'num_parallel_tree': 1000, 'eval_metric':'mae' }
num_round = 10
plst = param.items()

# disable warning on assignment without copy
pd.options.mode.chained_assignment = None  # default='warn'

In [133]:
def sanitize(data):
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))
    data = data[  (data.sqft <= 10000) 
                & (data.price <= 400000) 
                & (data.price > 500)
                & (data.bedrooms <= 6) 
                & (data.bathrooms <= 6) 
                & (data.sqft != 0) ]
    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data["multifamily"] = data["home_type"].apply(lambda x: True if x == "mfh" else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: (x - dt.date(2000, 1, 1)))
    data['date_closed'] = data['date_closed'].astype(int)
    
    return data

def query(transaction_type, transaction_status, city, limit, date="2000-01-01 10:01:13"):
    # sql query helper function
    query = %sql (\
    select  \
    *  \
    from  \
    properties, \
    property_transaction_logs, \
    area_name_zipcodes \
    where  \
    property_transaction_logs.created_at > :date and \
    area_name_zipcodes.`area_name` = :city and \
    area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
    properties.`id` = property_transaction_logs.`property_id` and \
    property_transaction_logs.`transaction_status` = :transaction_status and \
    property_transaction_logs.`transaction_type` = :transaction_type and \
    property_transaction_logs.`is_latest` = true \
    order by \
    property_transaction_logs.id desc \
    limit :limit)

    return query.DataFrame().T.groupby(level=0).first().T

In [128]:
for_sale = query('sales', 'open', city, limit, "2017-01-23 10:01:13")

4841 rows affected.


In [129]:
# query the top 100 zipcodes in the database (roughly equal to all zipcodes >10k properties)
query = %sql (\
SELECT zipcode, COUNT(id) \
FROM properties \
GROUP BY zipcode \
ORDER BY 2 DESC \
limit 100)

zipcode_filter = query.DataFrame()
print("Top zipcode by count is",zipcode_filter.iloc[0,0],"with",zipcode_filter.iloc[0,1],"properties")
print("100th zipcode by count is",zipcode_filter.iloc[99,0],"with",zipcode_filter.iloc[99,1],"properties")

100 rows affected.
Top zipcode by count is 94565 with 18729 properties
100th zipcode by count is 89147 with 9682 properties


In [130]:
# load sales model
bst = xgb.Booster() #init model
bst.load_model(path + city.lower() + '_sales_20170207.model')

In [131]:
# use today's date for 'close date' since the transaction is still open i.e. home is currently listed for sale
for_sale.date_closed = dt.date.today()
for_sale = sanitize(for_sale)

# remove for rent listings that are categorized as for sale
# remove listings outside of the top 100 zipcodes 
for_sale = for_sale[(for_sale.price > 50000) & (for_sale.zipcode.isin(zipcode_filter.zipcode))]

target = xgb.DMatrix( for_sale[factors].values, feature_names=factors)
ypred = bst.predict(target)

Entries before filter:  4841
Entries after filter:  3684


In [132]:
#Print model report:
print ("Model Report")
print ("MAE Score (Test): %f" % metrics.mean_absolute_error(for_sale.price, ypred))
print ("R^2: %f" % metrics.explained_variance_score(for_sale.price, ypred))

Model Report
MAE Score (Test): 40673.099076
R^2: 0.742003


In [139]:
def slack(test):
    return test

slack("R^2: %f, MAE: %" % 234.43)

TypeError: slack() takes 1 positional argument but 3 were given

In [94]:
values = np.column_stack((for_sale.property_id.values
                         ,for_sale.address.values
                         ,for_sale.zipcode.values
                         ,ypred
                         ,for_sale.price.values
                         ,ypred-for_sale.price))
output = pd.DataFrame(values[:,1:],index=values[:,0],columns=['address','zipcode','ypred','list','gain-loss'])
output = output.sort_values(by='gain-loss',ascending=False)

output.head(20)

Unnamed: 0,address,zipcode,ypred,list,gain-loss
7398565,"5718 W Orchid Lane For Rent Only # 000, Glenda...",85302,126062.0,80000,46062.5
938918,"408 E Coronado St, Buckeye, AZ 85326",85326,87618.7,59750,27868.7
938918,"408 E Coronado St, Buckeye, AZ 85326",85326,87618.7,59800,27818.7
938918,"408 E Coronado St, Buckeye, AZ 85326",85326,87618.7,59850,27768.7
7396765,"4242 # 18, E Patrick Court Gilbert, AZ 85295",85295,175211.0,150000,25211.5
1884021,"612 E Centre Ave, Buckeye, AZ 85326",85326,84461.1,64000,20461.1
7400489,"17 E Ruth Ave, Phoenix, AZ 85020",85020,84031.8,64500,19531.8
939863,"507 E Clanton Ave, Buckeye, AZ 85326",85326,78914.2,62900,16014.2
7403372,"1713 W Nopal Dr For Rent, Chandler, AZ 85224",85224,114098.0,100000,14098.1
7402152,"25227 W La Mont Ave, Buckeye, AZ 85326",85326,205602.0,194900,10701.8


In [95]:
rent_bst = xgb.Booster() #init rent model
rent_bst.load_model(path + city.lower() + '_rent_20170123.model')


In [96]:
# predict rent prices for the houses listed for sale today, add Rent and Cap Rate columns to the dataset
target = xgb.DMatrix( for_sale[factors].values, feature_names=factors)
ypred = rent_bst.predict(target)

ypred = pd.Series(ypred,index=output.index)
ypred.name = "rent"

# calculate estimated cap rate
cr = ypred * 12 / output.list
cr.name = "cap rate"

In [97]:
# combine rent predictions to homes listed for sale
best_of = pd.concat([output,ypred, cr],axis=1)
best_of = best_of[ (best_of['gain-loss'] < 40000) & ((best_of['gain-loss'] / best_of.list).abs() < 0.25)]
best_of.head(30)

Unnamed: 0,address,zipcode,ypred,list,gain-loss,rent,cap rate
7396765,"4242 # 18, E Patrick Court Gilbert, AZ 85295",85295,175211.0,150000,25211.5,23341.054688,1.86728
7403372,"1713 W Nopal Dr For Rent, Chandler, AZ 85224",85224,114098.0,100000,14098.1,1398.417358,0.16781
7402152,"25227 W La Mont Ave, Buckeye, AZ 85326",85326,205602.0,194900,10701.8,13548.523438,0.834183
7397709,"708 W Cheyenne Dr For Rent, Chandler, AZ 85225",85225,106504.0,100000,6504.11,1200.270386,0.144032
7399742,"1111 S Sossaman Rd, Mesa, AZ 85209",85209,79447.8,74000,5447.79,1848.190186,0.299707
7398604,"9419 N 59th Ave, Glendale, AZ 85302",85302,90704.4,94500,-3795.58,2252.119629,0.285983
7392358,"3900 E Barbarita Ave # 0, Gilbert, AZ 85234",85234,114098.0,120000,-5901.88,2252.119629,0.225212
7397739,"963 E Monterey St For Rent, Chandler, AZ 85225",85225,93097.5,100000,-6902.45,1172.136353,0.140656
1082529,"286 W Palomino Dr, Chandler, AZ 85225",85225,77618.1,93000,-15381.9,13728.605469,1.77143
1082529,"286 W Palomino Dr, Chandler, AZ 85225",85225,77618.1,95000,-17381.9,1848.190186,0.233456


In [98]:
best_of.to_csv(city+'_target_list.csv')

In [143]:
(dt.datetime.today() - dt.timedelta(6*365/12)).isoformat()


'2016-08-14T09:17:03.312575'