In [12]:
%load_ext sql

# imports
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt

# this allows plots to appear directly in the notebook
%matplotlib inline

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

# city abbreviation code
city = 'PH'
# limit on number of lines returned from sql queries (for debugging)
limit = 10000000

# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: prod@rental_nerd'

In [5]:
# data columns used for the booster
factors = ['property_id', 'bedrooms', 'bathrooms', 'sqft','longitude', 'latitude','zipcode', 'elevation', 'garage'
                          ,'year_built', 'level','dist_to_park','dist_to_golf_course', 'has_pool','school_district_id'
                          , 'date_closed','multifamily']

# booster parameters
param = {'max_depth':6, 'num_parallel_tree': 10000, 'eval_metric':'mae' }
num_round = 10
plst = param.items()

# disable warning on assignment without copy
pd.options.mode.chained_assignment = None  # default='warn'

In [11]:
def sanitize(data):
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))
    data = data[  (data.sqft <= 10000) 
                & (data.price <= 400000) 
                & (data.price > 500)
                & (data.bedrooms <= 6) 
                & (data.bathrooms <= 6) 
                & (data.sqft != 0) ]
    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data["multifamily"] = data["home_type"].apply(lambda x: True if x == "mfh" else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: (x - date(2000, 1, 1)))
    data['date_closed'] = data['date_closed'].astype(int)
    
    return data

def query(transaction_type, transaction_status, city, limit, date="2000-01-01 10:01:13"):
    # sql query helper function
    query = %sql (\
    select  \
    *  \
    from  \
    properties, \
    property_transaction_logs, \
    area_name_zipcodes, \
    property_school_districts \
    where  \
    property_transaction_logs.created_at > :date and \
    area_name_zipcodes.`area_name` = :city and \
    area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
    properties.`id` = property_transaction_logs.`property_id` and \
    property_transaction_logs.`transaction_status` = :transaction_status and \
    property_transaction_logs.`transaction_type` = :transaction_type and \
    property_transaction_logs.`is_latest` = true and \
    property_school_districts.`property_id` = properties.`id` \
    order by \
    property_transaction_logs.id desc \
    limit :limit)

    return query.DataFrame().T.groupby(level=0).first().T

In [7]:
sales_train = query('sales', 'closed', city, limit)

495700 rows affected.


In [13]:
sales_train = sanitize(sales_train)

# remove for rent listings that are categorized as for sale
sales_train = sales_train[sales_train.price > 50000]

Entries before filter:  495700
Entries after filter:  445454


In [14]:
# train model based on historical sales information
import timeit
start_time = timeit.default_timer()

dtrain = xgb.DMatrix(sales_train[factors].values, label=sales_train.price, feature_names=factors)
bst = xgb.train( plst, dtrain, num_round )

elapsed = timeit.default_timer() - start_time
print("Time to train model: ", elapsed)

# save model
bst.save_model(city + '_sales.model')

Time to train model:  17566.896312999997


In [15]:
# predict the training set using the model - note this is in sample testing
dtrain_predictions = bst.predict(dtrain)
# dtrain_predprob =  predict_proba(sales_train[factors].values)[:,1]
        
#Print model report:
print ("Model Report")
print ("MAE Score (Train): %f" % metrics.mean_absolute_error(sales_train.price, dtrain_predictions))
print ("R^2: %f" % metrics.explained_variance_score(sales_train.price, dtrain_predictions))

Model Report
MAE Score (Train): 31703.234560
R^2: 0.720092


In [16]:
for_sale = query('sales', 'open', city, limit)

35183 rows affected.


In [17]:
# use today's date for 'close date' since the transaction is still open i.e. home is currently listed for sale
for_sale.date_closed = date.today()
for_sale = sanitize(for_sale)

# remove for rent listings that are categorized as for sale
for_sale = for_sale[for_sale.price > 50000]

target = xgb.DMatrix( for_sale[factors].values, feature_names=factors)
ypred = bst.predict(target)

AttributeError: 'str' object has no attribute 'today'

In [None]:
values = np.column_stack((for_sale.property_id.values
                         ,for_sale.address.values
                         ,ypred
                         ,for_sale.price.values
                         ,ypred-for_sale.price))
output = pd.DataFrame(values[:,1:],index=values[:,0],columns=['address','ypred','list','gain-loss'])
output = output.sort_values(by='gain-loss',ascending=False)

output.head(20)

In [None]:
rent_train = query(transaction_type='rental',transaction_status='closed', city=city,limit=limit)

In [None]:
# train rental model

rent_train = sanitize(rent_train)
dtrain = xgb.DMatrix(rent_train[factors].values, label=rent_train.price, feature_names=factors)
rent_bst = xgb.train( plst, dtrain, num_round )

# save rental model
rent_bst.save_model(city + '_rent.model')

In [None]:
# predict rent prices for home that are listed for sale
ypred = rent_bst.predict(target)
ypred = pd.Series(ypred,index=output.index)
ypred.name = "rent"

# calculate estimated cap rate
cr = ypred * 12 / output.list
cr.name = "cap rate"

In [None]:
# combine rent predictions to homes listed for sale
best_of = pd.concat([output,ypred, cr],axis=1)
best_of = best_of[ (best_of['gain-loss'] < 50000) & ((best_of['gain-loss'] / best_of.list).abs() < 0.5)]
best_of.head(30)

In [None]:
best_of.to_csv(city+'_target_list.csv')

In [None]:
xgb.plot_importance(bst)
xgb.plot_importance(rent_bst)