In [336]:
%load_ext sql

# imports
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
from datetime import date
import matplotlib.pyplot as plt

# this allows plots to appear directly in the notebook
%matplotlib inline

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

# city abbreviation code
city = 'dtw'

%sql mysql://prod:nerd@52.2.153.189/rental_nerd

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: prod@rental_nerd'

In [352]:
# data columns used for the booster
factors = ['property_id', 'bedrooms', 'bathrooms', 'sqft','longitude', 'latitude','zipcode', 'elevation', 'garage'
                          ,'year_built', 'level','dist_to_park','dist_to_golf_course', 'has_pool','school_district_id'
                          , 'date_closed', 'garage','multifamily']

# booster parameters
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'bst:n_estimators':1000, 'eval_metric':'mae' }
num_round = 10
plst = param.items()

def sanitize(data):
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))
    data = data[  (data.sqft <= 10000) 
                & (data.price <= 400000) 
                & (data.price > 500)
                & (data.bedrooms <= 6) 
                & (data.bathrooms <= 6) 
                & (data.sqft != 0) ]
    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data["multifamily"] = data["home_type"].apply(lambda x: True if x == "mfh" else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: (x - date(2000, 1, 1)))
    data['date_closed'] = data['date_closed'].astype(int)
    
    return data

In [357]:
query = %sql (\
select  \
*  \
from  \
properties, \
property_transaction_logs, \
area_name_zipcodes, \
property_school_districts \
where  \
area_name_zipcodes.`area_name` = 'PH' and \
area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
properties.`id` = property_transaction_logs.`property_id` and \
property_transaction_logs.`transaction_status` = 'closed' and \
property_transaction_logs.`transaction_type` = 'sales' and \
property_transaction_logs.`is_latest` = true and \
property_school_districts.`property_id` = properties.`id` \
order by \
property_transaction_logs.id desc \
limit 100000)

sales_train = query.DataFrame()

100 rows affected.


In [358]:
sales_train = sanitize(sales_train)
dtrain = xgb.DMatrix(sales_train[factors].values, label=sales_train.price)
bst = xgb.train( plst, dtrain, num_round )

Entries before filter:  100
Entries after filter:  90


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [359]:
query = %sql (\
select  \
*  \
from  \
properties, \
property_transaction_logs, \
area_name_zipcodes, \
property_school_districts \
where  \
area_name_zipcodes.`area_name` = 'PH' and \
area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
properties.`id` = property_transaction_logs.`property_id` and \
property_transaction_logs.`transaction_status` = 'open' and \
property_transaction_logs.`transaction_type` = 'sales' and \
property_transaction_logs.`is_latest` = true and \
property_school_districts.`property_id` = properties.`id` \
order by \
property_transaction_logs.id desc \
limit 100000)

for_sale = query.DataFrame()

100 rows affected.


In [361]:
# use today's date for 'close date'
for_sale.date_closed = date.today()

for_sale = sanitize(for_sale)
target = xgb.DMatrix( for_sale[factors].values)
ypred = bst.predict(target)

Entries before filter:  100
Entries after filter:  99


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [362]:
values = np.column_stack((for_sale.property_id.values[:,-1]
                         ,for_sale.address.values
                         ,ypred
                         ,for_sale.price.values
                         ,ypred-for_sale.price))
output = pd.DataFrame(values[:,1:],index=values[:,0],columns=['address','ypred','list','gain-loss'])
output = output.sort_values(by='gain-loss',ascending=False)
output.head(20)

price: 28486.4140625 prediction: 8123 W Indianola Ave, Phoenix, AZ 85033  gain: 114900


Unnamed: 0,address,ypred,list,gain-loss
432288,"6963 W Pierson St, Phoenix, AZ 85033",154198.0,139000,15198.4
436221,"8032 W Clayton Dr, Phoenix, AZ 85033",176802.0,170000,6802.27
895,"7955 W College Dr, Phoenix, AZ 85033",152417.0,150000,2417.03
451398,"8801 W Fairmount Ave, Phoenix, AZ 85037",138042.0,137000,1041.58
432420,"7014 W Verde Ln, Phoenix, AZ 85033",168314.0,169000,-685.688
429750,"6427 W Catalina Dr, Phoenix, AZ 85033",173757.0,175000,-1242.88
500404,"7207 W Roanoke Ave, Phoenix, AZ 85035",153314.0,155000,-1685.95
494246,"6108 W Virginia Ave, Phoenix, AZ 85035",145818.0,147900,-2082.28
500404,"7207 W Roanoke Ave, Phoenix, AZ 85035",153314.0,159000,-5685.95
429750,"6427 W Catalina Dr, Phoenix, AZ 85033",173757.0,180000,-6242.88


In [363]:
query = %sql (\
select  \
*  \
from  \
properties, \
property_transaction_logs, \
area_name_zipcodes, \
property_school_districts \
where  \
area_name_zipcodes.`area_name` = 'PH' and \
area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
properties.`id` = property_transaction_logs.`property_id` and \
property_transaction_logs.`transaction_status` = 'closed' and \
property_transaction_logs.`transaction_type` = 'rental' and \
property_transaction_logs.`is_latest` = true and \
property_school_districts.`property_id` = properties.`id` \
order by \
property_transaction_logs.id desc \
limit 100000)

rent_train = query.DataFrame()

100 rows affected.


In [364]:
rent_train = sanitize(rent_train)
dtrain = xgb.DMatrix(rent_train[factors].as_matrix(), label=rent_train.price)
rent_bst = xgb.train( plst, dtrain, num_round )

Entries before filter:  100
Entries after filter:  79


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [380]:
# predict rent prices for home that are listed for sale
ypred = rent_bst.predict(target)
ypred = pd.Series(ypred,index=output.index)
ypred.name = "rent"

# calculate estimated cap rate
cr = ypred * 12 / output.list
cr.name = "cap rate"

In [381]:
# combine rent predictions to homes listed for sale
pd.concat([output,ypred, cr],axis=1)

Unnamed: 0,address,ypred,list,gain-loss,rent,cap rate
432288,"6963 W Pierson St, Phoenix, AZ 85033",154198,139000,15198.4,1197.153076,0.103351
436221,"8032 W Clayton Dr, Phoenix, AZ 85033",176802,170000,6802.27,1171.198853,0.0826729
895,"7955 W College Dr, Phoenix, AZ 85033",152417,150000,2417.03,996.833008,0.0797466
451398,"8801 W Fairmount Ave, Phoenix, AZ 85037",138042,137000,1041.58,1140.864380,0.0999297
432420,"7014 W Verde Ln, Phoenix, AZ 85033",168314,169000,-685.688,1487.903687,0.10565
429750,"6427 W Catalina Dr, Phoenix, AZ 85033",173757,175000,-1242.88,1048.275024,0.0718817
500404,"7207 W Roanoke Ave, Phoenix, AZ 85035",153314,155000,-1685.95,963.604736,0.0746017
494246,"6108 W Virginia Ave, Phoenix, AZ 85035",145818,147900,-2082.28,1184.381104,0.0960958
500404,"7207 W Roanoke Ave, Phoenix, AZ 85035",153314,159000,-5685.95,1184.381104,0.0893873
429750,"6427 W Catalina Dr, Phoenix, AZ 85033",173757,180000,-6242.88,1096.219238,0.0730813
