In [109]:
%load_ext sql

# imports
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
import matplotlib.pyplot as plt

# this allows plots to appear directly in the notebook
%matplotlib inline

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit

# city abbreviation code
city = 'dtw'

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [110]:
%sql mysql://root:@localhost/rental_nerd

'Connected: root@rental_nerd'

In [None]:
query = %sql (\
SELECT  \
properties.id as 'property_id', \
properties.address,  \
properties.bedrooms,  \
properties.bathrooms,  \
properties.sqft,  \
properties.source,  \
properties.origin_url,  \
properties.longitude,  \
properties.latitude,  \
properties.elevation,  \
(2016 - properties.year_built) as 'age',  \
properties.garage,  \
properties.level,  \
properties.luxurious,  \
properties.dist_to_park,  \
properties.zipcode, \
properties.dist_to_golf_course, \
properties.near_golf_course, \
properties.has_pool, \
properties.home_type, \
property_transaction_logs.id 'ptl_id',  \
property_transaction_logs.transaction_type,  \
property_transaction_logs.price,  \
property_transaction_logs.transaction_status,  \
property_transaction_logs.days_on_market,  \
property_transaction_logs.date_closed as 'date',  \
property_transaction_logs.date_listed,  \
neighborhoods.name as 'neighborhood',  \
neighborhoods.id as 'nid',  \
neighborhoods.shapefile_source,  \
property_school_districts.school_district_id \
FROM  \
properties,  \
property_transaction_logs,  \
property_neighborhoods,  \
neighborhoods,  \
property_school_districts \
WHERE  \
property_school_districts.property_id = properties.id AND  \
property_transaction_logs.property_id = properties.id AND  \
property_transaction_logs.transaction_type = "rental" AND  \
property_transaction_logs.date_closed is not null AND \
neighborhoods.shapefile_source = "PH" AND  \
properties.id = property_neighborhoods.property_id AND  \
property_neighborhoods.neighborhood_id = neighborhoods.id AND \
properties.sqft > 0 AND \
properties.bedrooms IS NOT NULL AND \
properties.bathrooms IS NOT NULL AND \
properties.elevation IS NOT NULL AND \
properties.level IS NOT NULL AND \
properties.dist_to_park IS NOT NULL AND \
properties.near_golf_course IS NOT NULL AND \
properties.home_type IS NOT NULL AND \
properties.zipcode IS NOT NULL AND \
properties.sqft IS NOT NULL AND \
properties.year_built IS NOT NULL AND \
property_transaction_logs.price > 0 )
           
    
# properties.has_pool IS NOT NULL AND \
# properties.garage IS NOT NULL AND \
data = query.DataFrame()

In [None]:
data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
data["multifamily"] = data["home_type"].apply(lambda x: True if x == "mfh" else False)

In [None]:
# filter out any outliers, defined as rent >$10k or >2,500 sq ft, or not in SF

print("Entries before filter: ", len(data))
data = data[  (data.sqft <= 10000) 
            & (data.price <= 4000) 
            & (data.price > 500)
            & (data.bedrooms <= 6) 
            & (data.bathrooms <= 6) 
            & (data.sqft != 0) ]
print("Entries after filter: ",len(data))



In [None]:
data['date_int'] = pd.to_datetime(data['date'])    
data['date_delta'] = (data['date_int'] - data['date_int'].min())  / np.timedelta64(1,'D')
data.sort_values('date_delta',ascending= False, inplace= True)

In [None]:
#Helper Functions
def reg_metrics(reg,X_test,y):
    y_pred = reg.predict(X_test)
    print( "r-squared: ", metrics.r2_score(y,y_pred))
    print( "mse: ", metrics.mean_squared_error(y,y_pred))
    print( "mean error: ", (y-y_pred).abs().median())
# Scale features
def scale_X(X_train,X_test):
    scaler = StandardScaler()
    scaler.fit(X_train) 
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# GridSearch Parameters
def grid_optimize(reg,param_grid,X,y):
    grid = GridSearchCV(reg,param_grid,cv=5)
    grid.fit(X,y)
    print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
    return grid

# This is the better one, the XGBoost
def scikit_xgb_mhs(X_train,X_test,y_train,y_test):
    sk_xgb = xgb.XGBRegressor()
    xgb_param_grid = {'max_depth':[6],'n_estimators':[1000]} # 10,000 is best and 6 deep
    xgb_grid = grid_optimize(sk_xgb,xgb_param_grid,X_train,y_train)
    sk_xgb = xgb.XGBRegressor(max_depth=xgb_grid.best_params_['max_depth'],n_estimators=xgb_grid.best_params_['n_estimators'])
    sk_xgb.fit(X_train,y_train)
    print("XGBoostRegressor (Sci-kit Learn API)")
    print("Training metrics")
    reg_metrics(sk_xgb,X_train,y_train)
    print("Test metrics")
    reg_metrics(sk_xgb,X_test,y_test) 
    return sk_xgb

In [None]:
factors = ['property_id', 'bedrooms', 'bathrooms', 'sqft','longitude', 'latitude','zipcode', 'elevation'
                          ,'age', 'level','dist_to_park','dist_to_golf_course', 'has_pool','school_district_id'
                          ,'date_delta', 'multifamily']

X = np.array(data[factors].as_matrix())
y = np.array(data.price.as_matrix())
tscv = TimeSeriesSplit(n_splits=2)
print(tscv)  
for train, test in tscv.split(X):
    print("%s %s" % (train, test))
    X_train = X[train, :]
    y_train = [y[i] for i in train]
    X_test = X[test, :]
    y_test = [y[i] for i in test]
    print("1st training listing", X_train[0, [0,14]], 'price', y_train[0] )
    print("Last training listing", X_train[len(X_train)-1, [0,14]], 'price', y_train[test[0]-1])
    print("1st testing listing", X_test[0, [0,14]], 'price', y_test[0] )
    print("Last testing listing", X_test[len(X_test)-1, [0,14]], 'price', y_test[len(y_test)-1])
    xgb_model = scikit_xgb_mhs(X_train, X_test, y_train, y_test)

In [None]:
# feature importance
print(xgb_model.feature_importances_)
# plot





In [None]:
print [factors, xgb_model.feature_importances_]
plt.bar(range(len(xgb_model.feature_importances_)), xgb_model.feature_importances_)
plt.show()

# 1. 'property_id'  
# 2.'bedrooms', 
# 3.'bathrooms',
# 4. 'sqft'
# 5. 'longitude', 
# 6. 'latitude',
# 7. 'zipcode'
# 8. 'elevation',
# 9. 'age',
# 10. 'level',
# 11. 'dist_to_park',
# 12. 'dist_to_golf_course', 
# 13. 'has_pool',
# 14. 'school_district_id',
# 15. 'date_delta'
# 16. 'multifamily'

The best parameters are {'n_estimators': 10000, 'max_depth': 6} with a score of 0.82
XGBoostRegressor (Sci-kit Learn API)
Training metrics
r-squared:  0.997201102182
mse:  962.054036088
Test metrics
r-squared:  0.836490911662
mse:  56514.4679358