In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost

from sklearn.model_selection import GridSearchCV

In [26]:
df = pd.read_csv('./AustinHousingData_Preprocessed.csv')
feature_blacklist = [
    # Maybe usable with extra preprocessing
    'streetAddress', 
    # Not applicable for price regression?
    'description', 'homeImage', 
    # Not necessary after adjusted price?
    'latestPriceSource', 'latest_saledate', 'latest_salemonth', 'latest_saleyear',
    # Leak prevention
    'orig_price', 'propertyTaxRate', 'numPriceChanges', 'zip_rank', 'median_zip', 'pr_sqft',
    # Always austin
    'city',
]
X = df.drop(feature_blacklist, axis=1)
y = X.pop('price')

In [93]:
X.dtypes

zipcode                       int64
latitude                    float64
longitude                   float64
garageSpaces                  int64
hasAssociation                int64
hasCooling                    int64
hasGarage                     int64
hasHeating                    int64
hasSpa                        int64
hasView                       int64
homeType                     object
parkingSpaces                 int64
yearBuilt                     int64
numOfPhotos                   int64
accessibility                 int64
numOfAppliances               int64
numOfParkingFeatures          int64
patioporch                    int64
security                      int64
waterfront                    int64
windowfeatures                int64
community                     int64
lotSizeSqFt                 float64
livingAreaSqFt              float64
numOfPrimarySchools           int64
numOfElementarySchools        int64
numOfMiddleSchools            int64
numOfHighSchools            

In [85]:
# Worked well, but that was with the leaking parameters
LogExpTransformer = lambda: FunctionTransformer(func=np.log, inverse_func=np.exp)
OneHotEncoderInit = lambda: OneHotEncoder(handle_unknown='ignore')

def wrap_model(model, transformer=StandardScaler, encoder=OneHotEncoderInit):
    preprocessing = make_column_transformer(
        (
            StandardScaler(), 
            make_column_selector(dtype_include=np.number)
        ),
        (
            encoder(), 
            make_column_selector(dtype_include=object)
        ),
    )
    
    pipeline = make_pipeline(preprocessing, model)
    transformer_obj = transformer()
    pipeline = TransformedTargetRegressor(regressor=pipeline,
                                          transformer=transformer_obj)
    return pipeline

def test_model(model, X, y, preprocess=True, metric='neg_mean_absolute_error', splits=5, transformer=StandardScaler, encoder=OneHotEncoderInit):
    # If no preprocessing, just use model
    pipeline = model
    if preprocess:
        pipeline = wrap_model(model, transformer, encoder)
        
    # K-fold cross-validation (Maybe try stratified?)
    kf = KFold(n_splits=splits, shuffle=True, random_state=0)
    cv_results = cross_val_score(
        pipeline,
        X,
        y,
        cv=kf,
        scoring=metric,
        n_jobs=-1)
    
    return cv_results

def test_models(models, X, y, metric='neg_mean_absolute_error', splits=5, transformer=StandardScaler, encoder=OneHotEncoderInit):
    for model in models:
        result = -test_model(model, X, y, metric=metric, transformer=transformer).mean()
        print(f'{model}: {result}')

In [36]:
# Baselines
models = [
    # Dummy
    DummyRegressor(strategy='mean'),
    # Linear
    LinearRegression(),
    Ridge(),
    Lasso(),
    # Decision tree
    RandomForestRegressor(random_state=42),
    xgboost.XGBRegressor(random_state=42),
    # SVM
    SVR(),
]

test_models(models, X, y)

DummyRegressor(): 156061.51981158793
LinearRegression(): 104640.48366271799
Ridge(): 104635.89337734645
Lasso(): 156061.51981158793
RandomForestRegressor(random_state=42): 64440.46016183711
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...): 64890.92482869577
SVR(): 69432.01803074928


In [33]:
# Baselines
models = [
    # Dummy
    DummyRegressor(strategy='mean'),
    # Linear
    LinearRegression(),
    Ridge(),
    Lasso(),
    # Decision tree
    RandomForestRegressor(random_state=42),
    xgboost.XGBRegressor(random_state=42),
    # SVM
    SVR(),
]

test_models(models, X, y, transformer=LogExpTransformer)

DummyRegressor(): 149492.03770853623
LinearRegression(): 96950.77654329439
Ridge(): 96951.18522630286
Lasso(): 149492.03770853623
RandomForestRegressor(random_state=42): 66421.03615859227
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...): 66162.34344310875
SVR(): 68706.72079765055


In [37]:
# Baselines
models = [
    # Dummy
    DummyRegressor(strategy='mean'),
    # Linear
    LinearRegression(),
    Ridge(),
    Lasso(),
    # Decision tree
    RandomForestRegressor(random_state=42),
    xgboost.XGBRegressor(random_state=42),
    # SVM
    SVR(),
]

test_models(models, X, y, transformer=PowerTransformer)

DummyRegressor(): 149247.71078334423
LinearRegression(): 97526.15358223366
Ridge(): 97526.1492574012
Lasso(): 149247.71078334423
RandomForestRegressor(random_state=42): 67034.07003731855
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...): 66692.31882481751
SVR(): 68440.83721094967


## XGBoost

In [39]:
# XGBoost
models = [
    xgboost.XGBRegressor(n_estimators=100, max_depth=10),
    xgboost.XGBRegressor(n_estimators=200, max_depth=15),
    xgboost.XGBRegressor(n_estimators=100, max_depth=5),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.01),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.2),
    xgboost.XGBRegressor(learning_rate=0.2, eval_metric='mae'),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.2, eval_metric='mae'),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.1, eval_metric='mae'),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.08, eval_metric='mae'),
    xgboost.XGBRegressor(n_estimators=100, max_depth=10, learning_rate=0.08, eval_metric='mae', subsample=0.5),
    xgboost.XGBRegressor(n_estimators=200, max_depth=10, learning_rate=0.08, eval_metric='mae', subsample=0.5),
]

test_models(models, X, y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...): 67313.68503359535
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, e

In [None]:
parameters = {
    'regressor__xgbregressor__max_depth': range(2, 15, 2),
    'regressor__xgbregressor__n_estimators': range(50, 250, 50),
    'regressor__xgbregressor__learning_rate': np.arange(0.05, 0.2, 0.05),
    'regressor__xgbregressor__subsample': np.arange(0.5, 1, 0.1)
}
grid_search = GridSearchCV(
    estimator=wrap_model(xgboost.XGBRegressor(seed=0)),
    param_grid=parameters,
    scoring = 'neg_mean_absolute_error',
    cv = 4,
    n_jobs=-1,
    verbose=True
)
grid_search.fit(X,y)

In [42]:
print(grid_search.best_score_)
grid_search.best_params_
# TODO: Look into more parameters?

-72927.96953732331


{'regressor__xgbregressor__learning_rate': 0.1,
 'regressor__xgbregressor__max_depth': 6,
 'regressor__xgbregressor__n_estimators': 150,
 'regressor__xgbregressor__subsample': 0.7}

## Random Forest

In [None]:
params = {
    'regressor__randomforestregressor__bootstrap': [True, False],
    'regressor__randomforestregressor__max_depth': list(range(5,25,10))+[None],
    'regressor__randomforestregressor__max_features': [1, 'sqrt', 'log2'],
    'regressor__randomforestregressor__min_samples_leaf': [1, 2, 4],
    'regressor__randomforestregressor__min_samples_split': [2, 5, 10],
    'regressor__randomforestregressor__n_estimators': range(100, 700, 200)
}

grid_search = GridSearchCV(
    estimator=wrap_model(RandomForestRegressor(random_state=0)),
    param_grid=params,
    scoring = 'neg_mean_absolute_error',
    cv = 4,
    n_jobs=-1,
    verbose=True
)
grid_search.fit(X,y)


In [44]:
print(grid_search.best_score_)
grid_search.best_params_

# Mostly default values, except bootstrap=False and n_estimators=500

-75699.39245206534


{'regressor__randomforestregressor__bootstrap': False,
 'regressor__randomforestregressor__max_depth': None,
 'regressor__randomforestregressor__max_features': 'sqrt',
 'regressor__randomforestregressor__min_samples_leaf': 1,
 'regressor__randomforestregressor__min_samples_split': 2,
 'regressor__randomforestregressor__n_estimators': 500}

In [45]:
# Manual checks
models = [
    RandomForestRegressor(n_estimators=100),
    RandomForestRegressor(n_estimators=100, max_depth=10),
    RandomForestRegressor(n_estimators=500),
    RandomForestRegressor(n_estimators=100, bootstrap=False),
    RandomForestRegressor(n_estimators=500, bootstrap=False),
]

test_models(models, X, y)

# Bootstrap=False gives way worse results, n_estimators=500 doesn't change much

RandomForestRegressor(): 64542.45158221328
RandomForestRegressor(max_depth=10): 68106.92435363178
RandomForestRegressor(n_estimators=500): 64207.50117775381
RandomForestRegressor(bootstrap=False): 89372.07183112312
RandomForestRegressor(bootstrap=False, n_estimators=500): 89310.65708912213


In [46]:
models = [
    RandomForestRegressor(n_estimators=400),
    RandomForestRegressor(n_estimators=500),
    RandomForestRegressor(n_estimators=600),
    RandomForestRegressor(n_estimators=1000),
]

test_models(models, X, y)

RandomForestRegressor(n_estimators=400): 64387.82788319059
RandomForestRegressor(n_estimators=500): 64276.0654767631
RandomForestRegressor(n_estimators=600): 64222.04162405124
RandomForestRegressor(n_estimators=1000): 64254.52432699746


## SVR

In [47]:
# SVR different kernels
models = [
    SVR(kernel='rbf'),
    SVR(kernel='linear'),
    SVR(kernel='poly'),
    SVR(kernel='sigmoid'),
]

test_models(models, X, y)

# Pretty bad results, maybe with better tuning

SVR(): 69432.01803074928
SVR(kernel='linear'): 99335.80035322995
SVR(kernel='poly'): 76794.9312477439
SVR(kernel='sigmoid'): 3210465.8172194357


In [48]:
# SVR different epsilon values
models = [
    SVR(epsilon=0.001),
    SVR(epsilon=0.01),
    SVR(epsilon=0.1),
]

test_models(models, X, y)

SVR(epsilon=0.001): 69381.1654947454
SVR(epsilon=0.01): 69336.23443968405
SVR(): 69432.01803074928


In [None]:
params = {
    'regressor__svr__epsilon': [0.001, 0.01],
    'regressor__svr__C': [0.25, 0.5, 1],
    'regressor__svr__kernel': ['rbf', 'linear', 'poly'],
}

grid_search = GridSearchCV(
    estimator=wrap_model(SVR()),
    param_grid=params,
    scoring = 'neg_mean_absolute_error',
    cv = 3,
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X,y)

In [51]:
print(grid_search.best_score_)
grid_search.best_params_
# Grid search hasn't found anything better

-82365.74458682409


{'regressor__svr__C': 1,
 'regressor__svr__epsilon': 0.01,
 'regressor__svr__kernel': 'rbf'}

## Final check & save

In [95]:
best = xgboost.XGBRegressor(n_estimators=150, 
                            max_depth=6, 
                            learning_rate=0.08, 
                            eval_metric='mae', 
                            subsample=0.7)
print(test_model(best, X, y, metric='neg_root_mean_squared_error', encoder=OrdinalEncoder).mean())
print(test_model(best, X, y, metric='neg_mean_absolute_error', encoder=OrdinalEncoder).mean())

-101930.56626671125
-62376.78766577033


In [99]:
import joblib
best_wrapped = wrap_model(best)
best_wrapped.fit(X, y)
joblib.dump(best_wrapped, "model.pkl")

['model.pkl']