In [36]:
from collections import namedtuple
import pickle
import pandas as pd
import numpy as np
from zillow_adapter import ask_zillow
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from category_encoders.basen import BaseNEncoder

# Read the pickled pair that unpacks into the raw feature frame `X_` and the target `y`.
# Caution: pickle.load can execute arbitrary code, so only open files you trust.
with open("final_df.pickle", mode="rb") as fp:
    X_, y = pickle.load(fp)
    
# Columns that prep() replaces with bucket labels ("0-4 years", ..., "15+").
cate = ['roof_year', 'furnace_year']


def prep(dat: pd.DataFrame) -> pd.DataFrame:
    """Reshape the raw feature frame into the fixed column layout used downstream.

    Steps, in order:
      1. drop 'zip5_encoded' and 'city_encoded';
      2. rename 'year_built_bin', 'baths_lavs' and 'beds_total' to 'year_built',
         'bathrooms' and 'bedrooms' (names chosen to match Zillow);
      3. replace every column listed in ``cate`` with a categorical bucket label;
         missing values and values outside the 0..10000 range become "15+";
      4. reindex the columns into a fixed order.

    Parameters
    ----------
    dat : pd.DataFrame
        Frame containing the two dropped columns, the three renamed columns and
        every column named in ``cate``. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with exactly the eleven columns listed in the final reindex.

    Raises
    ------
    KeyError
        If a column to drop or a column named in ``cate`` is absent from ``dat``.
    """
    # Bucket edges; presumably ages in years, judging by the labels -- TODO confirm.
    age_bins = [0, 4, 9, 14, 10000]
    ## BIN LABELS NEED TO BE THIS
    age_labels = ["0-4 years", "5-9 years", "10-14 years", "15+"]

    # include_lowest=True closes the first interval on the left, so a value of
    # exactly 0 lands in "0-4 years". With the pd.cut defaults the first bin is
    # (0, 4], which turned 0 into NaN and then, via fillna, into "15+".
    binned = {
        feat: pd.cut(dat[feat],
                     bins=age_bins,
                     labels=age_labels,
                     include_lowest=True).fillna("15+")
        for feat in cate
    }

    ## NEEDS TO BE IN THIS ORDER
    column_order = ['bedrooms', 'bathrooms', 'year_built',
                    'quartz_countertops', 'granite_countertops',
                    'formica_countertops', 'tile_countertops',
                    'laminate_floors', 'hardwood_floors',
                    'roof_year', 'furnace_year']

    return (dat.drop(['zip5_encoded', 'city_encoded'], axis=1)
            # match column names to zillow
            .rename(columns={'year_built_bin': 'year_built',
                             'baths_lavs': 'bathrooms',
                             'beds_total': 'bedrooms'})
            .assign(**binned)
            .reindex(column_order, axis=1))

# Model-ready feature frame built from the raw pickle contents.
X = prep(X_)
print(X.shape)

# Disabled example of a live Zillow lookup:
# address, zipcode = "4565 White Rd, Pierson, MI", 49339
# house = ask_zillow(address, zipcode)
# inps = {feat: house.results.__dict__[feat] for feat in ('bedrooms', 'bathrooms', 'year_built')}

# Survey answer -> values for [quartz, granite, formica, tile] countertop columns.
# 'skip' falls back to the column means of X.
countertops_map = {
    "Marble/Quartz": [1, 0, 0, 0],
    "Granite/Concrete": [0, 1, 0, 0],
    # NOTE(review): this key contains literal double-quote characters, unlike
    # every other key -- confirm that is what the survey actually sends.
    '"Formica/Tile"': [0, 0, 1, 1],
    'Laminate': [0, 0, 0, 0],
    'skip': [X[col].mean()
             for col in ('quartz_countertops',
                         'granite_countertops',
                         'formica_countertops',
                         'tile_countertops')],
}

# Survey answer -> values for [laminate_floors, hardwood_floors].
# 'skip' falls back to the column means of X.
flooring_map = {
    "Hardwood": [0, 1],
    "Engineered/Laminate": [1, 0],
    "Ceramic Tile": [0, 0],
    "Porcelain Tile/Concrete": [0, 0],
    "skip": [X[col].mean() for col in ('laminate_floors', 'hardwood_floors')],
}

# Roof / furnace answers map to themselves.
# NOTE(review): 'skip' maps to the mean of the target `y`, not to one of the
# bucket labels -- confirm this is intended.
roof_furnace_map = {
    'skip': y.mean(),
    "0-4 years": "0-4 years",
    "5-9 years": "5-9 years",
    "10-14 years": "10-14 years",
    "15+": "15+",
}

X.head()


(24773, 11)


Unnamed: 0,bedrooms,bathrooms,year_built,quartz_countertops,granite_countertops,formica_countertops,tile_countertops,laminate_floors,hardwood_floors,roof_year,furnace_year
0,4,2.1,1999,0,1,0,0,0,0,15+,15+
3,5,3.0,1996,0,0,0,0,0,1,15+,15+
6,3,2.1,1997,0,1,0,0,0,0,15+,15+
10,4,3.0,1992,0,1,0,0,0,1,15+,15+
11,4,3.1,1980,0,0,0,0,0,1,15+,15+


In [None]:
# Fail fast if prep() left any missing values in the feature frame.
assert X.isna().sum().sum() == 0

# Fixed seed so the randomized search (and the boosting itself) is reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

gbr = Pipeline(steps=[('regressor', GradientBoostingRegressor(random_state=RANDOM_STATE))])

# Search space sampled by RandomizedSearchCV; learning rates are e**-5 ... e**-2.
param_grid = {
    'regressor__learning_rate': [np.exp(k) for k in range(-5, -1)],
    'regressor__max_depth': range(2, 9),
    'regressor__min_samples_split': [2, 3, 4],
    'regressor__n_estimators': range(135, 800),
}

#param_grid = {'regressor__n_estimators': range(60, 500), 'regressor__min_samples_leaf': range(1, 100)}

# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when upgrading, otherwise the constructor raises TypeError.
gbr_cv = RandomizedSearchCV(gbr,
                            param_distributions=param_grid,
                            n_iter=10,
                            cv=3,
                            iid=False,
                            verbose=10,
                            random_state=RANDOM_STATE)

# The bucketed roof/furnace columns are dropped, so the model is fitted on the
# remaining columns of X only.
gbr_cv.fit(X.drop(cate, axis=1), y)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127, score=0.17971534520470722, total=   4.1s
[CV] regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s remaining:    0.0s


[CV]  regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127, score=0.19791636117509037, total=   4.1s
[CV] regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.5s remaining:    0.0s


[CV]  regressor__n_estimators=548, regressor__min_samples_split=3, regressor__max_depth=4, regressor__learning_rate=0.1353352832366127, score=0.17935180254307925, total=   4.0s
[CV] regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.7s remaining:    0.0s


[CV]  regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944, score=0.1972395226519953, total=   2.0s
[CV] regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.8s remaining:    0.0s


[CV]  regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944, score=0.22472727482194021, total=   2.0s
[CV] regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.9s remaining:    0.0s


[CV]  regressor__n_estimators=590, regressor__min_samples_split=4, regressor__max_depth=2, regressor__learning_rate=0.049787068367863944, score=0.2209681093962106, total=   2.1s
[CV] regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   19.0s remaining:    0.0s


[CV]  regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944, score=0.17718680364597308, total=   3.9s
[CV] regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   23.1s remaining:    0.0s


[CV]  regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944, score=0.19155432579819132, total=   4.3s
[CV] regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   27.5s remaining:    0.0s


[CV]  regressor__n_estimators=141, regressor__min_samples_split=3, regressor__max_depth=8, regressor__learning_rate=0.049787068367863944, score=0.1742619371063755, total=   4.5s
[CV] regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   32.2s remaining:    0.0s


[CV]  regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127, score=0.12914601655596547, total=   4.8s
[CV] regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127 
[CV]  regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127, score=0.14959568677718693, total=   5.1s
[CV] regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127 
[CV]  regressor__n_estimators=244, regressor__min_samples_split=3, regressor__max_depth=7, regressor__learning_rate=0.1353352832366127, score=0.1246904272310151, total=   4.6s
[CV] regressor__n_estimators=646, regressor__min_samples_split=4, regressor__max_depth=8, regressor__learning_rate=0.1353352832366127 
[CV]  regressor__n_estimators=646, regressor__min_samples_split=4

In [4]:
from valuation import valuation
from constants import SurveyPredictants

# Example property handed to the Zillow adapter.
address = "4565 White Rd, Pierson, MI"
zipcode = 49339
house = ask_zillow(address, zipcode)

#good_address = "3400 Pacific Ave., Marina Del Rey, CA, 90292", #"12345 Butternut Avenue, Sand Lake, MI 49343",

# Survey answers, passed positionally to SurveyPredictants below.
countertops, flooring = "Granite/Concrete", "Hardwood"
roof_age, furnace_age = "0-4 years", "10-14 years"

x = valuation(
    house.results,
    SurveyPredictants(countertops, flooring, roof_age, furnace_age),
)

In [28]:
from category_encoders.helmert import HelmertEncoder



In [7]:
# Count how many rows fall into each roof-age bucket.
X['roof_year'].value_counts()

15+            20561
10-14 years     2294
5-9 years       1018
0-4 years        900
Name: roof_year, dtype: int64

In [11]:
# X[['roof_year', 'furnace_year']].values

array([['15+', '15+'],
       ['15+', '15+'],
       ['15+', '15+'],
       ...,
       ['0-4 years', '0-4 years'],
       ['15+', '15+'],
       ['15+', '15+']], dtype=object)

In [32]:
# Scratch check: the value of e**-1.
np.exp(-1.0)

0.36787944117144233

In [None]:

# zill_props = ['bathrooms',
#  'bedrooms',
#  'data',
#  'get_attr',
#  'graph_data_link',
#  'home_detail_link',
#  'home_size',
#  'home_type',
#  'last_sold_date',
#  'last_sold_price',
#  'last_sold_price_currency',
#  'latitude',
#  'longitude',
#  'map_this_home_link',
#  'property_size',
#  'tax_value',
#  'tax_year',
#  'year_built',]
