In [None]:
from sklearn.ensemble import RandomForestRegressor

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.diagnostic import het_breuschpagan
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.tree import DecisionTreeRegressor, plot_tree


from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV


from statsmodels.graphics.regressionplots import plot_leverage_resid2
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


In [None]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of each column as a multicollinearity check.

    The VIFs are computed on the result of ``sm.add_constant(x)``, so the
    printed series is indexed by the columns of that augmented frame.

    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned; the VIFs are printed as a pandas Series
    """
    # add_constant may emit a numpy FutureWarning about .ptp; keep it quiet.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    matrix = design.values
    factors = [
        variance_inflation_factor(matrix, position)
        for position in range(design.shape[1])
    ]

    print("VIF results\n-------------------------------")
    print(pd.Series(factors, index=design.columns))
    print("-------------------------------\n")
    
def eval_preds(y_true, y_pred, graph=False):
    """Print RMSE, MAE and MAPE for a set of predictions, optionally plotting them.

    :param y_true: actual target values (pandas Series or numpy array)
    :param y_pred: predicted values; ``y_true - y_pred`` must be a valid operation
    :param graph: when truthy, also draw a scatter plot of actual vs. predicted
        values together with a dashed red identity line
    :return: nothing is returned; the three metrics are printed
    """
    error = y_true - y_pred

    # np.abs is used instead of the pandas-only ``.abs()`` method so the
    # function also works when both inputs are plain numpy arrays.
    # The locals carry a ``_value`` suffix so they do not shadow the ``rmse``
    # helper imported from statsmodels at the top of the notebook.
    rmse_value = np.sqrt((error ** 2).mean())
    mae_value = np.abs(error).mean()
    # MAPE divides by y_true, so it is inf/nan wherever y_true contains zeros.
    mape_value = np.abs(error / y_true).mean()

    print(f"rmse {rmse_value}")
    print(f"mae {mae_value}")
    print(f"mape {mape_value}")

    if graph:
        # A perfect fit would put every point on the dashed identity line.
        line_pts = [y_true.min(), y_true.max()]
        plt.scatter(y_true, y_pred)
        plt.plot(line_pts, line_pts, c="red", ls="--", alpha=0.5)
        plt.xlabel("Actual")
        plt.ylabel("Fit")
        plt.show()


In [64]:
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from dask import dataframe as dd 
from dask.distributed import Client, progress
import joblib


from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor

from category_encoders import LeaveOneOutEncoder


In [2]:
# Load the full `houseprices` table from Postgres into a pandas DataFrame.
#
# Every connection setting can be overridden through an environment variable,
# so credentials do not have to be edited into the notebook. The fallbacks are
# the values this notebook has always used.
# NOTE(review): the fallback password is still stored in the notebook --
# presumably a shared course credential; confirm, or drop the default.
import os
from urllib.parse import quote

postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))
try:
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # dispose in `finally` so the engine is released even if the query fails
    engine.dispose()



In [4]:
# Start a Dask distributed client: 4 workers x 2 threads each, 2GB memory limit per worker.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit='2GB',
)

# Wrap the pandas frame in a Dask DataFrame split into 3 partitions.
sd = dd.from_pandas(df, npartitions=3)


In [5]:
# Deal with missing values: discard every column that is more than 40% missing,
# then drop any remaining row that still contains a NaN.
# The missing fraction is measured on the pandas frame `df` (the frame `sd`
# was built from) rather than on the Dask frame itself.
drop_cols = [col for col in sd.columns if df[col].isna().mean() > .4]
df_clean = sd.drop(columns=drop_cols).dropna()


In [11]:
# With Dask, conditional values are set through Series.mask rather than .loc.
# Second-floor indicator: 1 when the house has any second-floor area, else 0
# (the raw second-floor square footage can probably be dropped afterwards).
df_clean['secondflrexists'] = 0
df_clean['secondflrexists'] = df_clean['secondflrexists'].mask(
    df_clean['secondflrsf'] > 0, 1
)

# Squared construction year.
df_clean['yrsbltqared'] = df_clean['yearbuilt'] * df_clean['yearbuilt']

# Ages at the time of sale, measured from construction and from the remodel year.
df_clean['agebuilt'] = df_clean['yrsold'] - df_clean['yearbuilt']
df_clean['ageremodeled'] = df_clean['yrsold'] - df_clean['yearremodadd']

# Polynomial terms of the building age (these read the column created above).
df_clean['agebuiltsquared'] = df_clean['agebuilt'] * df_clean['agebuilt']
df_clean['agebuiltcubed'] = (
    df_clean['agebuilt'] * df_clean['agebuilt'] * df_clean['agebuilt']
)


In [35]:
# --- Polynomial and combined size/quality terms ---------------------------
df_clean['overallqualsquared'] = df_clean['overallqual'] * df_clean['overallqual']
df_clean['overallqualcubed'] = (
    df_clean['overallqual'] * df_clean['overallqual'] * df_clean['overallqual']
)
df_clean['grlivareasquared'] = df_clean['grlivarea'] * df_clean['grlivarea']
df_clean['ageremodeledsquared'] = df_clean['ageremodeled'] * df_clean['ageremodeled']
df_clean['totalsf'] = df_clean['totalbsmtsf'] + df_clean['grlivarea']
df_clean['lotareasquared'] = df_clean['lotarea'] * df_clean['lotarea']

# --- Neighbourhood tiers turned into 0/1 indicators -----------------------
niceneighborhoods = [
    'NridgHt', 'NoRidge', 'Somerst', 'Timber', 'Veenker', 'StoneBr',
]
goodneighborhoods = [
    'ClearCr', 'Crawfor', 'CollgCr', 'Gilbert', 'Blmngtn', 'SawyerW', 'NWAmes',
]

df_clean['nicehood'] = 0
df_clean['nicehood'] = df_clean['nicehood'].mask(
    df_clean['neighborhood'].isin(niceneighborhoods), 1
)
df_clean['goodhood'] = 0
df_clean['goodhood'] = df_clean['goodhood'].mask(
    df_clean['neighborhood'].isin(goodneighborhoods), 1
)

# --- Binary categoricals as integers --------------------------------------
# Central air: 1 when the column equals 'Y', otherwise 0.
df_clean['centralairint'] = 0
df_clean['centralairint'] = df_clean['centralairint'].mask(
    df_clean['centralair'] == 'Y', 1
)
# Pool: 1 when the recorded pool area is positive, otherwise 0.
df_clean['haspool'] = 0
df_clean['haspool'] = df_clean['haspool'].mask(df_clean['poolarea'] > 0, 1)

# --- Interaction terms -----------------------------------------------------
df_clean['nicehood_quality'] = df_clean['overallqual'] * df_clean['nicehood']
df_clean['nicehood_totalsf'] = df_clean['totalsf'] * df_clean['nicehood']
df_clean['goodhood_totalsf'] = df_clean['totalsf'] * df_clean['goodhood']

# Total outdoor living area; the pool area is deliberately left out of the sum.
df_clean['totaloutside'] = (
    df_clean['enclosedporch']
    + df_clean['wooddecksf']
    + df_clean['openporchsf']
    + df_clean['threessnporch']
    + df_clean['screenporch']
)
df_clean['totaloutside_quality'] = df_clean['overallqual'] * df_clean['totaloutside']


In [52]:
# Column groupings by type.
# NOTE(review): 'salepartial' is not created in the cells shown here -- confirm
# it exists before cat_cols is used to index df_clean.
cat_cols = ['nicehood', 'goodhood', 'centralairint', 'salepartial']
num_cols = [
    'overallqual',
    'overallqualsquared',
    'overallqualcubed',
    'grlivarea',
    'totalbsmtsf',
    'garagearea',
    'lotarea',
    'lotareasquared',
    'ageremodeled',
    'ageremodeledsquared',
    'agebuilt',
    'agebuiltsquared',
    'bedroomabvgr',
]

# Feature matrix: the raw and engineered columns fed to the model.
# 'neighborhood' is still a raw categorical column at this point.
X = df_clean[[
    'overallqual',
    'overallqualsquared',
    'overallqualcubed',
    'grlivarea',
    'grlivareasquared',
    'totalbsmtsf',
    'garagearea',
    'lotarea',
    'lotareasquared',
    'neighborhood',
    'ageremodeled',
    'ageremodeledsquared',
    'agebuilt',
    'agebuiltsquared',
    'centralairint',
    'bedroomabvgr',
    'nicehood_totalsf',
    'goodhood_totalsf',
    'totaloutside',
    'secondflrexists',
    'haspool',
]]

# Target: sale price, modelled on the natural-log scale.
y = df_clean['saleprice']
y_log = np.log(y)


In [59]:
# dask_ml is not available here, so the lazy Dask collections are materialised
# with .compute() and split with scikit-learn's train_test_split instead.
# 20% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_log_train, y_log_test = train_test_split(
    X.compute(),
    y_log.compute(),
    test_size=.2,
    random_state=1,
)


In [63]:
# Encode the categorical 'neighborhood' column with LeaveOneOutEncoder.
# The encoder is fitted on the training split (features plus log target) only;
# the test split is transformed with that already-fitted encoder.
encoder = LeaveOneOutEncoder(cols=["neighborhood"])
X_train = encoder.fit_transform(X_train, y_log_train)
X_test = encoder.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [67]:
# Hyper-parameter grid for the random forest:
# 3 depths x 1 forest size x 2 leaf sizes = 6 candidates.
grid = {"max_depth": [5,6,8], "n_estimators": [50],'min_samples_leaf':[5, 10]}

# The forest is seeded so that repeated runs of the notebook produce the same
# fitted models and scores (same seed value as the train/test split).
# No `scoring` is passed, so the search ranks candidates with the estimator's
# own `score` method.
model = GridSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid=grid,
    cv=4,
    verbose=1,
)

# Run the cross-validation fits through joblib's 'dask' backend.
with joblib.parallel_backend('dask'):
    model.fit(X_train, y_log_train)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    9.9s finished


In [69]:
# Score of the fitted search on the training split, then on the held-out test
# split, one value per line.
print(
    model.score(X_train, y_log_train),
    model.score(X_test, y_log_test),
    sep="\n",
)


0.918821199576981
0.8485988119939958
