In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR

import pickle

In [2]:
def dummify(df, non_dummies, dummies):
    """Expand categorical columns into a fixed set of 0/1 indicator columns.

    Every name in ``dummies`` is expected to have the form
    ``<source>_<level>``, where ``<source>`` is one of ``non_dummies``.
    For each such name a column is written that holds 1 where
    ``df[source] == level`` and 0 everywhere else (missing values count
    as 0). Names in ``dummies`` that do not start with any
    ``<source>_`` prefix are skipped.

    When several sources are a prefix of the same dummy name (for example
    ``BSMT_HighQual`` and ``BSMT_HighQual_bin``), the longest one is used,
    so the result does not depend on the order of ``non_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``non_dummies``.
    non_dummies : list of str
        Source (categorical) columns; they are removed from the result.
    dummies : list of str
        Indicator column names to create, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        A new frame with the indicator columns appended and the source
        columns dropped. ``df`` itself is not modified.
    """
    # Work on a copy so the caller's frame is not silently widened.
    encoded = df.copy()

    for dummy_name in dummies:
        # Match on the full "<source>_" prefix rather than on a substring:
        # a substring test lets 'A' claim 'AB_x' and overwrite the correct
        # indicator with zeros.
        candidates = [src for src in non_dummies if dummy_name.startswith(f'{src}_')]
        if not candidates:
            continue
        source = max(candidates, key=len)
        # Slice the prefix off instead of str.replace, which would also
        # remove later occurrences of the prefix inside the level name.
        level = dummy_name[len(source) + 1:]
        encoded[dummy_name] = (encoded[source] == level).astype('int64')

    return encoded.drop(non_dummies, axis=1)

In [3]:
# Let pandas print up to 500 rows before truncating displayed frames/series.
pd.options.display.max_rows = 500

In [4]:
# Load the Ames housing price data (v5 CSV) from the sibling data directory.
# The path is relative to the notebook's working directory.
df = pd.read_csv('./../data/ames_housing_price_data_v5.csv')

In [5]:
# Keep only the rows used for modelling, then renumber the index from 0:
#   * drop the two outlier sales identified by PID
#   * keep 'Normal' and 'Partial' sale conditions only
#   * drop houses with zero above-ground bedrooms
#   * drop properties zoned 'Nonresidential'
df = df[
    ~df['PID'].isin([902207130, 908154205])
    & df['SaleCondition'].isin(['Normal', 'Partial'])
    & (df['BedroomAbvGr'] != 0)
    & (df['MSZoning_com'] != 'Nonresidential')
].reset_index(drop=True)

In [6]:
# Pull the target out of the table: raw sale price and its base-10 log.
price = df['SalePrice']
price_log = np.log10(price)

# Only the precomputed log column is removed here; 'SalePrice' itself is
# deliberately left in df.
df = df.drop(columns=['SalePrice_log'])

In [7]:
# Add the radial average price feature (only the 'AvgPrice-0.5' column is
# used; 'AvgPrice-0.25' is left out).
avg_price_df = pd.read_csv('./../data/house_surrounding_avg_prices.csv')
avg_price_df2 = avg_price_df[['PID', 'AvgPrice-0.5']].drop_duplicates()

# No `on=` is given, so pandas joins on every column name the two frames
# share -- presumably just 'PID'; TODO confirm.
df2 = df.merge(avg_price_df2, how='left')

# `price` / `price_log` were taken from df and are later paired with df2 by
# position. A key that appears twice on the right side would make the left
# merge add rows and silently misalign features and target, so fail loudly.
assert len(df2) == len(df), (
    f'average-price merge changed the row count: {len(df)} -> {len(df2)}'
)

In [8]:
# Add geographical features from the house-coordinates table.
radial = pd.read_csv('./../data/house_coordinates_1.0.csv')
radial = radial.drop(columns='2204_park')

# Strip the leading "<4 digits>_" prefix from column names. A raw string is
# used for the pattern: '\d' in a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
prefixed_cols = {
    col: col[5:]
    for col in radial.columns
    if re.search(r'^\d\d\d\d_', str(col))
}
radial = radial.rename(columns=prefixed_cols)

# Columns of the coordinates table that are not carried into the model data.
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]
radial = radial.drop(columns=rad_drops)

# As with the previous merge, no `on=` is given: the join uses whatever
# column names df2 and radial still share (presumably 'PID'; TODO confirm).
df2 = df2.merge(radial, how='left')

# df2 is later paired with `price` (taken from df) by position, so the merge
# must not add rows.
assert len(df2) == len(df), (
    f'radial merge changed the row count: expected {len(df)}, got {len(df2)}'
)

In [9]:
# Columns (not rows) that are removed before modelling.
droplist = [
    'PID',
    'GarageFinish',
    'SaleCondition',
    'GarageType_com',
    'Garage_age_bin',
    'sold_datetime',
]
df2 = df2.drop(columns=droplist)

In [10]:
# Replace every missing value with 0. This applies to all columns of df2,
# so missing entries in non-numeric columns also become the number 0.
df2=df2.fillna(0)

In [11]:
# One shared scaler object; it is re-fit for every column it is applied to.
scaler = MinMaxScaler()

def fit_scale(col):
    """Min-max scale column ``col`` of the global ``df2`` in place.

    The shared ``scaler`` is fit on that single column and the column is
    overwritten with the scaled values, so after the call ``scaler`` holds
    the statistics of ``col`` only.
    """
    df2[[col]] = scaler.fit_transform(df2[[col]])

# Bring the four quality/condition scores onto a common scale so they can be
# subtracted from each other later on.
for quality_col in ('OverallQual', 'ExterQual', 'OverallCond', 'KitchenQual'):
    fit_scale(quality_col)

# Total porch area: sum of the four porch-type areas.
df2['PorchSF'] = (
    df2['OpenPorchSF']
    + df2['EnclosedPorch']
    + df2['3SsnPorch']
    + df2['ScreenPorch']
)

# 1 when the sale type is 'New', 0 otherwise.
df2['SaleTypeNew'] = (df2['SaleType'] == 'New').astype('int64')

# Split basement area into the GLQ + ALQ part ("high quality") and
# everything else ("low quality").
df2['BSMT_LowQual'] = df2['TotalBsmtSF'] - df2['BSMT_GLQ'] - df2['BSMT_ALQ']
df2['BSMT_HighQual'] = df2['BSMT_GLQ'] + df2['BSMT_ALQ']

In [12]:
# Columns of df2 that make up the model "front end". The order of this list
# is the column order of front_end, which the back-end frames passed to the
# saved models inherit.
features_to_use=[
    ### from original dataset
    'GrLivArea', 
    'LotArea', 
    'OverallQual',
    'BSMT_LowQual', 
    'house_age_years', 
    'GarageCars',
    'MasVnrType',
    'FullBath',
    'HalfBath',
    'BsmtExposure_ord',
    'SaleTypeNew',
    'Neighborhood',
    'BldgType',
    'PorchSF',
    'BSMT_HighQual',
    'Fireplaces',
    'Pool',
    'BedroomAbvGr',
    'ExterQual',
    'OverallCond',
    'KitchenQual',
    
    ### from radial location data for catboost
    'water_tower',
    'graveyard',
    'police', 
    'optician',
    'slipway',
    'bar',
    'cinema',
    'supermarket',
    'hotel',
    'stop',
    'farmyard',
    'christian_catholic', 
    'jewish',
    'muslim',
    'garden_centre',
    'christian_lutheran'                 
]

In [13]:
# Front-end feature table: the selected columns of df2, in list order.
# Later cells call .copy() on it before adding model-specific columns.
front_end=df2[features_to_use]

# Optional export of the front-end table (disabled).
# front_end.to_csv('data/ames_housing_price_data_v6.csv')

In [14]:
#function and dictionaries required to transform front-end to back-end
# Indicator columns created by dummify() for the CatBoost back end. Each name
# is '<source column>_<level>'; list order is the order the columns are
# appended in. A level that is not listed gets no column of its own, so rows
# with that level are 0 in every indicator of that source (presumably the
# reference level -- TODO confirm).
dummies = [   
    'Neighborhood_Blueste',
    'Neighborhood_BrDale',
    'Neighborhood_BrkSide',
    'Neighborhood_ClearCr',
    'Neighborhood_CollgCr',
    'Neighborhood_Crawfor',
    'Neighborhood_Edwards',
    'Neighborhood_Gilbert', 
    'Neighborhood_Greens', 
    'Neighborhood_GrnHill',
    'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk',
    'Neighborhood_MeadowV',
    'Neighborhood_Mitchel',
    'Neighborhood_NAmes',
    'Neighborhood_NPkVill',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Neighborhood_OldTown',
    'Neighborhood_SWISU',
    'Neighborhood_Sawyer',
    'Neighborhood_SawyerW',
    'Neighborhood_Somerst',
    'Neighborhood_StoneBr',
    'Neighborhood_Timber',
    'Neighborhood_Veenker',
    'BldgType_2fmCon',
    'BldgType_Duplex',
    'BldgType_Twnhs',
    'BldgType_TwnhsE',
    'MasVnrType_None',
    'MasVnrType_Stone'
    ]

# Source columns that dummify() expands and then drops.
non_dummies=['Neighborhood', 'BldgType', 'MasVnrType']

In [15]:
# Front end -> CatBoost back end.
# The three secondary quality scores are re-expressed as their difference
# from OverallQual (the *Disc columns) and the raw scores are dropped.
back_end = front_end.assign(
    ExterQualDisc=front_end['ExterQual'] - front_end['OverallQual'],
    OverallCondDisc=front_end['OverallCond'] - front_end['OverallQual'],
    KitchenQualDisc=front_end['KitchenQual'] - front_end['OverallQual'],
).drop(columns=['ExterQual', 'OverallCond', 'KitchenQual'])

# Replace the categorical columns with the fixed set of indicator columns.
back_end = dummify(back_end, non_dummies, dummies)

# Load the previously trained CatBoost model from disk ('cbm' format) and
# predict on the full back-end table.
cbl = CatBoostRegressor()
cbl.load_model("./../Moritz/HousePriceCatBoost", "cbm")
cbl_pred = cbl.predict(back_end)

In [16]:
# Indicator columns created by dummify() for the linear back end: the same
# Neighborhood / BldgType / MasVnrType indicators as `dummies`, followed by
# indicators for the binned basement areas. The 'No basement' bin label is
# not listed, so rows in that bin are 0 in every basement-bin indicator.
dummies_linear = [   
    'Neighborhood_Blueste',
    'Neighborhood_BrDale',
    'Neighborhood_BrkSide',
    'Neighborhood_ClearCr',
    'Neighborhood_CollgCr',
    'Neighborhood_Crawfor',
    'Neighborhood_Edwards',
    'Neighborhood_Gilbert', 
    'Neighborhood_Greens', 
    'Neighborhood_GrnHill',
    'Neighborhood_IDOTRR',
    'Neighborhood_Landmrk',
    'Neighborhood_MeadowV',
    'Neighborhood_Mitchel',
    'Neighborhood_NAmes',
    'Neighborhood_NPkVill',
    'Neighborhood_NWAmes',
    'Neighborhood_NoRidge',
    'Neighborhood_NridgHt',
    'Neighborhood_OldTown',
    'Neighborhood_SWISU',
    'Neighborhood_Sawyer',
    'Neighborhood_SawyerW',
    'Neighborhood_Somerst',
    'Neighborhood_StoneBr',
    'Neighborhood_Timber',
    'Neighborhood_Veenker',
    'BldgType_2fmCon',
    'BldgType_Duplex',
    'BldgType_Twnhs',
    'BldgType_TwnhsE',
    'MasVnrType_None',
    'MasVnrType_Stone',
    'BSMT_HighQual_bin_500-1000',
    'BSMT_HighQual_bin_0-500',
    'BSMT_HighQual_bin_1000-1500',
    'BSMT_HighQual_bin_1500+',
    'BSMT_LowQual_bin_0-500',
    'BSMT_LowQual_bin_500-1000',
    'BSMT_LowQual_bin_1000-1500',
    'BSMT_LowQual_bin_1500+'
    ]

# Source columns that dummify() expands and then drops for the linear model.
non_dummies_linear = ['Neighborhood', 'BldgType', 'MasVnrType', 'BSMT_HighQual_bin', 'BSMT_LowQual_bin']

In [17]:
# Front end -> back end for the linear model.
back_end_linear = front_end.copy()

# Log-transform (base 10) the two area features.
back_end_linear['GrLivArea_log'] = np.log10(back_end_linear['GrLivArea'])
back_end_linear['LotArea_log'] = np.log10(back_end_linear['LotArea'])

# Express the secondary quality scores as their difference from OverallQual,
# then drop the raw scores.
back_end_linear['ExterQualDisc'] = back_end_linear['ExterQual'] - back_end_linear['OverallQual']
back_end_linear['OverallCondDisc'] = back_end_linear['OverallCond'] - back_end_linear['OverallQual']
back_end_linear['KitchenQualDisc'] = back_end_linear['KitchenQual'] - back_end_linear['OverallQual']
back_end_linear = back_end_linear.drop(['ExterQual', 'OverallCond', 'KitchenQual'], axis=1)

# Bin both basement areas with one shared set of edges and labels.
# Intervals are right-closed, so the first bin (-1, 1] holds areas of 0 or 1.
# The top edge is np.inf so that the last bin really is "1500+": with a
# finite upper edge pd.cut returns NaN for any larger area, which would then
# flow into the indicator columns.
bsmt_bin_edges = [-1, 1, 500, 1000, 1500, np.inf]
bsmt_bin_labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']
back_end_linear['BSMT_LowQual_bin'] = pd.cut(
    back_end_linear['BSMT_LowQual'], bsmt_bin_edges, labels=bsmt_bin_labels
)
back_end_linear['BSMT_HighQual_bin'] = pd.cut(
    back_end_linear['BSMT_HighQual'], bsmt_bin_edges, labels=bsmt_bin_labels
)

# The raw areas are replaced by their log / binned versions.
back_end_linear = back_end_linear.drop(
    ['BSMT_HighQual', 'BSMT_LowQual', 'GrLivArea', 'LotArea'], axis=1
)

# Replace the categorical and binned columns with fixed indicator columns.
back_end_linear = dummify(back_end_linear, non_dummies_linear, dummies_linear)

In [18]:
# # Code used to generate the pickled linear model (kept for reference, not executed)

# kfold = KFold(n_splits=5, shuffle = True, random_state = 1)
# params_log = {'alpha' : [1e-7, 1e-6, 1e-5, 1e-4]
#           }
# lasso = Lasso(normalize = True, max_iter = 1000, tol = 0.001)
# lasso_tuner = GridSearchCV(lasso, params_log, cv=kfold, return_train_score = True)
# lasso_tuner.fit(back_end_linear, np.log10(price))
# with open('./Matt/linearmodel.pickle', mode = 'wb') as file:
#     pickle.dump(lasso_tuner.best_estimator_, file)

In [19]:
# Load the previously fitted linear model.
# NOTE(review): pickle.load executes arbitrary code from the file, so this
# must only ever be pointed at a trusted, project-generated pickle.
with open('./../Matt/linearmodel.pickle', 'rb') as file:
    lm = pickle.load(file)

# The predictions are exponentiated with base 10, i.e. the model output is
# treated as log10 of the price.
lm_pred = np.power(10, lm.predict(back_end_linear))

In [20]:
# Support vector regressor with a linear kernel; its C and epsilon are tuned
# by the grid search defined further down.
svr = SVR(kernel = 'linear')

In [21]:
# 5-fold cross-validation splitter; shuffling is seeded so folds are repeatable.
kfold = KFold(n_splits=5, shuffle = True, random_state = 0)

In [41]:
# Hyper-parameter grid for the SVR search: 3 values of C x 3 values of epsilon.
params = {
    'C' : [1, 10, 100],
    'epsilon' : [1e-7, 1e-6, 1e-5]    
}

In [23]:
# Separate standardizers for the feature matrix and for the target.
ss_X = StandardScaler()
ss_y = StandardScaler()

In [37]:
# Standardize features and log10 price. The resulting frames have integer
# column labels; the original feature names stay on back_end_linear.columns.
# NOTE(review): both scalers are fit on the full data set before the
# cross-validated search, so each validation fold has already contributed to
# the scaling statistics.
X_trans = pd.DataFrame(ss_X.fit_transform(back_end_linear))
y_trans = pd.DataFrame(ss_y.fit_transform(price_log.to_numpy().reshape(-1, 1)))

In [38]:
# Small slice of the scaled features: .loc slicing is label-based and
# inclusive, so this is rows 0 through 25 (26 rows). It is not referenced by
# the later cells visible in this notebook.
subsample = X_trans.loc[0:25,:]

In [39]:
# Matching slice of the scaled target (rows 0 through 25, inclusive).
subprice = y_trans.loc[0:25]

In [47]:
# Reduce the one-column frame to its single column (label 0).
# NOTE(review): this rebinds the same name, so re-running the cell on its own
# indexes the already-reduced object again.
subprice = subprice[0]

In [53]:
# Reduce the one-column target frame to its single column (label 0) so the
# estimator receives a 1-D target.
# NOTE(review): this rebinds the same name, so re-running the cell on its own
# indexes the already-reduced object again.
y_trans = y_trans[0]

In [54]:
# Exhaustive search over `params` for the linear SVR using the 5-fold
# splitter; training-fold scores are recorded alongside validation scores.
svr_tuner = GridSearchCV(svr, params, cv = kfold, return_train_score = True)

In [55]:
# Run the grid search on the full standardized data set.
svr_tuner.fit(X_trans, y_trans)

GridSearchCV(cv=KFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVR(kernel='linear'),
             param_grid={'C': [1, 10, 100], 'epsilon': [1e-07, 1e-06, 1e-05]},
             return_train_score=True)

In [56]:
# Mean cross-validated score of the best parameter combination.
svr_tuner.best_score_

0.932199436563035

In [57]:
# Parameter combination that achieved the best cross-validated score.
svr_tuner.best_params_

{'C': 10, 'epsilon': 1e-05}

In [68]:
# Keep a handle on the estimator fitted with the best parameters.
svr_model = svr_tuner.best_estimator_

In [70]:
# Number of support vectors kept by the best estimator.
svr_tuner.best_estimator_.n_support_

array([2471])

In [67]:
# Coefficients of the best linear SVR, labelled with the feature names and
# ordered from largest to smallest. They refer to the standardized features
# and standardized log10 price the model was fit on.
svr_coefs = pd.Series(
    data=svr_tuner.best_estimator_.coef_[0],
    index=back_end_linear.columns,
).sort_values(ascending=False)

# Split into features with an exactly-zero coefficient and all the others.
is_zero = svr_coefs == 0
ignored_svr = svr_coefs[is_zero]
feat_imp_svr = svr_coefs[~is_zero]

# Report the count and contents of each group: used features first, then
# ignored ones.
for coef_group in (feat_imp_svr, ignored_svr):
    print(len(coef_group))
    print(coef_group)

73
OverallQual                    0.373164
GrLivArea_log                  0.355332
OverallCondDisc                0.205682
BSMT_HighQual_bin_1000-1500    0.151763
LotArea_log                    0.147699
BSMT_HighQual_bin_500-1000     0.133188
BSMT_LowQual_bin_1000-1500     0.128874
BSMT_LowQual_bin_500-1000      0.107150
BSMT_HighQual_bin_1500+        0.099126
GarageCars                     0.081549
BSMT_LowQual_bin_0-500         0.068544
BSMT_LowQual_bin_1500+         0.066955
Neighborhood_Crawfor           0.066203
BSMT_HighQual_bin_0-500        0.048786
Fireplaces                     0.041137
Neighborhood_StoneBr           0.035711
PorchSF                        0.035509
BsmtExposure_ord               0.034860
KitchenQualDisc                0.034602
Neighborhood_GrnHill           0.033683
Neighborhood_NridgHt           0.033647
Neighborhood_Somerst           0.031348
Neighborhood_NoRidge           0.024612
garden_centre                  0.023470
bar                            0.0223