In [15]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

import pickle

In [16]:
def dummify(df, non_dummies, dummies):
    """One-hot encode categorical columns against a fixed list of dummy names.

    Every entry of ``dummies`` is expected to look like ``<column>_<level>``,
    where ``<column>`` is one of ``non_dummies``. For each such entry a 0/1
    column of that name is produced, holding 1 where ``df[<column>]`` equals
    ``<level>`` and 0 everywhere else (missing values included). Entries that
    match no name in ``non_dummies`` are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical source columns. It is not modified.
    non_dummies : list of str
        Names of the categorical source columns; all of them are removed
        from the result.
    dummies : list of str
        Names of the indicator columns to create, in output order.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns appended (or overwritten if
        a column of that name already existed) and ``non_dummies`` dropped.

    Raises
    ------
    KeyError
        If a name in ``non_dummies`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is not silently altered.
    encoded = df.copy()
    for dummified in dummies:
        # Match on the "<column>_" prefix rather than on any substring, and
        # prefer the longest prefix so that one source name being contained in
        # another (e.g. 'Type' vs 'BldgType') cannot select the wrong column.
        candidates = [col for col in non_dummies if dummified.startswith(f'{col}_')]
        if not candidates:
            continue
        source = max(candidates, key=len)
        level = dummified[len(source) + 1:]
        # Vectorised comparison; NaN compares unequal and therefore becomes 0.
        encoded[dummified] = (encoded[source] == level).astype('int64')
    return encoded.drop(non_dummies, axis=1)

In [17]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [18]:
# Read the v5 housing table; every later cell derives its frames from `df`.
df = pd.read_csv(filepath_or_buffer='data/ames_housing_price_data_v5.csv')

In [19]:
# Row filter: remove the two outlier PIDs, keep only 'Normal'/'Partial' sale
# conditions, and discard rows with zero bedrooms or 'Nonresidential' zoning.
rows_to_keep = (
    ~df['PID'].isin([902207130, 908154205])
    & df['SaleCondition'].isin(['Normal', 'Partial'])
    & (df['BedroomAbvGr'] != 0)
    & (df['MSZoning_com'] != 'Nonresidential')
)
# Renumber the surviving rows 0..n-1.
df = df[rows_to_keep].reset_index(drop=True)

In [20]:
# Keep a handle on the sale price and remove the log-price column.
# Note that 'SalePrice' itself stays in `df`; only 'SalePrice_log' is dropped.
price = df['SalePrice']
df = df.drop(columns=['SalePrice_log'])

In [21]:
# Attach the 'AvgPrice-0.5' column from the surrounding-average-price file.
avg_price_df = pd.read_csv('data/house_surrounding_avg_prices.csv')
avg_price_df2 = avg_price_df.loc[:, ['PID', 'AvgPrice-0.5']].drop_duplicates()

# Left join on whatever columns the two frames share (no explicit key given).
df2 = pd.merge(df, avg_price_df2, how='left')

In [22]:
#add geographical features
radial = pd.read_csv('data/house_coordinates_1.0.csv')
radial = radial.drop(columns='2204_park')

# Strip the "NNNN_" prefix (four digits and an underscore) from column names.
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
radial = radial.rename(columns={
    col: col[5:]
    for col in radial.columns
    if re.search(r'^\d\d\d\d_', str(col))
})

# Columns removed before merging into the housing table.
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]
radial = radial.drop(columns=rad_drops)

# NOTE(review): no `on=` is given, so the join key is every column name the
# two frames have in common after the renaming above -- presumably just 'PID';
# confirm against the CSV headers.
df2 = df2.merge(radial, how='left')

In [23]:
# Added by Hao-Wei: PID becomes the index (instead of being dropped) so it is
# still available for the app.
droplist = [
    'GarageFinish',
    'SaleCondition',
    'GarageType_com',
    'Garage_age_bin',
    'sold_datetime',
]
df2 = df2.set_index("PID").drop(columns=droplist)

In [24]:
#other rows to drop:
# droplist=['PID','GarageFinish','SaleCondition','GarageType_com','Garage_age_bin','sold_datetime']
# df2=df2.drop(droplist,axis=1)

In [25]:
# Replace every missing value in the merged frame with 0.
df2 = df2.fillna(value=0)

In [26]:
scaler = MinMaxScaler()

def fit_scale(col):
    """Refit the shared `scaler` on `df2[col]` and overwrite that column with the scaled values."""
    df2[[col]] = scaler.fit_transform(df2[[col]])

# Rescale the quality/condition scores (same order as before; the scaler is
# refit for each column, so after the loop it holds the 'KitchenQual' fit).
for quality_col in ('OverallQual', 'ExterQual', 'OverallCond', 'KitchenQual'):
    fit_scale(quality_col)

# Combined porch area: sum of the four porch columns.
df2['PorchSF'] = (
    df2['OpenPorchSF']
    + df2['EnclosedPorch']
    + df2['3SsnPorch']
    + df2['ScreenPorch']
)

# 1 where the sale type is 'New', 0 otherwise.
df2['SaleTypeNew'] = (df2['SaleType'] == 'New').astype('int64')

# Split the basement area: GLQ + ALQ goes to "high quality", the remainder of
# TotalBsmtSF to "low quality".
df2['BSMT_LowQual'] = df2['TotalBsmtSF'] - df2['BSMT_GLQ'] - df2['BSMT_ALQ']
df2['BSMT_HighQual'] = df2['BSMT_GLQ'] + df2['BSMT_ALQ']

In [27]:
# Columns taken from the housing table (original or derived in this notebook).
# 'SalePrice' is listed first so it is carried into the exported front-end file.
_house_features = [
    "SalePrice",
    'GrLivArea', 'LotArea', 'OverallQual', 'BSMT_LowQual',
    'house_age_years', 'GarageCars', 'MasVnrType',
    'FullBath', 'HalfBath', 'BsmtExposure_ord', 'SaleTypeNew',
    'Neighborhood', 'BldgType', 'PorchSF', 'BSMT_HighQual',
    'Fireplaces', 'Pool', 'BedroomAbvGr',
    'ExterQual', 'OverallCond', 'KitchenQual',
]

# Columns taken from the radial location data (used by the catboost model).
_radial_features = [
    'water_tower', 'graveyard', 'police', 'optician',
    'slipway', 'bar', 'cinema', 'supermarket',
    'hotel', 'stop', 'farmyard', 'christian_catholic',
    'jewish', 'muslim', 'garden_centre', 'christian_lutheran',
]

features_to_use = _house_features + _radial_features

In [28]:
# Select the front-end columns, write them to the v6 CSV (index included),
# and display the frame as the cell output.
front_end = df2.loc[:, features_to_use]

front_end.to_csv(path_or_buf='data/ames_housing_price_data_v6.csv')

front_end

Unnamed: 0_level_0,SalePrice,GrLivArea,LotArea,OverallQual,BSMT_LowQual,house_age_years,GarageCars,MasVnrType,FullBath,HalfBath,...,cinema,supermarket,hotel,stop,farmyard,christian_catholic,jewish,muslim,garden_centre,christian_lutheran
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
909176150,126000,856,7890,0.428571,856.0,71.210959,2.0,,1,0,...,0,4,0,0,0,1,0,0,0,3
905476230,139500,1049,4235,0.285714,104.0,25.104110,1.0,Brick Face,2,0,...,0,4,0,0,0,1,0,1,0,2
535377150,114000,1039,8146,0.142857,405.0,109.402740,1.0,,1,0,...,1,1,0,0,1,0,0,0,0,3
534177230,227000,1665,8400,0.714286,167.0,8.838356,2.0,,2,1,...,1,1,0,0,0,0,1,0,0,3
908128060,198500,1922,7301,0.571429,0.0,6.501370,2.0,Brick Face,3,0,...,0,4,3,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903205040,121000,952,8854,0.428571,952.0,93.394521,1.0,,1,0,...,0,3,0,0,0,0,0,0,0,3
905402060,139600,1733,13680,0.000000,0.0,54.452055,2.0,,2,0,...,0,4,0,0,0,0,0,1,0,1
909275030,145000,2002,6270,0.285714,1001.0,58.619178,3.0,,2,0,...,0,0,0,0,0,1,0,0,0,2
907192040,217500,1842,8826,0.571429,144.0,7.501370,2.0,Brick Face,2,1,...,0,0,3,1,13,0,0,0,0,0


In [93]:
# Names needed to turn the front-end frame into the back-end (model) frame.
# Each categorical column maps to the levels that get their own 0/1 column;
# the indicator names are generated as "<column>_<level>" in this exact order.
_dummy_levels = {
    'Neighborhood': [
        'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
        'Edwards', 'Gilbert', 'Greens', 'GrnHill', 'IDOTRR', 'Landmrk',
        'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge',
        'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
        'StoneBr', 'Timber', 'Veenker',
    ],
    'BldgType': ['2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'],
    'MasVnrType': ['None', 'Stone'],
}

dummies = [
    f'{column}_{level}'
    for column, levels in _dummy_levels.items()
    for level in levels
]

non_dummies = list(_dummy_levels)

In [116]:
#transformation of front-end to back-end, and catboost application
back_end = front_end.copy()

# Remove the target before predicting. The column is spelled 'SalePrice'
# (see features_to_use); the former label "Saleprice" does not exist and made
# DataFrame.drop raise KeyError.
back_end = back_end.drop(columns="SalePrice")

# Express the exterior/condition/kitchen scores as differences from OverallQual,
# then drop the raw scores.
back_end['ExterQualDisc']=back_end['ExterQual']-back_end['OverallQual']
back_end['OverallCondDisc']=back_end['OverallCond']-back_end['OverallQual']
back_end['KitchenQualDisc']=back_end['KitchenQual']-back_end['OverallQual']
back_end=back_end.drop(['ExterQual','OverallCond','KitchenQual'],axis=1)

# Replace the categorical columns with their 0/1 indicator columns.
back_end = dummify(back_end, non_dummies, dummies)

# Load the saved CatBoost model ("cbm" format) and predict on the back-end frame.
cbl = CatBoostRegressor()
cbl.load_model("Moritz/HousePriceCatBoost", "cbm")
cbl_pred = cbl.predict(back_end)

In [95]:
# Indicator columns for the linear model: the same Neighborhood / BldgType /
# MasVnrType levels as the catboost back-end, plus the binned basement areas.
# The generated names keep this exact order (note that the high-quality bins
# start with '500-1000').
_linear_dummy_levels = {
    'Neighborhood': [
        'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
        'Edwards', 'Gilbert', 'Greens', 'GrnHill', 'IDOTRR', 'Landmrk',
        'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge',
        'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
        'StoneBr', 'Timber', 'Veenker',
    ],
    'BldgType': ['2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'],
    'MasVnrType': ['None', 'Stone'],
    'BSMT_HighQual_bin': ['500-1000', '0-500', '1000-1500', '1500+'],
    'BSMT_LowQual_bin': ['0-500', '500-1000', '1000-1500', '1500+'],
}

dummies_linear = [
    f'{column}_{level}'
    for column, levels in _linear_dummy_levels.items()
    for level in levels
]

non_dummies_linear = list(_linear_dummy_levels)

In [96]:
# Build the design matrix for the linear model from the front-end frame.
# 'SalePrice' is the prediction target (the model is fit against log10(price)),
# so it must not be passed to the model as a predictor; front_end carries it
# only for the exported CSV.
# NOTE(review): assumes the pickled model was trained without a 'SalePrice'
# column -- confirm against the columns used when the model was fitted.
back_end_linear = front_end.drop(columns='SalePrice')

# Log-transform the two area features.
back_end_linear['GrLivArea_log'] = np.log10(back_end_linear['GrLivArea'])
back_end_linear['LotArea_log'] = np.log10(back_end_linear['LotArea'])

# Express the exterior/condition/kitchen scores as differences from OverallQual,
# then drop the raw scores.
back_end_linear['ExterQualDisc'] = back_end_linear['ExterQual'] - back_end_linear['OverallQual']
back_end_linear['OverallCondDisc'] = back_end_linear['OverallCond'] - back_end_linear['OverallQual']
back_end_linear['KitchenQualDisc'] = back_end_linear['KitchenQual'] - back_end_linear['OverallQual']
back_end_linear = back_end_linear.drop(['ExterQual','OverallCond','KitchenQual'], axis=1)

# Bin the two basement areas into labelled ranges.
# NOTE(review): the top edge is 2500, so any area above 2500 falls outside all
# bins and becomes NaN instead of '1500+' -- confirm no such rows are expected.
back_end_linear['BSMT_LowQual_bin'] = pd.cut(back_end_linear['BSMT_LowQual'], [-1, 1, 500, 1000, 1500, 2500], labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']) 
back_end_linear['BSMT_HighQual_bin'] = pd.cut(back_end_linear['BSMT_HighQual'], [-1, 1, 500, 1000, 1500, 2500], labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']) 
back_end_linear = back_end_linear.drop(['BSMT_HighQual', 'BSMT_LowQual', 'GrLivArea', 'LotArea'], axis = 1)

# Replace the categorical columns (including the new bins) with 0/1 indicators.
back_end_linear = dummify(back_end_linear, non_dummies_linear, dummies_linear)

In [120]:
# # code used to generate model

# kfold = KFold(n_splits=5, shuffle = True, random_state = 1)
# params_log = {'alpha' : [1e-7, 1e-6, 1e-5, 1e-4]
#           }
# lasso = Lasso(normalize = True, max_iter = 1000, tol = 0.001)
# lasso_tuner = GridSearchCV(lasso, params_log, cv=kfold, return_train_score = True)
# lasso_tuner.fit(back_end_linear, np.log10(price))
# with open('./Matt/linearmodel.pickle', mode = 'wb') as file:
#     pickle.dump(lasso_tuner.best_estimator_, file)

In [121]:
# Load the pickled linear model from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that come from a trusted source.
with open('./Matt/linearmodel.pickle', mode = 'rb') as model_file:
    lm = pickle.load(model_file)

# The model predicts log10(price), so raise 10 to the prediction to get prices.
lm_pred = np.power(10, lm.predict(back_end_linear))