In [52]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [53]:
# Raise pandas' display limit so up to 500 rows are shown before output is truncated.
pd.set_option('display.max_rows', 500)

In [54]:
# Load the Ames housing data; the path is relative to the notebook's working directory.
df = pd.read_csv('./../data/ames_housing_price_data_v4.csv')

In [55]:
def seasonal_adjust(df, mo_avg):
    """Add a seasonally adjusted sale price column to ``df``.

    Each row's ``SalePrice`` is divided by the ``monthly_avg_ratio`` that
    ``mo_avg`` holds for that row's ``MoSold`` value; the result is stored in
    a new ``SalePrice_adj`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SalePrice`` and ``MoSold`` columns. Modified in place.
        Any index is accepted (it does not have to be 0..n-1).
    mo_avg : pandas.DataFrame
        Indexed by the values that occur in ``MoSold`` and containing a
        ``monthly_avg_ratio`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``SalePrice_adj`` added.

    Raises
    ------
    KeyError
        If ``df`` contains a ``MoSold`` value that has no row in ``mo_avg``.
    """
    month_ratio = mo_avg['monthly_avg_ratio']

    # Fail loudly on unknown months instead of silently producing NaN prices.
    unknown = ~df['MoSold'].isin(month_ratio.index)
    if unknown.any():
        raise KeyError(
            f"MoSold values missing from mo_avg: {df.loc[unknown, 'MoSold'].unique().tolist()}"
        )

    # Vectorised lookup + division: label-aligned, so it works for any index
    # and yields a float column directly.
    df['SalePrice_adj'] = df['SalePrice'] / df['MoSold'].map(month_ratio)
    return df

In [56]:
# Per-month mean and standard deviation of the raw sale price, indexed by MoSold.
mo_avg = df.groupby('MoSold')['SalePrice'].agg(monthly_avg='mean', monthly_sd='std')

In [57]:
# Each month's mean price relative to the overall mean price.
overall_mean_price = df['SalePrice'].mean()
mo_avg['monthly_avg_ratio'] = mo_avg['monthly_avg'] / overall_mean_price

In [58]:
# Add the 'SalePrice_adj' column: SalePrice divided by its sale month's average-price ratio.
df = seasonal_adjust(df, mo_avg)

In [59]:
# Base-10 logarithm of the seasonally adjusted price.
df['SalePrice_adj_log'] = np.log10(df['SalePrice_adj'])

In [60]:
# Exclude atypical sale conditions: only 'Normal' and 'Partial' sales are kept below.

In [61]:
# Count how many rows fall under each SaleCondition value.
df['SaleCondition'].value_counts()

Normal     2413
Partial      82
Abnorml      61
Family       17
Alloca        4
AdjLand       2
Name: SaleCondition, dtype: int64

In [62]:
# Keep only the rows whose SaleCondition is 'Normal' or 'Partial'.
df = df[df['SaleCondition'].isin(['Normal', 'Partial'])]

In [63]:
# Candidate target Series: raw price, its log, the seasonally adjusted price, and log10 of the latter.
# NOTE(review): 'SalePrice_log' is not created in the visible cells — presumably it ships in the CSV; confirm its log base.
price = df['SalePrice']
price_log = df['SalePrice_log']
price_adj = df['SalePrice_adj']
price_adj_log = df['SalePrice_adj_log']

In [64]:
# Names of every column whose label contains '_log'.
log_cols = [name for name in df.columns if '_log' in name]
log_cols

['SalePrice_log',
 'LotFrontage_log',
 'LotArea_log',
 '1stFlrSF_log',
 '2ndFlrSF_log',
 'GrLivArea_log',
 'SalePrice_adj_log']

In [65]:
# Columns to one-hot encode with pd.get_dummies.
# Entries that are commented out are left in the frame as-is (not dummified).
to_dummify = [
    'Street_paved',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    #'OverallQual',
    #'OverallCond',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    #'ExterQual',
    #'ExterCond',
    'Foundation',
    'CentralAir',
    #'KitchenQual',
    #'FireplaceQu',
    'GarageFinish',
    #'GarageQual',
    #'GarageCond',
    'PavedDrive',
    #'PoolQC',
    'Fence',
    'MiscFeature',
    #'MoSold',#
    #'HeatingQC_ord',
    'LotShape_com',
    'MSZoning_com',
    'Heating_com',
    'Electrical_com',
    'LotConfig_com',
    #'BsmtCond_ord',
    #'BsmtQual_ord',
    'BsmtExposure_ord',
    'GarageType_com',
    #'number_floors',
    'attic',
    'PUD',
    #'Functional_ord',
    'Garage_age_bin',
    'Remod_age_bin',
    'SaleType',#
    'SaleCondition' #
]

In [66]:
# One-hot encode the listed columns; drop_first=True omits the first level of each to avoid a redundant column.
df = pd.get_dummies(df, columns = to_dummify, drop_first = True)

In [67]:
# Surrounding-average-price table, reduced to one ('PID', 'AvgPrice-0.5') pair per distinct row.
avg_price_df = pd.read_csv('../data/house_surrounding_avg_prices.csv')
avg_price_df2 = avg_price_df.loc[:, ['PID', 'AvgPrice-0.5']].drop_duplicates()

# Coordinate-derived feature table, minus its address/coordinate columns.
feature_df = pd.read_csv('../data/house_coordinates_1.0.csv').drop(
    columns=['Address', 'Coords4', 'latitude', 'longitude']
)

# Left-join both tables onto the housing frame. No `on=` is given, so pandas
# joins on every column name the two frames share.
df3 = df.merge(avg_price_df2, how='left').merge(feature_df, how='left')

In [68]:
# Row/column count of the merged frame.
df3.shape

(2495, 327)

In [69]:
model = ensemble.RandomForestRegressor()

# Candidate columns for removal; defined here but not applied in this cell.
droplist = ['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']
# Identifier, every SalePrice variant and 'sold_datetime' are never used as predictors.
alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log', 'sold_datetime']

# Take the target from df3 itself (before the column is dropped) so it shares
# df3's index and row order by construction, rather than relying on `df`
# happening to line up positionally with the merged frame.
price = df3['SalePrice']
df3 = df3.drop(columns=alwaysdrop)
df3 = df3.fillna(0)

# Fixed seed so the split (and every score derived from it) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(df3, price, test_size=0.3, random_state=0)

In [35]:
model.set_params(min_samples_leaf=2,min_samples_split=2,max_features=50,n_estimators=3000)
%time model.fit(x_train,y_train)
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))
features_importance=pd.DataFrame({'Features':x_train.columns,'Score':model.feature_importances_}).sort_values('Score')

CPU times: user 21.4 s, sys: 133 ms, total: 21.5 s
Wall time: 21.5 s
0.980326766068441
0.8801611752741914


In [36]:
# Features whose random-forest importance is below 0.0003.
low_importance = features_importance['Score'] < 0.0003
features_filtered = features_importance.loc[low_importance]

# Drop those low-importance columns from the modelling frame.
droplist = features_filtered['Features'].tolist()
df4 = df3.drop(columns=droplist)

df4.shape

(2495, 134)

In [197]:
# CatBoost regressor with fixed hyper-parameters; only n_estimators goes through the grid.
clf = CatBoostRegressor(
    learning_rate=0.05,
    subsample=0.9,
    logging_level='Silent',
    max_depth=4,
    random_state=0,
)

params = {'n_estimators': [4000]}

# 5-fold cross-validated search on the training split (all df3 features).
grid_search_cat = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1)
grid_search_cat.fit(x_train, y_train)

#'LotArea', 'OverallQual', 'OverallCond', 'ExterQual', 'BsmtUnfSF','TotalBsmtSF', 'GrLivArea_log', 'house_age_years', '7228_farmyard','8200_water'

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostRegressor object at 0x7fb859de46d0>,
             n_jobs=-1, param_grid={'n_estimators': [4000]})

In [198]:
# Score of the refit best estimator on the held-out split.
grid_search_cat.score(x_test, y_test)

0.9218493879696348

In [155]:
# NOTE(review): bare numeric literal — looks like a score pasted in by hand for reference; confirm its origin or remove the cell.
0.9495040832990003

0.9495040832990003

In [None]:
from datetime import datetime

# Greedy backward elimination: each round, refit once per candidate column with
# that column left out, then permanently drop the column whose removal gave the
# highest score. Stops once 10 columns remain.
# NOTE(review): candidates are ranked by their score on the held-out split, and
# every round uses a different split (random_state=j).
df5 = df4.copy()

clf = CatBoostRegressor(learning_rate=0.05, subsample=0.9, logging_level='Silent', max_depth=4)
params = {'n_estimators': [500]}

score_dict_total2 = {}   # round number -> best score reached in that round
col_dict_total2 = {}     # round number -> column removed in that round
j = 1

while len(df5.columns) > 10:
    x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=j)

    score_dict = {}
    for i in x_train2.columns:
        x_train_red = x_train2.drop(columns=i)
        x_test_red = x_test2.drop(columns=i)
        grid_search_cat = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search_cat.fit(x_train_red, y_train2)
        score = grid_search_cat.score(x_test_red, y_test2)
        score_dict[i] = score

    # Column whose removal produced the highest score, together with that score.
    min_col, best_score = max(score_dict.items(), key=lambda x: x[1])
    score_dict_total2[j] = best_score
    col_dict_total2[j] = min_col
    df5 = df5.drop(columns=min_col)

    print(f'{j} columns removed: {min_col}, best score is {score_dict_total2[j]}; time is {datetime.now()}')
    j += 1

1 columns removed: BSMT_GLQ, best score is 0.9119593122710785; time is 2021-08-31 11:31:15.026707
2 columns removed: GrLivArea, best score is 0.9405882494199279; time is 2021-08-31 11:38:01.048564
3 columns removed: GarageType_com_Detached, best score is 0.9426470864440233; time is 2021-08-31 11:44:41.890901
4 columns removed: GarageQual, best score is 0.9345638149037638; time is 2021-08-31 11:51:35.279496
5 columns removed: ScreenPorch, best score is 0.942938230782678; time is 2021-08-31 11:58:39.386850
6 columns removed: 7218_grass, best score is 0.9070830732291352; time is 2021-08-31 12:05:06.015257
7 columns removed: GarageCars, best score is 0.9337313825563965; time is 2021-08-31 12:11:06.847216
8 columns removed: 2421_shelter, best score is 0.8847652110576989; time is 2021-08-31 12:17:49.559041
9 columns removed: LotFrontage, best score is 0.9033638824830923; time is 2021-08-31 12:24:30.991323
10 columns removed: GarageType_com_Attached, best score is 0.8889018275358712; time is 

81 columns removed: 7203_residential, best score is 0.9221183984489617; time is 2021-08-31 15:52:41.416799
82 columns removed: LotShape_com_Slightly irregular, best score is 0.9196873574102451; time is 2021-08-31 15:53:39.039604
83 columns removed: 2519_optician, best score is 0.9261928269296251; time is 2021-08-31 15:54:36.363030
84 columns removed: 5270_parking_bicycle, best score is 0.8911982988125631; time is 2021-08-31 15:55:30.824611
85 columns removed: 2302_fast_food, best score is 0.933786498575874; time is 2021-08-31 15:56:19.612292
86 columns removed: RoofStyle_Gable, best score is 0.9028093151100088; time is 2021-08-31 15:57:08.089665
87 columns removed: 2082_school, best score is 0.905289937312848; time is 2021-08-31 15:57:50.665685
88 columns removed: Remod_age_bin_45+, best score is 0.9173871556306391; time is 2021-08-31 15:58:31.385490
89 columns removed: 2002_fire_station, best score is 0.9130378530408659; time is 2021-08-31 15:59:15.575695
90 columns removed: LotShape_

In [14]:
# NOTE(review): '../../tmp.csv' is not created by any visible cell — presumably a saved
# list of eliminated columns (it is read via its 'colname' column below); confirm and document its origin.
xyz=pd.read_csv('../../tmp.csv')

In [41]:
# Start from every name in xyz['colname'], then take these four features back
# out of the list (list.remove raises ValueError if one of them is absent).
l_fe = xyz['colname'].tolist()

for kept_col in (
    '5206_motorway_junction',
    'Neighborhood_StoneBr',
    'MasVnrType_Stone',
    'MSZoning_com_Residential',
):
    l_fe.remove(kept_col)


In [85]:
def cat_b(x_train,x_test,y_train,y_test):
    """Fit a cross-validated CatBoost regressor and score it on a test split.

    A CatBoostRegressor (learning_rate=0.05, subsample=0.9, max_depth=4,
    random_state=0, silent logging) is wrapped in a 5-fold GridSearchCV whose
    grid holds a single value, n_estimators=4000. The search is fitted on the
    training data and the refit estimator is scored on the test data.

    Parameters
    ----------
    x_train, y_train
        Features and target used for fitting.
    x_test, y_test
        Features and target used for the returned score.

    Returns
    -------
    The value of ``GridSearchCV.score(x_test, y_test)``.
    """
    search = GridSearchCV(
        CatBoostRegressor(
            learning_rate=0.05,
            subsample=0.9,
            logging_level='Silent',
            max_depth=4,
            random_state=0,
        ),
        param_grid={'n_estimators': [4000]},
        cv=5,
        n_jobs=-1,
    )
    return search.fit(x_train, y_train).score(x_test, y_test)

In [52]:
# Evaluate the feature set left after removing the l_fe columns, over 20 seeded splits.
df5 = df4.drop(columns=l_fe)
score_list = []
for i in range(20):
    x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=i)
    split_score = cat_b(x_train2, x_test2, y_train2, y_test2)
    score_list.append(split_score)

In [57]:
# Mean of the scores collected in score_list.
sum(score_list)/len(score_list)

0.9101403192748195

In [58]:
# Evaluate a fixed set of ten features over 20 seeded splits.
selected_features = [
    'LotArea', 'OverallQual', 'OverallCond', 'ExterQual', 'BsmtUnfSF',
    'TotalBsmtSF', 'GrLivArea_log', 'house_age_years', '7228_farmyard',
    '8200_water',
]
df5 = df4[selected_features]
score_list = []
for i in range(20):
    x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=i)
    split_score = cat_b(x_train2, x_test2, y_train2, y_test2)
    score_list.append(split_score)

In [59]:
# Mean of the scores collected in score_list.
sum(score_list)/len(score_list)

0.9088720001871934

In [66]:
# Single fit on the ten-feature set, keeping the fitted search object so its
# feature importances can be inspected afterwards.
df5=df4[['LotArea', 'OverallQual', 'OverallCond', 'ExterQual', 'BsmtUnfSF',
    'TotalBsmtSF', 'GrLivArea_log', 'house_age_years', '7228_farmyard',
    '8200_water']]
# Explicit seed. This cell used to pass `random_state=i`, i.e. whatever value a
# previously executed loop happened to leave in `i` (19 after `range(0, 20)`),
# so its result depended on hidden kernel state. 19 reproduces that split.
x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=19)
clf = CatBoostRegressor(learning_rate=0.05, subsample=0.9, logging_level='Silent',max_depth=4, random_state=0)

params = ({'n_estimators':[4000]})

grid_search_cat = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
grid_search_cat.fit(x_train2, y_train2)

grid_search_cat.score(x_test2, y_test2)


0.9367478649819595

In [70]:
# Pair each training column with the importance reported by the search's best (refit) estimator.
pd.DataFrame({'Name':x_train2.columns,'Importances':grid_search_cat.best_estimator_.feature_importances_})

Unnamed: 0,Name,Importances
0,LotArea,9.761092
1,OverallQual,23.659372
2,OverallCond,2.314539
3,ExterQual,5.099373
4,BsmtUnfSF,4.945722
5,TotalBsmtSF,15.845732
6,GrLivArea_log,24.599487
7,house_age_years,10.079956
8,7228_farmyard,1.550433
9,8200_water,2.144294


In [70]:
# Feature matrix for the VIF computation: df3 without the 'AvgPrice-0.5' column.
testdf=df3.drop(['AvgPrice-0.5'],axis=1)
#testdf=testdf.drop(alwaysdrop,axis=1)

In [72]:
# Variance inflation factor of every column in testdf.
design_matrix = testdf.values
viflist = [
    variance_inflation_factor(design_matrix, col_idx)
    for col_idx in range(testdf.shape[1])
]
list_of_tuples = list(zip(testdf.columns, viflist))

# Table with columns 'a' (feature name) and 'b' (VIF), highest VIF first.
# NOTE(review): this rebinds `df`, replacing the housing frame loaded at the top of the notebook.
df = pd.DataFrame(list_of_tuples, columns=['a', 'b']).sort_values('b', ascending=False)


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


In [74]:
# Names (column 'a') of the features whose VIF (column 'b') is below 5.
low_vif_mask = df['b'] < 5
low_corr_features = df.loc[low_vif_mask, 'a'].tolist()

In [86]:
# Evaluate a model restricted to the low-VIF features over 20 seeded splits.
df5 = df3.loc[:, low_corr_features]

score_list = []
for i in range(20):
    x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=i)
    split_score = cat_b(x_train2, x_test2, y_train2, y_test2)
    score_list.append(split_score)

In [87]:
# Mean of the scores collected in score_list.
sum(score_list)/len(score_list)

0.5527467173066151

In [88]:
# Display the feature names that passed the VIF < 5 filter.
low_corr_features

['ext_Wood_Siding',
 '2964_water_works',
 'HalfBath',
 'BsmtFullBath',
 'Fence_Minimum Privacy',
 'BldgType_Duplex',
 'PoolArea',
 'MiscVal',
 '5651_airport',
 'MasVnrArea',
 '2523_stationery',
 'LotConfig_com_Cul-de-sac lot',
 'PoolQC',
 'RoofStyle_Mansard',
 'Neighborhood_Greens',
 'ext_Common_Brick',
 'Remod_age_bin_30-45',
 'Foundation_Slab',
 'Remod_age_bin_15-30',
 'ext_Cement_Board',
 'GarageFinish_RFn',
 'GarageType_com_Basement',
 'ext_Plywood',
 'LandContour_Hillside (downward slope on both sides)',
 'RoofMatl_Wood Shakes',
 'Alley_Paved',
 'RoofMatl_Wood Shingles',
 'MiscFeature_Other',
 'RoofStyle_Shed',
 'LandContour_Depression (upward slope on both sides)',
 '5311_dam',
 'WoodDeckSF',
 'Fence_Good Wood',
 'OpenPorchSF',
 'LandSlope_Moderate-severe',
 'attic_Unfinished',
 'MiscFeature_Tennis Court',
 'SaleType_ConLD',
 'BldgType_2fmCon',
 'ext_Face_Brick',
 'LF_Adjacent_Feeder_St',
 'Neighborhood_GrnHill',
 'PavedDrive_Partial Pavement',
 'EnclosedPorch',
 'LF_Near_EW_RR',

In [None]:
from datetime import datetime

# Greedy backward elimination (this cell repeats the elimination loop that
# appears earlier in the notebook): each round, refit once per candidate column
# with that column left out, then drop the column whose removal gave the
# highest score. Stops once 10 columns remain.
# NOTE(review): candidates are ranked by their score on the held-out split, and
# every round uses a different split (random_state=j).
df5 = df4.copy()

clf = CatBoostRegressor(learning_rate=0.05, subsample=0.9, logging_level='Silent', max_depth=4)
params = {'n_estimators': [500]}

score_dict_total2 = {}   # round number -> best score reached in that round
col_dict_total2 = {}     # round number -> column removed in that round
j = 1

while len(df5.columns) > 10:
    x_train2, x_test2, y_train2, y_test2 = train_test_split(df5, price, test_size=0.3, random_state=j)

    score_dict = {}
    for i in x_train2.columns:
        x_train_red = x_train2.drop(columns=i)
        x_test_red = x_test2.drop(columns=i)
        grid_search_cat = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search_cat.fit(x_train_red, y_train2)
        score = grid_search_cat.score(x_test_red, y_test2)
        score_dict[i] = score

    # Column whose removal produced the highest score, together with that score.
    min_col, best_score = max(score_dict.items(), key=lambda x: x[1])
    score_dict_total2[j] = best_score
    col_dict_total2[j] = min_col
    df5 = df5.drop(columns=min_col)

    print(f'{j} columns removed: {min_col}, best score is {score_dict_total2[j]}; time is {datetime.now()}')
    j += 1