In [22]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
import time

In [23]:
pd.set_option('display.max_rows', 500)

In [24]:
df = pd.read_csv('./../data/ames_housing_price_data_v4.csv')

In [25]:
def seasonal_adjust(df, mo_avg):
    df['SalePrice_adj'] = 0 * len(df)
    for i in range(len(df)):
        price_norm = mo_avg
        df.loc[i, 'SalePrice_adj'] = (df.loc[i, 'SalePrice']) / mo_avg.loc[df.loc[i, 'MoSold'], 'monthly_avg_ratio']
    return df

In [26]:
mo_avg = df.groupby('MoSold').agg(monthly_avg = ('SalePrice', 'mean'), monthly_sd = ('SalePrice', 'std'))

In [27]:
mo_avg['monthly_avg_ratio'] = mo_avg['monthly_avg'] / np.mean(df['SalePrice'])

In [28]:
df = seasonal_adjust(df, mo_avg)

In [29]:
df['SalePrice_adj_log'] = np.log10(df['SalePrice_adj'])

In [30]:
#exclude weird sale types

In [31]:
df['SaleCondition'].value_counts()

Normal     2413
Partial      82
Abnorml      61
Family       17
Alloca        4
AdjLand       2
Name: SaleCondition, dtype: int64

In [32]:
df=df[(df['SaleCondition']=='Normal') | (df['SaleCondition']=='Partial')]

In [33]:
price = df['SalePrice']
price_log = df['SalePrice_log']
price_adj = df['SalePrice_adj']
price_adj_log = df['SalePrice_adj_log']

In [34]:
log_cols = []
for col in df.columns:
    if '_log' in col:
        log_cols.append(col)
log_cols

['SalePrice_log',
 'LotFrontage_log',
 'LotArea_log',
 '1stFlrSF_log',
 '2ndFlrSF_log',
 'GrLivArea_log',
 'SalePrice_adj_log']

In [35]:
to_dummify = [
    'Street_paved',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    #'OverallQual',
    #'OverallCond',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    #'ExterQual',
    #'ExterCond',
    'Foundation',
    'CentralAir',
    #'KitchenQual',
    #'FireplaceQu',
    'GarageFinish',
    #'GarageQual',
    #'GarageCond',
    'PavedDrive',
    #'PoolQC',
    'Fence',
    'MiscFeature',
    #'MoSold',#
    #'HeatingQC_ord',
    'LotShape_com',
    'MSZoning_com',
    'Heating_com',
    'Electrical_com',
    'LotConfig_com',
    #'BsmtCond_ord',
    #'BsmtQual_ord',
    'BsmtExposure_ord',
    'GarageType_com',
    #'number_floors',
    'attic',
    'PUD',
    #'Functional_ord',
    'Garage_age_bin',
    'Remod_age_bin',
    'SaleType',#
    'SaleCondition' #
]

In [36]:
df = pd.get_dummies(df, columns = to_dummify, drop_first = True)

In [37]:
droplist = ['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']
alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log', 'sold_datetime']
df3 = df.drop((alwaysdrop), axis = 1) #+ droplist

In [38]:
df3.head().T

Unnamed: 0,0,1,2,3,4
GrLivArea,856.0,1049.0,1001.0,1039.0,1665.0
LotFrontage,177.651344,42.0,60.0,80.0,70.0
LotArea,7890.0,4235.0,6060.0,8146.0,8400.0
OverallQual,6.0,5.0,5.0,4.0,8.0
OverallCond,6.0,5.0,9.0,8.0,6.0
MasVnrArea,0.0,149.0,0.0,0.0,0.0
ExterQual,3.0,4.0,4.0,4.0,4.0
ExterCond,3.0,3.0,3.0,4.0,3.0
BsmtUnfSF,618.0,104.0,100.0,405.0,167.0
TotalBsmtSF,856.0,1049.0,837.0,405.0,810.0


In [39]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#randomForest = ensemble.RandomForestClassifier()
model = ensemble.RandomForestRegressor()

x_train, x_test, y_train, y_test = train_test_split(df3, price, test_size=0.3)


In [45]:
model.set_params(min_samples_leaf=2,min_samples_split=2,max_features=50,n_estimators=3000)
model.fit(x_train,y_train)
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))
features_importance=pd.DataFrame({'Features':x_train.columns,'Score':model.feature_importances_}).sort_values('Score')

0.9765539680776146
0.8986533866354087


In [None]:
features_filtered=features_importance[features_importance['Score']<0.000354]

alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log', 'sold_datetime']

droplist = list(features_filtered['Features']) + alwaysdrop
price = df['SalePrice']

avg_price_df=pd.read_csv('../data/house_surrounding_avg_prices.csv')
avg_price_df2=avg_price_df[['PID','AvgPrice-0.25']].drop_duplicates() #,'AvgPrice-0.5'
df3=df.merge(avg_price_df2,how='left')
df3=df3.merge(feature_df, how='left')

df3 = df3.drop(droplist, axis = 1) #+ droplist
df3=df3.fillna(0)
df3.shape

(2495, 208)

In [None]:
# grid_para_tree = [{
#     "min_samples_leaf": range(3, 10),
#     "min_samples_split": np.linspace(start=5, stop=20, num=5, dtype=int),
#     "n_estimators": [300],
#     "max_features":[30,35,40,45,50,55,60] #'auto', 'sqrt', 'log2', 5, 10, ,70,100,150
# }]

# grid_search_tree = GridSearchCV(
#     model, grid_para_tree, cv=5, n_jobs=-1)

# %time grid_search_tree.fit(x_train, y_train)
# print(grid_search_tree.score(x_train, y_train))
# print(grid_search_tree.score(x_test, y_test))
# grid_search_tree.best_params_
# #grid_search_tree.cv_results_['mean_test_score']

In [121]:
model3 = ensemble.RandomForestRegressor()

x_train, x_test, y_train, y_test = train_test_split(df3, price, test_size=0.3)

model3.set_params(min_samples_leaf=3,min_samples_split=3,max_features=50,n_estimators=3000)
model3.fit(x_train,y_train)
print("-"*50)
print(model3.score(x_train, y_train))
print(model3.score(x_test, y_test))
print("-"*50)

--------------------------------------------------
0.9733309399452857
0.9015840434010829
--------------------------------------------------


In [113]:
features_filtered=features_importance[features_importance['Score']<0.000354]


feature_df=pd.read_csv('../data/house_coordinates_1.0.csv')
feature_df=feature_df.drop(['Address','Coords4','latitude','longitude'],axis=1)

features_importance2=pd.DataFrame({'Features':x_train.columns,'Score':model3.feature_importances_}).sort_values('Score')

features_filtered2=features_importance[features_importance['Score']<0.0001]


alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log', 'sold_datetime']

droplist = list(features_filtered['Features']) + alwaysdrop
price = df['SalePrice']

avg_price_df=pd.read_csv('../data/house_surrounding_avg_prices.csv')
avg_price_df2=avg_price_df[['PID','AvgPrice-0.25']].drop_duplicates() #,'AvgPrice-0.5'
df3=df.merge(avg_price_df2,how='left')
df3=df3.merge(feature_df, how='left')

df3 = df3.drop(droplist, axis = 1) #+ droplist
df3=df3.fillna(0)
df3.shape

CPU times: user 6.29 s, sys: 684 ms, total: 6.98 s
Wall time: 7min 25s
0.9684912914952565
0.9177168623256049


{'max_features': 50,
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'n_estimators': 300}

In [115]:
grid_search_tree.best_params_

{'max_features': 50,
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'n_estimators': 300}

Unnamed: 0,Features,Score
170,4141_beach,0.0
117,2510_general,0.0
112,2422_camp_site,0.0
176,5209_street_lamp,0.0
200,7216_vineyard,0.0
109,2402_motel,0.0
168,4101_spring,0.0
159,2961_wastewater_plant,0.0
81,2008_town_hall,0.0
187,5655_helipad,0.0


In [80]:
model2 = ensemble.RandomForestRegressor()

x_train, x_test, y_train, y_test = train_test_split(df3, price, test_size=0.3)
feat=int(len(df3.columns)/3)
model2.set_params(min_samples_leaf=2,min_samples_split=2,max_features=feat,n_estimators=3000)
model2.fit(x_train,y_train)
print("-"*50)
print(model2.score(x_train, y_train))
print(model2.score(x_test, y_test))
print("-"*50)

--------------------------------------------------
0.9758587204462502
0.9210692714530684
--------------------------------------------------


In [100]:
grid_para_tree = [{
    "min_samples_leaf": range(3, 10),
    "min_samples_split": np.linspace(start=5, stop=20, num=5, dtype=int),
    "n_estimators": [300],
    "max_features":[30,35,40,45,50,55,60] #'auto', 'sqrt', 'log2', 5, 10, ,70,100,150
}]

grid_search_tree = GridSearchCV(
    model, grid_para_tree, cv=5, n_jobs=-1)

%time grid_search_tree.fit(x_train, y_train)
print(grid_search_tree.score(x_train, y_train))
print(grid_search_tree.score(x_test, y_test))
grid_search_tree.best_params_
#grid_search_tree.cv_results_['mean_test_score']

CPU times: user 4.5 s, sys: 538 ms, total: 5.04 s
Wall time: 3min 6s
0.9658030781619812
0.9165626755107035


{'max_features': 35,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 300}

In [None]:
0.9660492504801086
0.9188001671125462