In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
import time

In [2]:
# Let pandas show up to 500 rows before truncating displayed frames/series.
pd.options.display.max_rows = 500

In [3]:
# Load the housing-price CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('./../data/ames_housing_price_data_v4.csv')

In [4]:
def seasonal_adjust(df, mo_avg):
    """Add a seasonally adjusted sale-price column to ``df``.

    Each row's ``SalePrice`` is divided by the value that ``mo_avg`` holds for
    that row's ``MoSold``; the ratio is stored in a ``SalePrice_adj`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``SalePrice`` and ``MoSold`` columns. It is modified in
        place: ``SalePrice_adj`` is added (or overwritten).
    mo_avg : mapping or pandas.Series
        Lookup from a ``MoSold`` value to that month's average sale price;
        anything that supports ``mo_avg[month]``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a month present in ``df['MoSold']`` has no entry in ``mo_avg``.
    """
    months = df['MoSold']
    # Look every distinct month up once. Indexing mo_avg directly (rather than
    # letting .map fill NaN) keeps the KeyError for months without an average.
    factor_by_month = {month: mo_avg[month] for month in months.unique()}
    # Vectorised division on df's own index: no dependence on a default
    # 0..n-1 index, and the new column is float from the start.
    df['SalePrice_adj'] = df['SalePrice'] / months.map(factor_by_month)
    return df

In [5]:
# Mean SalePrice for each MoSold value, as a Series indexed by MoSold and named 'monthly_avg'.
mo_avg = df.groupby('MoSold')['SalePrice'].mean().rename('monthly_avg')

In [7]:
# Add the SalePrice_adj column: each SalePrice divided by the average price of its sale month.
df = seasonal_adjust(df, mo_avg)

In [8]:
# Base-10 logarithm of the seasonally adjusted price, stored as a new column.
df['SalePrice_adj_log'] = np.log10(df['SalePrice_adj'])

In [9]:
# Candidate regression targets: raw, log, seasonally adjusted, and log of the adjusted price.
price, price_log, price_adj, price_adj_log = (
    df[target_col]
    for target_col in ('SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log')
)

In [10]:
# Every column whose name contains '_log', in the frame's column order.
log_cols = [name for name in df.columns if '_log' in name]
log_cols

['SalePrice_log',
 'LotFrontage_log',
 'LotArea_log',
 '1stFlrSF_log',
 '2ndFlrSF_log',
 'GrLivArea_log',
 'SalePrice_adj_log']

In [11]:
# Feature columns excluded in this particular run.
droplist = ['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']
# Columns never used as features: PID, sold_datetime and every form of the sale-price target.
alwaysdrop = ['PID', 'SalePrice', 'SalePrice_log', 'SalePrice_adj', 'SalePrice_adj_log', 'sold_datetime']
# Feature frame with both groups of columns removed.
df2 = df.drop(columns = alwaysdrop + droplist)

In [12]:
# Columns to one-hot encode with pd.get_dummies.
# The commented-out entries (MoSold, SaleType, SaleCondition) are the ones
# currently listed in `droplist`, so they are not present in df2.
to_dummify = [
    'Street_paved',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'Neighborhood',
    'BldgType',
    'OverallQual',
    'OverallCond',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    'ExterQual',
    'ExterCond',
    'Foundation',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PavedDrive',
    'PoolQC',
    'Fence',
    'MiscFeature',
    #'MoSold',
    'HeatingQC_ord',
    'LotShape_com',
    'MSZoning_com',
    'Heating_com',
    'Electrical_com',
    'LotConfig_com',
    'BsmtCond_ord',
    'BsmtQual_ord',
    'BsmtExposure_ord',
    'GarageType_com',
    'number_floors',
    'attic',
    'PUD',
    'Functional_ord',
    'Garage_age_bin',
    'Remod_age_bin',
    #'SaleType',
    #'SaleCondition' 
]

In [13]:
# One-hot encode the columns in to_dummify; drop_first=True omits the first level of each
# so the dummy columns are not perfectly collinear.
df3 = pd.get_dummies(df2, columns = to_dummify, drop_first = True)

In [14]:
# Lasso estimator reused by both grid searches in this notebook; alpha comes from the grid.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned version, or move the scaling into a Pipeline before upgrading
# (the alpha grids would then need re-tuning).
lasso = Lasso(normalize = True, max_iter = 5000)

In [15]:
# Alpha grid searched when fitting the price target on its original scale.
params = dict(alpha = [1e-2, 1e-1, 1, 10, 100])

In [16]:
# Shuffled 10-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits = 10, shuffle = True, random_state = 1)

# Grid search over `params`; train scores are kept so they can be compared with test scores.
lasso_tuner = GridSearchCV(
    estimator = lasso,
    param_grid = params,
    cv = kfold,
    return_train_score = True,
)

In [None]:
#sample_indices = np.random.choice(df3.index, size = len(df3.index)//100, replace = False)

In [None]:
#sample_df = df3.loc[sample_indices,:]

In [None]:
#sample_price = price[sample_indices]

In [None]:
#start = time.time()
#lasso_tuner.fit(sample_df, sample_price)
#end = time.time()

In [None]:
#end-start # for 1% of dataset

In [24]:
# Run the alpha grid search with the raw SalePrice series (`price`) as the target.
# NOTE(review): the results log at the end of the notebook records this run's best score
# under "saleprice_adj", but the target passed here is `price` — confirm which was intended.
lasso_tuner.fit(df3, price)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=Lasso(max_iter=5000, normalize=True),
             param_grid={'alpha': [0.01, 0.1, 1, 10, 100]},
             return_train_score=True)

In [25]:
# Mean held-out score across the folds, one entry per alpha in the grid.
lasso_tuner.cv_results_['mean_test_score']

array([0.90741343, 0.90811607, 0.90938874, 0.91068931, 0.85936976])

In [27]:
# Mean training-fold score per alpha, for comparison with the test scores (overfitting check).
lasso_tuner.cv_results_['mean_train_score']

array([0.93768857, 0.93758827, 0.93664347, 0.92987576, 0.86943186])

In [31]:
# Best alpha, then the highest mean test score, one per line.
print(
    lasso_tuner.best_params_,
    max(lasso_tuner.cv_results_['mean_test_score']),
    sep = '\n',
)

{'alpha': 10}
0.9106893120910808


In [29]:
# Coefficients of the refit best model, labelled by feature name, largest first.
feat_imp = (
    pd.Series(data = lasso_tuner.best_estimator_.coef_, index = df3.columns)
    .sort_values(ascending = False)
)
print(len(df3.columns)) # number of features after dummification
print(int((feat_imp != 0).sum())) # number of features with nonzero coef
feat_imp[feat_imp != 0]

230
104


Neighborhood_GrnHill                                   92227.067392
OverallQual_10                                         73840.140935
PoolQC_5                                               62681.682766
OverallQual_9                                          46923.536937
RoofMatl_Wood Shingles                                 40755.943631
Neighborhood_StoneBr                                   33985.595592
Neighborhood_NoRidge                                   25542.115240
OverallQual_8                                          21124.832990
ExterQual_5                                            19501.079137
Neighborhood_NridgHt                                   18691.303847
LotArea_log                                            18043.001358
Neighborhood_Crawfor                                   16906.751343
Neighborhood_Somerst                                   16475.982515
GarageQual_5                                           16255.331491
BsmtQual_ord_5                                  

In [17]:
# Alpha grid for the log-scale targets, which need far smaller penalties.
params_log = dict(alpha = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4])

In [18]:
# Same estimator and folds as the first search, but over the log-scale alpha grid
# and with the log of the seasonally adjusted price as the target.
lasso_tuner2 = GridSearchCV(
    estimator = lasso,
    param_grid = params_log,
    cv = kfold,
    return_train_score = True,
)
lasso_tuner2.fit(df3, price_adj_log)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=Lasso(max_iter=5000, normalize=True),
             param_grid={'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001]},
             return_train_score=True)

In [19]:
# Mean held-out score across the folds, one entry per alpha in params_log.
lasso_tuner2.cv_results_['mean_test_score']

array([0.91314271, 0.91357669, 0.91476895, 0.91559885, 0.8924171 ])

In [20]:
# Mean training-fold score per alpha, for comparison with the test scores (overfitting check).
lasso_tuner2.cv_results_['mean_train_score']

array([0.93895376, 0.93891587, 0.93822477, 0.93324938, 0.90387358])

In [21]:
# Best alpha, then the highest mean test score, one per line.
print(
    lasso_tuner2.best_params_,
    max(lasso_tuner2.cv_results_['mean_test_score']),
    sep = '\n',
)

{'alpha': 1e-05}
0.9155988497416165


In [22]:
# Coefficients of the refit best log-target model, labelled by feature name, largest first.
feat_imp_log = (
    pd.Series(data = lasso_tuner2.best_estimator_.coef_, index = df3.columns)
    .sort_values(ascending = False)
)
print(len(df3.columns)) # number of features after dummification
print(int((feat_imp_log != 0).sum())) # number of features with nonzero coef
feat_imp_log[feat_imp_log != 0]

230
151


GrLivArea_log                                          0.340130
Neighborhood_GrnHill                                   0.180405
GarageQual_5                                           0.088723
1stFlrSF_log                                           0.081828
OverallQual_9                                          0.069794
LotArea_log                                            0.063894
OverallQual_10                                         0.059417
OverallCond_9                                          0.056423
Neighborhood_Crawfor                                   0.047520
PoolQC_5                                               0.046853
OverallCond_8                                          0.046055
Neighborhood_StoneBr                                   0.042516
OverallQual_8                                          0.038422
OverallCond_7                                          0.035943
Neighborhood_Somerst                                   0.035238
Neighborhood_NoRidge                    

In [23]:
# Echo the drop list used for this run — presumably so it can be copied into the results log below.
print(droplist)

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']


Drop lists vs. highest mean CV test score (each entry is: drop list: best alpha, best mean test score):

5 splits, kfold

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years']: 1e-5, 0.9209204142043642

10 splits, kfold, saleprice_log

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years']: 1e-5, 0.9226880202484138

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']: 1e-05, 0.9224768464834445

10 splits, kfold, saleprice_adj

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']: 10, 0.9106893120910808

10 splits, kfold, saleprice_adj_log

['SaleType', 'SaleCondition', 'Garage_age_years', 'Remod_age_years', 'MoSold']: 1e-05, 0.9155988497416165