# Preprocssing/Feature Engineering and Modeling

### Problem Statement: What features of a low-assessed real estate property are the most valuable for improving sale price?

## Preprocessing and Feature Engineering

In [2445]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols
import warnings

In [2446]:
pd.set_option('max_columns', None) # i want to see all columns printed out

In [2447]:
ames = pd.read_csv('../my_datasets/data_cleaning_eda_data/main_ames.csv').drop(columns = 'Unnamed: 0')
ames_test = pd.read_csv('../datasets/test.csv')
ames_test.columns = ames_test.columns.str.lower().str.replace(' ', '_') # making columns lowercase, replacing spaces with underscores

In [2448]:
def kaggle_submission(preds, fn=''): # function for making kaggle predictions
    ''' function to make csv for kaggle with passed predictions '''
    ames_test['saleprice'] = preds
    ames_test[['id', 'saleprice']].to_csv(f'kaggle_sub_{fn}.csv', index=False)

def model_metrics(m, x, y):
    ''' function for printing out model metrics with passed in model, x, y vars '''
    m_preds = m.predict(x)
    print(f'''
    R-squared = {r2_score(y, m_preds)}
    MAE = {mean_absolute_error(y, m_preds)}
    RMSE = {mean_squared_error(y, m_preds, squared = False)},
    MSE = {mean_squared_error(y, m_preds, squared = True)}
    ''')
    
def model_metrics_gs(y, preds): # 
    '''function for printing out model metrics specifically for gridsearch objects'''
    print(f'''
    R-squared = {r2_score(y, preds)}
    MAE = {mean_absolute_error(y, preds)}
    RMSE = {mean_squared_error(y, preds, squared = False)},
    MSE = {mean_squared_error(y, preds, squared = True)}
    ''')

In [2449]:
# these are the columns i am interested in
alt_cols = ['utilities', 'house_style', 'year_remod/add', 'roof_style', 'roof_matl', 'exterior_1st', 'exterior_2nd', 
            'mas_vnr_type', 'exter_qual', 'exter_cond', 'bsmt_qual', 'heating', 'heating_qc', 'central_air', 'electrical', 
           'garage_area', 'garage_qual',  'paved_drive', 'open_porch_sf', 'pool_area', 'pool_qc', 'fence']

In [2450]:
num_cols = [i[0] for i in ames.dtypes.iteritems() if (i[1] != object)] # numeric columns

In [2451]:
ames[alt_cols].head(3)

Unnamed: 0,utilities,house_style,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,exter_qual,exter_cond,bsmt_qual,heating,heating_qc,central_air,electrical,garage_area,garage_qual,paved_drive,open_porch_sf,pool_area,pool_qc,fence
0,AllPub,2Story,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,Gd,TA,TA,GasA,Ex,Y,SBrkr,475,TA,Y,44.0,0.0,,
1,AllPub,2Story,1997,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,Gd,GasA,Ex,Y,SBrkr,559,TA,Y,74.0,0.0,,
2,AllPub,1Story,2007,Gable,CompShg,VinylSd,VinylSd,,TA,Gd,TA,GasA,TA,Y,SBrkr,246,TA,Y,52.0,0.0,,


In [2452]:
ames.isnull().sum()[ames.isnull().sum() > 0] # looks like columns that should actually have NA values (representing that the house is missing that feature)

alley             1911
mas_vnr_type        22
bsmt_qual           55
bsmt_cond           55
bsmt_exposure       58
bsmtfin_type_1      55
bsmtfin_type_2      56
fireplace_qu      1000
garage_type        113
garage_finish      114
garage_qual        114
garage_cond        114
pool_qc           2042
fence             1651
misc_feature      1986
dtype: int64

#### Converting ordinal columns to numerical values for regression

In [2453]:
to_ordinal_cols = ['bsmt_qual', 'exter_qual', 'exter_cond', 'heating_qc', 'kitchen_qual', 
                   'garage_qual', 'pool_qc', 'fence']

In [2454]:
ames[to_ordinal_cols] = ames[to_ordinal_cols].fillna('NA') # fill nas with 'NA'
ames_test[to_ordinal_cols] = ames_test[to_ordinal_cols].fillna('NA') # fill nas with 'NA'

In [2455]:
for col in to_ordinal_cols: # make new ordinal cols for columns
    new_ord_col = ames[col].copy(deep=True)
    new_ord_col = new_ord_col.replace({'Gd': 4, 'TA': 3, 'Ex': 5, 'Fa': 2, 'Po': 1, 'NA': 0, 'MnPrv': 3, 'GdPrv': 4, 'GdWo': 2, 'MnWw': 1})
    ames[col + '_ordinal'] = new_ord_col
    
for col in to_ordinal_cols: # make new ordinal cols for test dataset columns
    new_ord_col = ames_test[col].copy(deep=True)
    new_ord_col = new_ord_col.replace({'Gd': 4, 'TA': 3, 'Ex': 5, 'Fa': 2, 'Po': 1, 'NA': 0, 'MnPrv': 3, 'GdPrv': 4, 'GdWo': 2, 'MnWw': 1})
    ames_test[col + '_ordinal'] = new_ord_col

In [2456]:
ordinal_cols = [i for i in ames.columns if 'ordinal' in i]

ordinal_cols = [i for i in ames_test.columns if 'ordinal' in i ]
ames_test[ordinal_cols].head(3)

Unnamed: 0,bsmt_qual_ordinal,exter_qual_ordinal,exter_cond_ordinal,heating_qc_ordinal,kitchen_qual_ordinal,garage_qual_ordinal,pool_qc_ordinal,fence_ordinal
0,2,3,2,4,2,1,0,0
1,4,3,3,3,3,3,0,0
2,4,4,3,5,4,3,0,0


#### Making 'neighborhood' an ordinal value based on saleprice

In [2457]:
gb_hood_price = ames.groupby('neighborhood')['saleprice'].mean().sort_values(ascending = True) # groupby neighborhood

neighborhoods = []
rankings = []

count = 1

# assigning values to each neighborhood based on mean sale price
# higher numbers = higher mean sale price in neighborhood
for hood, _ in gb_hood_price.iteritems():
    neighborhoods.append(hood)
    rankings.append(count)
    count += 1

d = {'neighborhood': neighborhoods, 'neighborhood_ordinal': rankings}
ames = pd.merge(ames, pd.DataFrame(data=d), how='left', on = 'neighborhood')
ames_test = pd.merge(ames_test, pd.DataFrame(data=d), how='left', on = 'neighborhood')

In [2458]:
# plt.figure(figsize = (20,16))
# sns.heatmap(ames[num_cols_for_model].corr(), vmin = -1, annot = True, cmap='coolwarm');

#### Adding columns - rates, differences

In [2459]:
# want to see the poly feature x^2 for this
ames["year_remod/add - year_built"] = ames['year_remod/add'] - ames['year_built']
ames['pool_area/lot_area'] = ames['pool_area']/ames['lot_area']

ames_test["year_remod/add - year_built"] = ames_test['year_remod/add'] - ames_test['year_built']
ames_test['pool_area/lot_area'] = ames_test['pool_area']/ames_test['lot_area']

In [2460]:
ames[['year_remod/add - year_built', 'pool_area/lot_area', 'pool_area']].head(3)

Unnamed: 0,year_remod/add - year_built,pool_area/lot_area,pool_area
0,29,0.0,0.0
1,1,0.0,0.0
2,54,0.0,0.0


#### Changing ms_subclass dtype

In [2461]:
ames['ms_subclass'] = ames['ms_subclass'].astype(object)
ames_test['ms_subclass'] = ames_test['ms_subclass'].astype(object)

----

## Modeling

### Baseline

In [2466]:
num_cols_for_model = list(set(['overall_qual', 'garage_area', 'gr_liv_area', 'year_built', '2nd_flr_sf', 'full_bath']))
X = ames[num_cols_for_model] # picking predictors
y = ames['saleprice']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
baseline_preds = np.full_like(y_val, np.mean(y_train))
baseline_preds[:5]

array([181061, 181061, 181061, 181061, 181061])

In [2467]:
r2_score(y_val, baseline_preds) # baseline is terrible

-0.0004332658021117819

In [2468]:
 print(f'''
    R-squared = {r2_score(y_val, baseline_preds)}
    MAE = {mean_absolute_error(y_val, baseline_preds)}
    RMSE = {mean_squared_error(y_val, baseline_preds, squared = False)},
    MSE = {mean_squared_error(y_val, baseline_preds, squared = True)}
    ''')


   R-squared = -0.0004332658021117819
   MAE = 59199.95516569201
   RMSE = 78375.28304925401,
   MSE = 6142684993.050682
   


The baseline model is terrible. Based on the R-squared Score, estimating every value to the mean of the training sale price does not explain any of the variation in the test sale price.

### MLR Model 1 - Numeric and Ordinal Data

In [2469]:
num_cols_for_model = list(set(['overall_qual', 'garage_area', 'gr_liv_area', 'year_built', '2nd_flr_sf', 'full_bath']))
ordinal_cols = [i for i in ames.columns if 'ordinal' in i ]
alt_cols_sans_ords = [col for col, dt in ames.dtypes.iteritems() if (dt == object and col in alt_cols)]
alt_cols_sans_ords = list(set(alt_cols_sans_ords).difference(to_ordinal_cols).difference(num_cols_for_model))

In [2470]:
cols_for_X = num_cols_for_model+ordinal_cols
X = ames[cols_for_X] # picking predictors
y = ames['saleprice']

In [2471]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [2472]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1538, 15), (513, 15), (1538,), (513,))

In [2473]:
ss = StandardScaler() # used to scale all data

In [2474]:
X_train_ss = ss.fit_transform(X_train) # fit and transform to X_train
X_train_ss = pd.DataFrame(X_train_ss, columns=X_train.columns)

X_val_ss = ss.transform(X_val) # transform
X_val_ss = pd.DataFrame(X_val_ss, columns=X_val.columns)

In [2478]:
model = LinearRegression() # linear model
model.fit(X_train_ss, y_train)

LinearRegression()

In [2476]:
model.score(X_train_ss, y_train), model.score(X_val_ss, y_val) # decent starting point

(0.8173147937556553, 0.8484470495445733)

Right now, this model explains 81.7% and 84.8% of variance in sale price for the training and validation sets, respectively. Good start.

In [2477]:
list(zip(model.feature_names_in_, model.coef_))

[('2nd_flr_sf', -9770.718846095371),
 ('garage_area', 10031.215361923576),
 ('year_built', -1670.5949778382019),
 ('gr_liv_area', 34356.611281187514),
 ('overall_qual', 16384.946790229216),
 ('full_bath', -3777.0091780652547),
 ('bsmt_qual_ordinal', 6695.64287837259),
 ('exter_qual_ordinal', 7626.116190361337),
 ('exter_cond_ordinal', 1534.4232899331782),
 ('heating_qc_ordinal', 1742.6655089545075),
 ('kitchen_qual_ordinal', 7706.212732658927),
 ('garage_qual_ordinal', -3078.3849019361523),
 ('pool_qc_ordinal', -5579.625623465449),
 ('fence_ordinal', 1931.493010320526),
 ('neighborhood_ordinal', 17847.79952666668)]

Older homes, increases in 2nd floor square footage, improvements in garage quality, and improvements in pool quality result in decreases in sale price. Older homes correlating with decreases sale price makes sense and pools are not always an enticing feature due to upkeep. 2nd floor sq footage might decrease saleprice, since it accounts for both finished and unfinished house floors. I am confused on why garage quality improvements correlate with lower sale prices.

#### Metrics

In [734]:
model_metrics(model, X_train_ss, y_train), model_metrics(model, X_val_ss, y_val)


    R-squared = 0.813730983939358
    MAE = 22414.141754629494
    RMSE = 34322.91012852713,
    MSE = 1178062159.6909502
    

    R-squared = 0.8405841337278586
    MAE = 22774.009449294666
    RMSE = 31286.056979346038,
    MSE = 978817361.3148869
    


(None, None)

#### Assessing my chosen predictors - RFE, VIF

In [738]:
rfe = RFE(estimator = LinearRegression()) # RFE

X_train_half = rfe.fit_transform(X_train_ss, y_train) 
list(rfe.get_feature_names_out())

['2nd_flr_sf',
 'garage_area',
 'gr_liv_area',
 'overall_qual',
 'bsmt_qual_ordinal',
 'exter_qual_ordinal',
 'neighborhood_ordinal']

In [737]:
# code from https://www.geeksforgeeks.org/detecting-multicollinearity-with-vif-python/

list(zip(X.columns, [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]))

[('2nd_flr_sf', 3.1609777983504053),
 ('garage_area', 13.888489703253358),
 ('year_built', 131.4348611280781),
 ('gr_liv_area', 33.34520375356545),
 ('overall_qual', 67.36135610471815),
 ('full_bath', 17.302699097384156),
 ('bsmt_qual_ordinal', 31.141121552690194),
 ('exter_qual_ordinal', 95.28659697775892),
 ('exter_cond_ordinal', 70.68938338678862),
 ('heating_qc_ordinal', 29.143105170155646),
 ('garage_qual_ordinal', 25.537288995474704),
 ('pool_qc_ordinal', 1.069543412985405),
 ('fence_ordinal', 1.35597096954762),
 ('neighborhood_ordinal', 10.741365176919734)]

In [749]:
rfe_factors = list(rfe.get_feature_names_out())

pool_qc_ordinal, fence_ordinal, 2nd_flr_sf have good VIFs

worst: year_built, exter_qual_ordinal, overall_qual, bsmt_qual_ordinal, exter_cond_ordinal

### MLR Model 2 - Numeric, Ordinal, and Nominal

In [2495]:
num_cols_for_model = list(set(['overall_qual', 'garage_area', 'gr_liv_area', 'year_built', '2nd_flr_sf', 'full_bath']))
ordinal_cols = [i for i in ames.columns if 'ordinal' in i ]

In [2496]:
alt_cols_sans_ords = [col for col, dt in ames.dtypes.iteritems() if (dt == object and col in alt_cols)]
alt_cols_sans_ords = list(set(alt_cols_sans_ords).difference(to_ordinal_cols).difference(num_cols_for_model))
# alt_cols, ordinal_cols # ordinal_cols = originally ordinal columns + neighborhood

In [2497]:
# cols_for_X = list(set(num_cols_for_model+alt_cols_sans_ords+ordinal_cols))
cols_for_X = num_cols_for_model+ordinal_cols+alt_cols_sans_ords
for bad_col in ['exter_qual_ordinal', 'overall_qual', 'bsmt_qual_ordinal', 'exter_cond_ordinal']:
    cols_for_X.remove(bad_col)

X = ames[cols_for_X] # picking predictors
y = ames['saleprice']

In [2498]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)

In [2499]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1538, 22), (513, 22), (1538,), (513,))

In [2500]:
ohe = OneHotEncoder(drop = 'first', handle_unknown = 'ignore', sparse = False) # one hot encoder for nominal data

In [2501]:
smart_encoder = make_column_transformer(
    (ohe, alt_cols_sans_ords),
    remainder = 'passthrough', # don't drop the other columns
    verbose_feature_names_out=False
)

In [2502]:
X_train_enc = smart_encoder.fit_transform(X_train)

In [2503]:
X_train_enc = pd.DataFrame(X_train_enc, columns=smart_encoder.get_feature_names_out())

In [2504]:
warnings.filterwarnings("ignore")
X_val_enc = pd.DataFrame(smart_encoder.transform(X_val), columns=smart_encoder.get_feature_names_out())
warnings.filterwarnings('default')

In [2505]:
X_train_enc.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stone,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,mas_vnr_type_nan,2nd_flr_sf,garage_area,year_built,gr_liv_area,full_bath,heating_qc_ordinal,kitchen_qual_ordinal,garage_qual_ordinal,pool_qc_ordinal,fence_ordinal,neighborhood_ordinal
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,525.0,1971.0,1587.0,2.0,3.0,3.0,3.0,0.0,0.0,12.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1005.0,498.0,1937.0,2267.0,1.0,3.0,3.0,3.0,0.0,0.0,9.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,720.0,1950.0,864.0,1.0,4.0,4.0,3.0,0.0,0.0,12.0


In [2506]:
X_val_enc.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stone,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,mas_vnr_type_nan,2nd_flr_sf,garage_area,year_built,gr_liv_area,full_bath,heating_qc_ordinal,kitchen_qual_ordinal,garage_qual_ordinal,pool_qc_ordinal,fence_ordinal,neighborhood_ordinal
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,576.0,1976.0,954.0,1.0,4.0,3.0,3.0,0.0,0.0,19.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,941.0,487.0,1966.0,2157.0,2.0,5.0,4.0,3.0,0.0,4.0,12.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1140.0,656.0,2007.0,2088.0,2.0,5.0,4.0,3.0,0.0,0.0,27.0


In [2507]:
ss = StandardScaler() # used to scale data

In [2508]:
X_train_enc_ss = ss.fit_transform(X_train_enc)
X_train_enc_ss = pd.DataFrame(X_train_enc_ss, columns=X_train_enc.columns)

X_val_enc_ss = ss.transform(X_val_enc)
X_val_enc_ss = pd.DataFrame(X_val_enc_ss, columns=X_train_enc.columns)

In [2509]:
X_train_enc_ss.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stone,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,mas_vnr_type_nan,2nd_flr_sf,garage_area,year_built,gr_liv_area,full_bath,heating_qc_ordinal,kitchen_qual_ordinal,garage_qual_ordinal,pool_qc_ordinal,fence_ordinal,neighborhood_ordinal
0,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,0.964228,-0.044209,-0.080898,-0.647162,-0.161299,-0.206668,0.511556,-0.076722,-0.486142,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,2.302885,-0.282942,-0.036084,-0.114783,-0.741892,-0.404214,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,2.325582,-0.31271,-0.044209,-0.123213,-0.741892,-0.388841,-0.175588,1.489152,-1.208636,-0.288268,-0.108821,-0.774714,0.240259,-0.028539,0.176756,0.773523,-1.169085,-0.771101,0.275119,-0.065351,-0.489054,-0.279594
1,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,-1.037099,-0.044209,-0.080898,1.545207,-0.161299,-0.206668,0.511556,-0.076722,-0.486142,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,-0.434238,-0.282942,-0.036084,-0.114783,-0.741892,2.473938,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,-0.43,-0.31271,-0.044209,-0.123213,-0.741892,2.571743,-0.175588,-0.671523,0.827379,-0.288268,-0.108821,1.585964,0.116516,-1.15571,1.517269,-1.034498,-1.169085,-0.771101,0.275119,-0.065351,-0.489054,-0.686293
2,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,0.964228,-0.044209,-0.080898,-0.647162,-0.161299,-0.206668,0.511556,-0.076722,-0.486142,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,-0.434238,-0.282942,-0.036084,-0.114783,1.347906,-0.404214,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,-0.43,-0.31271,-0.044209,-0.123213,1.347906,-0.388841,-0.175588,-0.671523,0.827379,-0.288268,-0.108821,-0.774714,1.133954,-0.724733,-1.248526,-1.034498,-0.142799,0.728208,0.275119,-0.065351,-0.489054,-0.279594


In [2510]:
X_val_enc_ss.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,house_style_1.5Unf,house_style_1Story,house_style_2.5Fin,house_style_2.5Unf,house_style_2Story,house_style_SFoyer,house_style_SLvl,roof_style_Gable,roof_style_Gambrel,roof_style_Hip,roof_style_Mansard,roof_style_Shed,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,exterior_2nd_AsphShn,exterior_2nd_Brk Cmn,exterior_2nd_BrkFace,exterior_2nd_CBlock,exterior_2nd_CmentBd,exterior_2nd_HdBoard,exterior_2nd_ImStucc,exterior_2nd_MetalSd,exterior_2nd_Plywood,exterior_2nd_Stone,exterior_2nd_Stucco,exterior_2nd_VinylSd,exterior_2nd_Wd Sdng,exterior_2nd_Wd Shng,mas_vnr_type_BrkFace,mas_vnr_type_None,mas_vnr_type_Stone,mas_vnr_type_nan,2nd_flr_sf,garage_area,year_built,gr_liv_area,full_bath,heating_qc_ordinal,kitchen_qual_ordinal,garage_qual_ordinal,pool_qc_ordinal,fence_ordinal,neighborhood_ordinal
0,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,-1.037099,-0.044209,-0.080898,-0.647162,-0.161299,4.838667,0.511556,-0.076722,-0.486142,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,2.302885,-0.282942,-0.036084,-0.114783,-0.741892,-0.404214,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,2.325582,-0.31271,-0.044209,-0.123213,-0.741892,-0.388841,-0.175588,-0.671523,0.827379,-0.288268,-0.108821,-0.774714,0.473994,0.137221,-1.071105,-1.034498,-0.142799,-0.771101,0.275119,-0.065351,-0.489054,0.66937
1,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,-1.037099,-0.044209,-0.080898,1.545207,-0.161299,-0.206668,0.511556,-0.076722,-0.486142,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,2.302885,-0.282942,-0.036084,-0.114783,-0.741892,-0.404214,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,2.325582,-0.31271,-0.044209,-0.123213,-0.741892,-0.388841,-0.175588,1.489152,-1.208636,-0.288268,-0.108821,1.435632,0.066103,-0.1943,1.300422,0.773523,0.883487,0.728208,0.275119,-0.065351,2.728765,-0.279594
2,0.117657,-0.025507,-0.088677,-0.051064,-0.044209,-0.07231,-1.037099,-0.044209,-0.080898,1.545207,-0.161299,-0.206668,-1.954819,-0.076722,2.057011,-0.051064,-0.036084,-0.141046,0.325084,-0.099242,-0.051064,-0.036084,-0.036084,0.27207,-0.025507,-0.036084,-0.159164,-0.036084,-0.216727,-0.415037,-0.025507,-0.434238,-0.282942,-0.036084,-0.114783,1.347906,-0.404214,-0.145768,-0.128544,-0.067618,-0.025507,0.317696,-0.036084,-0.095845,-0.117657,-0.036084,-0.216727,-0.392159,-0.076722,-0.43,-0.31271,-0.044209,-0.123213,1.347906,-0.388841,-0.175588,-0.671523,-1.208636,3.468991,-0.108821,1.90307,0.840638,1.164935,1.164399,0.773523,0.883487,0.728208,0.275119,-0.065351,-0.489054,1.7539


In [2511]:
model = LinearRegression() # linear model
model.fit(X_train_enc_ss, y_train)

LinearRegression()

In [2512]:
model.score(X_train_enc_ss, y_train), model.score(X_val_enc_ss, y_val) #r2 scores worse than previous model

(0.8227516519218281, 0.8252813393815894)

In [2513]:
list(zip(model.feature_names_in_, model.coef_))[:5]

[('roof_matl_CompShg', 44672.664459859225),
 ('roof_matl_Membran', 11720.323224374304),
 ('roof_matl_Tar&Grv', 34389.04874209942),
 ('roof_matl_WdShake', 20164.48175457588),
 ('roof_matl_WdShngl', 21447.22292271319)]

In [2514]:
model_metrics(model, X_train_enc_ss, y_train), model_metrics(model, X_val_enc_ss, y_val);


    R-squared = 0.8227516519218281
    MAE = 22891.502756081987
    RMSE = 33481.49881019501,
    MSE = 1121010762.5770895
    

    R-squared = 0.8252813393815894
    MAE = 24641.037284717215
    RMSE = 32753.273073753127,
    MSE = 1072776897.0438416
    


This model is performing worse than the previous model, which indicates I need to do more tweaking. Lower variablity explained in R-squared, higher error metrics.

#### Assessing my chosen predictors - RFE, VIF

In [2515]:
rfe = RFE(estimator = LinearRegression()) # RFE

X_train_half = rfe.fit_transform(X_train_enc_ss, y_train) 
list(rfe.get_feature_names_out())

['roof_matl_CompShg',
 'roof_matl_Membran',
 'roof_matl_Tar&Grv',
 'roof_matl_WdShake',
 'roof_matl_WdShngl',
 'house_style_1.5Unf',
 'house_style_2Story',
 'house_style_SFoyer',
 'roof_style_Hip',
 'roof_style_Mansard',
 'exterior_1st_BrkComm',
 'exterior_1st_CBlock',
 'exterior_1st_CemntBd',
 'exterior_1st_Plywood',
 'exterior_1st_Wd Sdng',
 'exterior_2nd_Brk Cmn',
 'exterior_2nd_CmentBd',
 'exterior_2nd_Plywood',
 'exterior_2nd_Stone',
 'exterior_2nd_VinylSd',
 'exterior_2nd_Wd Sdng',
 'mas_vnr_type_BrkFace',
 'mas_vnr_type_None',
 'mas_vnr_type_Stone',
 'mas_vnr_type_nan',
 '2nd_flr_sf',
 'garage_area',
 'year_built',
 'gr_liv_area',
 'full_bath',
 'heating_qc_ordinal',
 'kitchen_qual_ordinal',
 'garage_qual_ordinal',
 'pool_qc_ordinal',
 'neighborhood_ordinal']

In [2516]:
# code from https://www.geeksforgeeks.org/detecting-multicollinearity-with-vif-python/

list(zip(X_train_enc_ss.columns, [variance_inflation_factor(X_train_enc_ss.values, i) for i in range(len(X_train_enc_ss.columns))]))

[('roof_matl_CompShg', 30.814847205209265),
 ('roof_matl_Membran', 2.822374594813686),
 ('roof_matl_Tar&Grv', 20.222578552225123),
 ('roof_matl_WdShake', 7.6207736887803605),
 ('roof_matl_WdShngl', 5.853808355857167),
 ('house_style_1.5Unf', 1.1647635828062197),
 ('house_style_1Story', 7.1530895170263395),
 ('house_style_2.5Fin', 1.3183295717005032),
 ('house_style_2.5Unf', 1.2233030806154142),
 ('house_style_2Story', 4.389087827046676),
 ('house_style_SFoyer', 1.6876971025678658),
 ('house_style_SLvl', 1.8215971199181884),
 ('roof_style_Gable', 71.74641663387432),
 ('roof_style_Gambrel', 3.736506646127248),
 ('roof_style_Hip', 68.62147804610335),
 ('roof_style_Mansard', 2.5934325801509983),
 ('roof_style_Shed', 1.8189341386435425),
 ('paved_drive_P', 1.4115423283598947),
 ('paved_drive_Y', 2.011744630010204),
 ('heating_GasW', 1.176376618505743),
 ('heating_Grav', 1.1117150206275521),
 ('heating_OthW', 1.1406514275850608),
 ('heating_Wall', 1.2281551333461198),
 ('central_air_Y', 1.71

Assessing the RFE and the VIF results, the best cols seem to be:

house_style, paved_drive, electrical

exterior_1st, exterior_2nd (maybe?) - ehhh, some categories have high VIFs

worst based off VIF: roof_style

### MLR Model 3, GridsearchCV with Ridge/Lasso - Numeric, Ordinal, and Nominal with Poly Features (Final)

#### MLR Model 3

In [2522]:
num_cols_for_model = list(set(['garage_area', 'gr_liv_area', 'year_built', '1st_flr_sf', 'open_porch_sf', 
                               'enclosed_porch', '3ssn_porch', 'screen_porch', 'full_bath', 'bsmtfin_sf_1', 
                               'fireplaces', 'overall_qual', 'year_remod/add - year_built']))

ordinal_cols_for_model = list(set([i for i in ames.columns if 'ordinal' in i ]))

nominal_cols_for_model = ['roof_matl', 'ms_subclass', 'heating', 'central_air', 'exterior_1st', 'electrical', 'paved_drive']
nominal_cols_for_model = list(set(nominal_cols_for_model)) + ['condition_1', 'condition_2']
interactions_to_examine = ['year_remod/add - year_built', '1st_flr_sf', 'open_porch_sf', 'enclosed_porch',
                           '3ssn_porch', 'screen_porch', 'neighborhood_ordinal', 'pool_qc_ordinal']

In [2523]:
# cols_for_X = list(set(num_cols_for_model+alt_cols_sans_ords+ordinal_cols))
# cols_for_X = num_cols_for_model+ordinal_cols+alt_cols_sans_ords

cols_for_X = num_cols_for_model + ordinal_cols_for_model + nominal_cols_for_model

bad_cols = [ 'roof_style', 'exterior_2nd', 'house_style', 
            'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch']

for bad_col in bad_cols:
    if bad_col in cols_for_X:
        cols_for_X.remove(bad_col)
    if bad_col in nominal_cols_for_model:
        nominal_cols_for_model.remove(bad_col)
    if bad_col in num_cols_for_model:
        num_cols_for_model.remove(bad_col)
    if bad_col in ordinal_cols_for_model:
        ordinal_cols_for_model.remove(bad_col)
    if bad_col in interactions_to_examine:
        interactions_to_examine.remove(bad_col)    

X = ames[cols_for_X] # picking predictors
y = ames['saleprice']

In [2524]:
cols_for_X

['fireplaces',
 'overall_qual',
 'full_bath',
 'year_remod/add - year_built',
 'garage_area',
 'year_built',
 '1st_flr_sf',
 'gr_liv_area',
 'bsmtfin_sf_1',
 'neighborhood_ordinal',
 'exter_cond_ordinal',
 'garage_qual_ordinal',
 'exter_qual_ordinal',
 'kitchen_qual_ordinal',
 'bsmt_qual_ordinal',
 'pool_qc_ordinal',
 'fence_ordinal',
 'heating_qc_ordinal',
 'roof_matl',
 'paved_drive',
 'heating',
 'central_air',
 'ms_subclass',
 'exterior_1st',
 'electrical',
 'condition_1',
 'condition_2']

In [2525]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42, train_size=.85) # tts

In [2526]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1743, 27), (308, 27), (1743,), (308,))

In [2527]:
pf = PolynomialFeatures(include_bias=True, interaction_only=False)

In [2528]:
smart_poly = make_column_transformer(
    (pf, interactions_to_examine),
    remainder = 'passthrough',
    verbose_feature_names_out = False
)

In [2529]:
X_train_pf = smart_poly.fit_transform(X_train)
X_val_pf = smart_poly.transform(X_val)

X_train_pf = pd.DataFrame(X_train_pf, columns = smart_poly.get_feature_names_out())
X_val_pf = pd.DataFrame(X_val_pf, columns = smart_poly.get_feature_names_out())

In [2530]:
list(set(X_train_pf.columns).difference(set(X_train.columns)))

['1st_flr_sf^2',
 'pool_qc_ordinal^2',
 'year_remod/add - year_built pool_qc_ordinal',
 '1st_flr_sf pool_qc_ordinal',
 'neighborhood_ordinal pool_qc_ordinal',
 'year_remod/add - year_built neighborhood_ordinal',
 'neighborhood_ordinal^2',
 'year_remod/add - year_built^2',
 '1',
 '1st_flr_sf neighborhood_ordinal',
 'year_remod/add - year_built 1st_flr_sf']

In [2531]:
ohe = OneHotEncoder(drop = 'first', handle_unknown = 'ignore', sparse = False) # encoding nominal variables

In [2532]:
smart_encoder = make_column_transformer(
    (ohe, nominal_cols_for_model),
    remainder = 'passthrough', # don't drop the other columns
    verbose_feature_names_out=False
)

In [2533]:
X_train_pf_enc = smart_encoder.fit_transform(X_train_pf)

In [2534]:
X_train_pf_enc = pd.DataFrame(X_train_pf_enc, columns=smart_encoder.get_feature_names_out())

In [2535]:
X_val_pf_enc = pd.DataFrame(smart_encoder.transform(X_val_pf), columns=smart_encoder.get_feature_names_out())

In [2536]:
X_train_pf_enc.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,ms_subclass_30,ms_subclass_40,ms_subclass_45,ms_subclass_50,ms_subclass_60,ms_subclass_70,ms_subclass_75,ms_subclass_80,ms_subclass_85,ms_subclass_90,ms_subclass_120,ms_subclass_150,ms_subclass_160,ms_subclass_180,ms_subclass_190,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,condition_2_PosN,condition_2_RRAn,condition_2_RRNn,1,year_remod/add - year_built,1st_flr_sf,neighborhood_ordinal,pool_qc_ordinal,year_remod/add - year_built^2,year_remod/add - year_built 1st_flr_sf,year_remod/add - year_built neighborhood_ordinal,year_remod/add - year_built pool_qc_ordinal,1st_flr_sf^2,1st_flr_sf neighborhood_ordinal,1st_flr_sf pool_qc_ordinal,neighborhood_ordinal^2,neighborhood_ordinal pool_qc_ordinal,pool_qc_ordinal^2,fireplaces,overall_qual,full_bath,garage_area,year_built,gr_liv_area,bsmtfin_sf_1,exter_cond_ordinal,garage_qual_ordinal,exter_qual_ordinal,kitchen_qual_ordinal,bsmt_qual_ordinal,fence_ordinal,heating_qc_ordinal
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1398.0,14.0,0.0,1.0,1398.0,14.0,0.0,1954404.0,19572.0,0.0,196.0,0.0,0.0,0.0,6,2.0,542,1990,1398,904,3,3,4,4,4,3,4
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,916.0,16.0,0.0,0.0,0.0,0.0,0.0,839056.0,14656.0,0.0,256.0,0.0,0.0,1.0,6,2.0,386,1996,1636,138,3,3,3,3,4,0,4
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,936.0,12.0,0.0,0.0,0.0,0.0,0.0,876096.0,11232.0,0.0,144.0,0.0,0.0,1.0,5,1.0,288,1962,936,734,3,3,3,3,3,0,3


In [2537]:
ss = StandardScaler()
ss_cols = num_cols_for_model+ordinal_cols_for_model+list(set(X_train_pf.columns).difference(set(X_train.columns)))

In [2538]:
ss.fit(X_train_pf_enc[ss_cols])
X_train_pf_enc_ss = X_train_pf_enc.copy(deep=True)
X_train_pf_enc_ss[ss_cols] = ss.transform(X_train_pf_enc[ss_cols])

X_val_pf_enc_ss = X_val_pf_enc.copy(deep=True)
X_val_pf_enc_ss[ss_cols] = ss.transform(X_val_pf_enc[ss_cols])

In [2539]:
X_train_pf_enc_ss.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,paved_drive_P,paved_drive_Y,heating_GasW,heating_Grav,heating_OthW,heating_Wall,central_air_Y,ms_subclass_30,ms_subclass_40,ms_subclass_45,ms_subclass_50,ms_subclass_60,ms_subclass_70,ms_subclass_75,ms_subclass_80,ms_subclass_85,ms_subclass_90,ms_subclass_120,ms_subclass_150,ms_subclass_160,ms_subclass_180,ms_subclass_190,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,condition_2_PosN,condition_2_RRAn,condition_2_RRNn,1,year_remod/add - year_built,1st_flr_sf,neighborhood_ordinal,pool_qc_ordinal,year_remod/add - year_built^2,year_remod/add - year_built 1st_flr_sf,year_remod/add - year_built neighborhood_ordinal,year_remod/add - year_built pool_qc_ordinal,1st_flr_sf^2,1st_flr_sf neighborhood_ordinal,1st_flr_sf pool_qc_ordinal,neighborhood_ordinal^2,neighborhood_ordinal pool_qc_ordinal,pool_qc_ordinal^2,fireplaces,overall_qual,full_bath,garage_area,year_built,gr_liv_area,bsmtfin_sf_1,exter_cond_ordinal,garage_qual_ordinal,exter_qual_ordinal,kitchen_qual_ordinal,bsmt_qual_ordinal,fence_ordinal,heating_qc_ordinal
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.48509,0.578409,-0.009882,-0.069408,-0.371817,-0.458138,-0.390526,-0.023959,0.331391,0.144156,-0.059853,-0.258438,-0.062739,-0.064296,-0.921975,-0.086671,0.755067,0.306912,0.603998,-0.207956,0.989031,-0.232538,0.272019,1.017405,0.718273,0.562995,1.937083,-0.14795
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.527423,-0.621914,0.259251,-0.069408,-0.372339,-0.515176,-0.453819,-0.023959,-0.519835,-0.233836,-0.059853,0.012248,-0.062739,-0.064296,0.642779,-0.086671,0.755067,-0.406589,0.802279,0.263074,-0.658136,-0.232538,0.272019,-0.69266,-0.777485,0.562995,-0.484966,-0.14795
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.527423,-0.572108,-0.279016,-0.069408,-0.372339,-0.515176,-0.453819,-0.023959,-0.491566,-0.497108,-0.059853,-0.493033,-0.062739,-0.064296,0.642779,-0.789309,-1.047784,-0.854815,-0.32131,-1.122308,0.623471,-0.232538,0.272019,-0.69266,-0.777485,-0.567536,-0.484966,-1.179459


In [2540]:
model = LinearRegression()
model.fit(X_train_pf_enc_ss, y_train)

LinearRegression()

In [2541]:
model.score(X_train_pf_enc_ss, y_train), model.score(X_val_pf_enc_ss, y_val) #r2 better than other two models

(0.8997779598523226, 0.9103185114786305)

In [2542]:
model.intercept_

-198947.51627501927

In [2543]:
X_train['heating'].value_counts()

GasA    1717
GasW      16
Grav       5
Wall       3
OthW       2
Name: heating, dtype: int64

In [2544]:
list(zip(model.feature_names_in_, model.coef_))

[('roof_matl_CompShg', 365906.9811343587),
 ('roof_matl_Membran', 396109.62959623797),
 ('roof_matl_Tar&Grv', 359105.01879519573),
 ('roof_matl_WdShake', 344924.61155800225),
 ('roof_matl_WdShngl', 406367.9066530341),
 ('paved_drive_P', 5211.09295231229),
 ('paved_drive_Y', 4139.815297270368),
 ('heating_GasW', 1519.5341470369312),
 ('heating_Grav', -18214.105563044577),
 ('heating_OthW', -56885.37781801576),
 ('heating_Wall', 3612.1548715576046),
 ('central_air_Y', 1932.3451750470158),
 ('ms_subclass_30', 5186.057823917552),
 ('ms_subclass_40', -4058.921332038513),
 ('ms_subclass_45', 9644.808835979096),
 ('ms_subclass_50', -5715.436634941107),
 ('ms_subclass_60', -727.4574448290077),
 ('ms_subclass_70', -4391.048178964673),
 ('ms_subclass_75', 1067.2969533474043),
 ('ms_subclass_80', -1169.839845260531),
 ('ms_subclass_85', 3318.078461652316),
 ('ms_subclass_90', -10684.106332290874),
 ('ms_subclass_120', -24355.07863850054),
 ('ms_subclass_150', -50540.95960825795),
 ('ms_subclass_1

In [2545]:
model_metrics(model, X_train_pf_enc_ss, y_train), model_metrics(model, X_val_pf_enc_ss, y_val);


    R-squared = 0.8997779598523226
    MAE = 17091.356867643277
    RMSE = 25154.426077321627,
    MSE = 632745151.2794383
    

    R-squared = 0.9103185114786305
    MAE = 17266.134090462554
    RMSE = 23340.084121497584,
    MSE = 544759526.7985836
    


Best model yet. Around 90-91% of sale price variation explained through model. Lower metrics indicate my average residuals have improved as well.

##### Assessing my chosen predictors - RFE

In [2546]:
rfe = RFE(estimator = LinearRegression()) # RFE

X_train_half = rfe.fit_transform(X_train_pf_enc_ss, y_train) 
# set(['_'.join(i.split('_')[:-1]) for i in list(rfe.get_feature_names_out())])
list(rfe.get_feature_names_out())

['roof_matl_CompShg',
 'roof_matl_Membran',
 'roof_matl_Tar&Grv',
 'roof_matl_WdShake',
 'roof_matl_WdShngl',
 'heating_Grav',
 'heating_OthW',
 'ms_subclass_30',
 'ms_subclass_45',
 'ms_subclass_85',
 'ms_subclass_90',
 'ms_subclass_120',
 'ms_subclass_150',
 'ms_subclass_160',
 'ms_subclass_180',
 'ms_subclass_190',
 'exterior_1st_AsphShn',
 'exterior_1st_BrkComm',
 'exterior_1st_BrkFace',
 'exterior_1st_CemntBd',
 'exterior_1st_ImStucc',
 'exterior_1st_Stone',
 'electrical_Mix',
 'condition_1_Norm',
 'condition_1_PosA',
 'condition_1_PosN',
 'condition_1_RRAn',
 'condition_1_RRNe',
 'condition_1_RRNn',
 'condition_2_PosA',
 'condition_2_PosN',
 'condition_2_RRAn',
 'condition_2_RRNn',
 'neighborhood_ordinal',
 '1st_flr_sf^2',
 '1st_flr_sf neighborhood_ordinal',
 'neighborhood_ordinal^2',
 'neighborhood_ordinal pool_qc_ordinal',
 'pool_qc_ordinal^2',
 'overall_qual',
 'year_built',
 'gr_liv_area',
 'bsmtfin_sf_1',
 'kitchen_qual_ordinal']

#### GridSearchCV with Ridge/Lasso Regression

In [2557]:
# creating a list of alphas to check.
alpha_list = [50, 150] 
num = 2
while (num < 150):
    alpha_list.append(num)
    num = num+5
    
print(alpha_list)
    
ridge_parameters = {
    'alpha': alpha_list
}
lasso_parameters = {
    'alpha': alpha_list
}
gs_ridge = GridSearchCV(Ridge(), param_grid = ridge_parameters, cv = 5, verbose = 1)
gs_lasso = GridSearchCV(Lasso(), param_grid = lasso_parameters, cv = 5, verbose = 1)

[50, 150, 2, 7, 12, 17, 22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92, 97, 102, 107, 112, 117, 122, 127, 132, 137, 142, 147]


In [2558]:
warnings.filterwarnings("ignore")
gs_ridge.fit(X_train_pf_enc_ss, y_train) # using X_train from previous model
gs_lasso.fit(X_train_pf_enc_ss, y_train) # using X_train from previous model
warnings.filterwarnings('default')

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [2559]:
gs_ridge.best_score_, gs_ridge.best_params_, model_metrics_gs(y_val, gs_ridge.predict(X_val_pf_enc_ss))


    R-squared = 0.9108550369609337
    MAE = 17207.02146906805
    RMSE = 23270.162582620145,
    MSE = 541500466.6215746
    


(0.8645828592320004, {'alpha': 2}, None)

In [2561]:
model_metrics_gs(y_train, gs_ridge.predict(X_train_pf_enc_ss)) # different r2 score...


    R-squared = 0.8980976712236541
    MAE = 17298.949516306853
    RMSE = 25364.414860590554,
    MSE = 643353541.220147
    


In [2550]:
gs_lasso.best_score_, gs_lasso.best_params_, model_metrics_gs(y_val, gs_lasso.predict(X_val_pf_enc_ss))


    R-squared = 0.9110011345433869
    MAE = 17066.471108041154
    RMSE = 23251.086302529384,
    MSE = 540613014.2476696
    


(0.868363751876514, {'alpha': 42}, None)

In [2562]:
model_metrics_gs(y_train, gs_lasso.predict(X_train_pf_enc_ss)) # different r2 score...


    R-squared = 0.896316722597457
    MAE = 17478.841080197704
    RMSE = 25585.10194091056,
    MSE = 654597441.3267853
    


In [2552]:
list(zip(gs_lasso.feature_names_in_, gs_lasso.best_estimator_.coef_))[:5]

[('roof_matl_CompShg', 0.0),
 ('roof_matl_Membran', 0.0),
 ('roof_matl_Tar&Grv', -0.0),
 ('roof_matl_WdShake', -7364.02684694561),
 ('roof_matl_WdShngl', 28869.072810397927)]

In [2555]:
gs_ridge.best_estimator_.intercept_, gs_lasso.best_estimator_.intercept_

(176657.20266364867, 177393.8881152109)

In [2553]:
gs_ridge.best_params_, gs_lasso.best_params_

({'alpha': 2}, {'alpha': 42})

### Modeling Results/Conclusions

|       | R^2 (train) | R^2 (test) | RMSE (train) | RMSE (test) |
|-------|-------------|------------|--------------|-------------|
| MLR 1 | 0.813       | 0.841      | 34322.91     | 31286.06    |
| MLR 2 | 0.823       | 0.825      | 33481.49     | 32753.27    |
| MLR 3 | 0.899       | 0.910      | 25154.43     | 23340.08    |
| Ridge | 0.898       | 0.911      | 25364.41     | 23270.16    |
| Lasso | 0.896       | 0.911      | 25585.10     | 23251.09    |

My final multiple linear regression model and the optimized ridge and lasso models performed similarly, ranging around 90-91% for my R^2 scores. The root mean squared error metrics, which estimate how off my predictions were on average, were also fairly similar.

#### Coefficients

In [2572]:
for i in list(zip(model.feature_names_in_, model.coef_)):
     if 'heating' in i[0]:
        print(f'{i[0]}: {i[1]}')

heating_GasW: 1519.5341470369312
heating_Grav: -18214.105563044577
heating_OthW: -56885.37781801576
heating_Wall: 3612.1548715576046
heating_qc_ordinal: 1422.530524500893


- Heating system and heating quality were two separate predictors included in my model. The former represented the type of heating available in the property, and the latter is the quality of the heating system as deemed by the assessors. 
- Every 1 standard deviation increase in quality improves sale price by 1422 dollars
- The type of heating system also plays a part in sale price. Compared to a gas forced warm air furnace, gas forced hot water/steam heat system improves sale price by 1519 dollars, a wall furnace increases by 3612 dollars, whereas hot water/steam heat from a source other than gas decreases sale price by 56885 dollars




In [2573]:
for i in list(zip(model.feature_names_in_, model.coef_)):
    if 'electrical' in i[0]:
        print(f'{i[0]}: {i[1]}')

electrical_FuseF: 4168.116796567703
electrical_FuseP: 7308.694862200362
electrical_Mix: -32846.33730820223
electrical_SBrkr: -422.50169283815194


- Standard Circuit Breaker systems are very much prevalent throughout all properties in Ames. 
- Despite the model assigning a coefficient of -422 dollars, it is the preferred system to have.
- The main concern is for houses with an electrical system assessed as mixed, which is associated with a 32846 dollar decrease in sale price. Mixed systems are very uncommon, but should definitely be upgraded.


In [2574]:
for i in list(zip(model.feature_names_in_, model.coef_)):
    if 'gr_liv_area' in i[0]:
        print(f'{i[0]}: {i[1]}')

gr_liv_area: 26526.743059383618


- Area a strong predictor for sale price, and Ground living area specifically is one of the stronger predictors
- Based on it’s coefficient, and holding all other variables constant, an increase in area by one standard deviation would increase sale price by 26526 dollars. But, this would be a cost-intensive option. 


In [2576]:
for i in list(zip(model.feature_names_in_, model.coef_)):
    if 'condition_1' in i[0]:
        print(f'{i[0]}: {i[1]}')

condition_1_Feedr: 2672.9355403883383
condition_1_Norm: 9625.121389109168
condition_1_PosA: 17185.03114898981
condition_1_PosN: 24392.65687340092
condition_1_RRAe: -1028.94671792376
condition_1_RRAn: 6950.144890937301
condition_1_RRNe: 9448.521892311677
condition_1_RRNn: -11308.57472398736


- Property conditions represent various features within the proximity of the property, such as parks, or certain roads.

- Based on the model, and in comparison to proximity to an artery street, proximity to a feeder street increase sales by 2672 dollars, homes with no prominent features perform 9625 dollars better, and adjacency to positive features, like parks and greenbelts, increase price by 17185 dollars.

- This example is to clarify that the changes you may add to your property can only increase sale price by so much if it is located near an artery street, for example

#### Conclusions

- In conclusion, if you would like to increase the potential sale price for your property, consider factors like your heating system, electrical system, and ground/garage living area
- Quality assessments by the Ames Assessor’s office play a big role in the prediction of sale price. If time permits, request reassessment for changes on your property during the annual assessment period.
- And lastly, accept that certain features about your property may limit its potential, like its proximity to certain features. Despite this, there are likely accessible changes to be made.
- Attempts to identify interaction terms (like between neighborhood and certain property features) should be added to improve model and results

----

## Kaggle Submission

In [2007]:
num_cols_for_model = list(set(['garage_area', 'gr_liv_area', 'year_built', '1st_flr_sf', 'open_porch_sf', 
                               'enclosed_porch', '3ssn_porch', 'screen_porch', 'full_bath', 'bsmtfin_sf_1', 
                               'fireplaces', 'overall_qual', 'year_remod/add - year_built']))

ordinal_cols_for_model = list(set([i for i in ames.columns if 'ordinal' in i ]))

nominal_cols_for_model = ['roof_matl', 'ms_subclass', 'exterior_1st', 'electrical', 'paved_drive']
nominal_cols_for_model = list(set(nominal_cols_for_model))
interactions_to_examine = ['year_remod/add - year_built', '1st_flr_sf', 'open_porch_sf', 'enclosed_porch',
                           '3ssn_porch', 'screen_porch', 'neighborhood_ordinal', 'pool_qc_ordinal']



# cols_for_X = list(set(num_cols_for_model+alt_cols_sans_ords+ordinal_cols))
# cols_for_X = num_cols_for_model+ordinal_cols+alt_cols_sans_ords

cols_for_X = num_cols_for_model + ordinal_cols_for_model + nominal_cols_for_model

bad_cols = [ 'roof_style', 'heating', 'central_air', 'exterior_2nd', 'house_style', 
            'open_porch_sf', 'enclosed_porch', '3ssn_porch', 'screen_porch']

for bad_col in bad_cols:
    if bad_col in cols_for_X:
        cols_for_X.remove(bad_col)
    if bad_col in nominal_cols_for_model:
        nominal_cols_for_model.remove(bad_col)
    if bad_col in num_cols_for_model:
        num_cols_for_model.remove(bad_col)
    if bad_col in ordinal_cols_for_model:
        ordinal_cols_for_model.remove(bad_col)
    if bad_col in interactions_to_examine:
        interactions_to_examine.remove(bad_col)    

k_at = ames_test[cols_for_X] # picking predictors

k_at_pf = smart_poly.transform(k_at)

k_at_pf = pd.DataFrame(k_at_pf, columns = smart_poly.get_feature_names_out())

k_at_pf_enc = smart_encoder.transform(k_at_pf)

k_at_pf_enc = pd.DataFrame(k_at_pf_enc, columns=smart_encoder.get_feature_names_out())

k_at_pf_enc_ss = ss.transform(k_at_pf_enc)
k_at_pf_enc_ss = pd.DataFrame(k_at_pf_enc_ss, columns=k_at_pf_enc.columns)




In [2010]:
k_at_pf_enc_ss.head(3)

Unnamed: 0,roof_matl_CompShg,roof_matl_Membran,roof_matl_Tar&Grv,roof_matl_WdShake,roof_matl_WdShngl,paved_drive_P,paved_drive_Y,ms_subclass_30,ms_subclass_40,ms_subclass_45,ms_subclass_50,ms_subclass_60,ms_subclass_70,ms_subclass_75,ms_subclass_80,ms_subclass_85,ms_subclass_90,ms_subclass_120,ms_subclass_150,ms_subclass_160,ms_subclass_180,ms_subclass_190,exterior_1st_AsphShn,exterior_1st_BrkComm,exterior_1st_BrkFace,exterior_1st_CBlock,exterior_1st_CemntBd,exterior_1st_HdBoard,exterior_1st_ImStucc,exterior_1st_MetalSd,exterior_1st_Plywood,exterior_1st_Stone,exterior_1st_Stucco,exterior_1st_VinylSd,exterior_1st_Wd Sdng,exterior_1st_WdShing,electrical_FuseF,electrical_FuseP,electrical_Mix,electrical_SBrkr,1,year_remod/add - year_built,1st_flr_sf,neighborhood_ordinal,pool_qc_ordinal,year_remod/add - year_built^2,year_remod/add - year_built 1st_flr_sf,year_remod/add - year_built neighborhood_ordinal,year_remod/add - year_built pool_qc_ordinal,1st_flr_sf^2,1st_flr_sf neighborhood_ordinal,1st_flr_sf pool_qc_ordinal,neighborhood_ordinal^2,neighborhood_ordinal pool_qc_ordinal,pool_qc_ordinal^2,fireplaces,overall_qual,full_bath,garage_area,year_built,gr_liv_area,bsmtfin_sf_1,exter_cond_ordinal,garage_qual_ordinal,exter_qual_ordinal,bsmt_qual_ordinal,fence_ordinal,heating_qc_ordinal
0,0.123056,-0.023959,-0.093169,-0.04796,-0.053636,-0.14315,0.320104,-0.223607,-0.04796,-0.072044,-0.327673,-0.488678,-0.214985,-0.086686,-0.207576,-0.120631,-0.182195,-0.26578,-0.023959,-0.210566,-0.067904,6.142746,-0.023959,-0.033893,-0.17536,-0.033893,-0.214985,-0.407291,-0.023959,-0.437479,-0.287417,-0.033893,-0.113063,-0.743029,-0.398635,-0.151286,-0.134564,15.748016,-0.023959,-3.156305,0.0,1.165887,-0.641837,-1.35555,-0.069408,0.46292,0.966673,0.269532,-0.023959,-0.530971,-1.081474,-0.059853,-1.070497,-0.062739,-0.064296,-0.921975,-0.086671,0.755067,-0.159608,-2.039739,0.840976,-0.954884,-2.916733,-2.500664,-0.69266,-1.698066,-0.484966,-0.14795
1,0.123056,-0.023959,-0.093169,-0.04796,-0.053636,-0.14315,0.320104,-0.223607,-0.04796,-0.072044,-0.327673,-0.488678,-0.214985,-0.086686,-0.207576,-0.120631,5.488625,-0.26578,-0.023959,-0.210566,-0.067904,-0.162794,-0.023959,-0.033893,-0.17536,-0.033893,-0.214985,-0.407291,-0.023959,-0.437479,3.479262,-0.033893,-0.113063,-0.743029,-0.398635,-0.151286,-0.134564,-0.0635,-0.023959,0.316826,0.0,-0.527423,1.995388,-0.682716,-0.069408,-0.372339,-0.515176,-0.453819,-0.023959,1.792666,0.000448,-0.059853,-0.777254,-0.062739,-0.064296,-0.921975,-0.789309,0.755067,0.480714,0.174391,0.918161,-0.954884,-0.232538,0.272019,-0.69266,0.562995,-0.484966,-1.179459
2,0.123056,-0.023959,-0.093169,-0.04796,-0.053636,-0.14315,0.320104,-0.223607,-0.04796,-0.072044,-0.327673,2.046338,-0.214985,-0.086686,-0.207576,-0.120631,-0.182195,-0.26578,-0.023959,-0.210566,-0.067904,-0.162794,-0.023959,-0.033893,-0.17536,-0.033893,-0.214985,-0.407291,-0.023959,-0.437479,-0.287417,-0.033893,-0.113063,1.345842,-0.398635,-0.151286,-0.134564,-0.0635,-0.023959,0.316826,0.0,-0.527423,-1.249469,0.259251,-0.069408,-0.372339,-0.515176,-0.453819,-0.023959,-0.823707,-0.543857,-0.059853,0.012248,-0.062739,-0.064296,0.642779,0.615968,0.755067,-0.22364,1.132746,-0.014003,0.236409,-0.232538,0.272019,1.017405,0.562995,-0.484966,0.883558


In [2026]:
ames['neighborhood_ordinal'].max(), ames_test['neighborhood_ordinal'].max()

(28, 28)

In [2034]:
kaggle_submission(gs_lasso.predict(k_at_pf_enc_ss), fn = "0417_lasso")