## Model Tuning

In [146]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
train = pd.read_csv('../datasets/train_cleaned.csv', keep_default_na=False, na_values='')
test = pd.read_csv('../datasets/test_cleaned.csv', keep_default_na=False, na_values='')

### Ridge Model

In [84]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                       self.feature_names_in_)

numeric = ['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'garage_area']
numeric_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('ss', StandardScaler())
    ])

categorical = [
    'overall_qual', 'overall_cond', 'roof_matl', 'mas_vnr_type', 
    'foundation', 'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 
    'bsmtfin_type_2', 'heating_qc', 'central_air', 'electrical',
    'bsmt_full_bath', 'full_bath', 'kitchen_qual', 'totrms_abvgrd',
    'fireplaces', 'fireplace_qu', 'garage_type', 'garage_finish', 
    'garage_cars', 'garage_qual', 'paved_drive'
]
categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False))
    ])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric),
        ('cat', categorical_transformer, categorical)
    ], remainder='passthrough', verbose_feature_names_out=False)

ridge_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', RidgeCV())
    ])

In [85]:
X = train[['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', 
    '1st_flr_sf', 'gr_liv_area', 'garage_area', 'overall_qual', 
    'overall_cond', 'roof_matl', 'mas_vnr_type', 'foundation', 
    'exter_qual', 'exter_cond', 'bsmt_qual','bsmt_cond', 
    'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 
    'heating_qc', 'central_air', 'electrical','bsmt_full_bath', 
    'full_bath', 'kitchen_qual', 'totrms_abvgrd', 'fireplaces', 
    'fireplace_qu', 'garage_type', 'garage_finish', 'garage_cars', 
    'garage_qual', 'paved_drive']]
y = train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [86]:
ridge_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['mas_vnr_area',
                                                   'bsmtfin_sf_1',
                                                   'total_bsmt_sf',
                                                   '1st_flr_sf', 'gr_liv_area',
                                                   'garage_area']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                         

In [103]:
ridge_pipe.score(X_train, y_train), ridge_pipe.score(X_test, y_test)



(0.9178700362072286, 0.9004000793875788)

These r2 scores are much better than the linear regression model, and much closer together.

In [104]:
mean_squared_error(y_train, ridge_pipe.predict(X_train))**0.5

20793.622830997287

In [105]:
mean_squared_error(y_test, ridge_pipe.predict(X_test))**0.5



24607.075784686036

The MSE's here is similar to the linear regression model - it's doing much better than the baseline.

In [None]:
ridge_pipe.named_steps['preprocessor'].get_feature_names_out()

In [106]:
model_coefs = pd.Series(ridge_pipe.named_steps['classifier'].coef_, 
        index=ridge_pipe.named_steps['preprocessor'].get_feature_names_out())

In [107]:
model_coefs.sort_values()

roof_matl_WdShake   -15006.653597
overall_qual_4      -14763.098253
overall_qual_3      -13025.221719
overall_cond_2      -12667.290331
totrms_abvgrd_12    -12284.645018
                         ...     
exter_qual_5         26633.580037
overall_cond_8       28205.541539
overall_qual_9       45815.740605
roof_matl_WdShngl    49643.544760
overall_qual_10      72487.860762
Length: 128, dtype: float64

**This ridge model is an improvement. Since there are numerous features, I decided to do the lasso to compare.**

In [132]:
test_id = test['id']

ridge_preds = ridge_pipe.predict(test[X.columns])

ridge_preds = pd.DataFrame(ridge_preds, columns=['SalePrice'])

ridge_preds.insert(loc=0, column='Id', value=test_id)



In [134]:
ridge_preds.to_csv('../submissions/ridge.csv', index=False)

---

### Lasso Model

In [207]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                       self.feature_names_in_)

numeric = ['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'garage_area']
numeric_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('ss', StandardScaler())
    ])

categorical = [
    'overall_qual', 'overall_cond', 'roof_matl', 'mas_vnr_type', 
    'foundation', 'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 
    'bsmtfin_type_2', 'heating_qc', 'central_air', 'electrical',
    'bsmt_full_bath', 'full_bath', 'kitchen_qual', 'totrms_abvgrd',
    'fireplaces', 'fireplace_qu', 'garage_type', 'garage_finish', 
    'garage_cars', 'garage_qual', 'paved_drive'
]
categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False))
    ])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric),
        ('cat', categorical_transformer, categorical)
    ], remainder='passthrough', verbose_feature_names_out=False)

lasso_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', LassoCV())
    ])

In [208]:
X = train[['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', 
    '1st_flr_sf', 'gr_liv_area', 'garage_area', 'overall_qual', 
    'overall_cond', 'roof_matl', 'mas_vnr_type', 'foundation', 
    'exter_qual', 'exter_cond', 'bsmt_qual','bsmt_cond', 
    'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 
    'heating_qc', 'central_air', 'electrical','bsmt_full_bath', 
    'full_bath', 'kitchen_qual', 'totrms_abvgrd', 'fireplaces', 
    'fireplace_qu', 'garage_type', 'garage_finish', 'garage_cars', 
    'garage_qual', 'paved_drive']]
y = train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# test predictions
lasso_pipe.predict(test[X.columns])

In [209]:
lasso_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['mas_vnr_area',
                                                   'bsmtfin_sf_1',
                                                   'total_bsmt_sf',
                                                   '1st_flr_sf', 'gr_liv_area',
                                                   'garage_area']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                         

In [210]:
lasso_pipe.score(X_train, y_train), lasso_pipe.score(X_test, y_test)



(0.9150387095355722, 0.902490526449802)

In [211]:
mean_squared_error(y_train, lasso_pipe.predict(X_train))**0.5

21149.00289241998

In [212]:
mean_squared_error(y_test, lasso_pipe.predict(X_test))**0.5



24347.4743240593

In [None]:
lasso_pipe.named_steps['preprocessor'].get_feature_names_out()

In [213]:
model_coefs = pd.Series(lasso_pipe.named_steps['classifier'].coef_, 
        index=lasso_pipe.named_steps['preprocessor'].get_feature_names_out())

In [214]:
model_coefs[model_coefs != 0].sort_values()

overall_cond_2      -12929.894114
overall_cond_3      -11993.864034
bsmt_cond_2.0        -8516.231447
overall_qual_4       -7639.261346
overall_cond_4       -6127.787860
                         ...     
gr_liv_area          22120.671408
overall_qual_8       24705.529883
roof_matl_WdShngl    35417.521047
overall_qual_9       53680.716768
overall_qual_10      84908.616006
Length: 76, dtype: float64

In [155]:
test_id = test['id']

lasso_preds = lasso_pipe.predict(test[X.columns])

lasso_preds = pd.DataFrame(lasso_preds, columns=['SalePrice'])

lasso_preds.insert(loc=0, column='Id', value=test_id)

In [156]:
lasso_preds.to_csv('../submissions/lasso_2.csv', index=False)

In [263]:
np.arange(0.001, 10.0, 0.05).shape

(200,)

In [250]:
lasso_pipe_params = {
    'classifier__alphas': np.arange(0.001, 10.0, 0.05)
}

In [251]:
gs = GridSearchCV(
    lasso_pipe,
    lasso_pipe_params,
    cv = 5,
    verbose=2,
    n_jobs=-1,
    error_score='raise'
)

In [269]:
gs.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


AxisError: axis -1 is out of bounds for array of dimension 0