## Preprocessing and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('../datasets/train_cleaned.csv', keep_default_na=False, na_values='')
test = pd.read_csv('../datasets/test_cleaned.csv', keep_default_na=False, na_values='')

In [3]:
train.isna().sum().sort_values(ascending=False).loc[lambda x: x > 0]

lot_frontage      330
garage_yr_blt     114
mas_vnr_type       22
mas_vnr_area       22
bsmt_exposure       4
bsmtfin_type_2      2
bsmt_full_bath      2
bsmt_half_bath      2
garage_cond         1
bsmt_qual           1
garage_finish       1
garage_cars         1
garage_area         1
garage_qual         1
total_bsmt_sf       1
bsmtfin_type_1      1
bsmt_cond           1
bsmtfin_sf_1        1
bsmtfin_sf_2        1
bsmt_unf_sf         1
dtype: int64

In [4]:
X = train[['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'garage_area']]
y = train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
X_train.shape, X_test.shape, test.shape

((1628, 6), (408, 6), (878, 80))

In [6]:
si = SimpleImputer(strategy='most_frequent')
si.fit(X_train)
X_train_filled = si.transform(X_train)
X_train_filled = pd.DataFrame(X_train_filled, columns=X_train.columns)
X_test_filled = si.transform(X_test)
X_test_filled = pd.DataFrame(X_test_filled, columns=si.feature_names_in_)

In [7]:
lr = LinearRegression()
lr.fit(X_train_filled, y_train)

LinearRegression()

In [8]:
lr.score(X_train_filled, y_train), lr.score(X_test_filled, y_test)

(0.7491278104750866, 0.7710962807576048)

In [9]:
mean_squared_error(y_train, lr.predict(X_train_filled))**0.5, mean_squared_error(y_test, lr.predict(X_test_filled))**0.5

(36341.730762700405, 37304.13166141556)

In [10]:
cross_val_score(lr, X_train_filled, y_train, scoring='r2').mean()

0.7408818048031779

In [11]:
baseline = np.full_like(y_test, y_train.mean())
mean_squared_error(y_test, baseline)**0.5

78574.84257325473

In [12]:
pd.DataFrame(zip(X_train.columns, lr.coef_))

Unnamed: 0,0,1
0,mas_vnr_area,49.438406
1,bsmtfin_sf_1,25.748452
2,total_bsmt_sf,47.131638
3,1st_flr_sf,-5.384033
4,gr_liv_area,71.565796
5,garage_area,82.753464


The model above was just with the columns that were already numeric - I wanted to see what the model looked like as is before one-hot-encoding the categorical values. It's doing better than the baseline as of right now.

In [13]:
# can use stats models for p values
# can use regularization to check - lasso (anything that gets zeroed out for coefs doesn't really matter)

---

In [44]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                       self.feature_names_in_)

numeric = ['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'garage_area']
numeric_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('ss', StandardScaler())
    ])

categorical = [
    'overall_qual', 'overall_cond', 'roof_matl', 'mas_vnr_type', 
    'foundation', 'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 
    'bsmtfin_type_2', 'heating_qc', 'central_air', 'electrical',
    'bsmt_full_bath', 'full_bath', 'kitchen_qual', 'totrms_abvgrd',
    'fireplaces', 'fireplace_qu', 'garage_type', 'garage_finish', 
    'garage_cars', 'garage_qual', 'paved_drive'
]
categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse=False))
    ])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric),
        ('cat', categorical_transformer, categorical)
    ], remainder='passthrough', verbose_feature_names_out=False)

lr_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', LinearRegression())
    ])

In [45]:
X = train[['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', 
    '1st_flr_sf', 'gr_liv_area', 'garage_area', 'overall_qual', 
    'overall_cond', 'roof_matl', 'mas_vnr_type', 'foundation', 
    'exter_qual', 'exter_cond', 'bsmt_qual','bsmt_cond', 
    'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_type_2', 
    'heating_qc', 'central_air', 'electrical','bsmt_full_bath', 
    'full_bath', 'kitchen_qual', 'totrms_abvgrd', 'fireplaces', 
    'fireplace_qu', 'garage_type', 'garage_finish', 'garage_cars', 
    'garage_qual', 'paved_drive']]
y = train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# https://stackoverflow.com/questions/54646709/sklearn-pipeline-get-feature-names-after-onehotencode-in-columntransformer

transformed = preprocessor.fit_transform(X_train)
df_transform = pd.DataFrame(transformed, columns=lr_pipe.named_steps['preprocessor'].transformers_[0][1]\
   .named_steps['ohe'].get_feature_names_out())
df_transform.head()

KeyError: 'ohe'

In [19]:
transformed.shape

(1628, 128)

In [46]:
lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['mas_vnr_area',
                                                   'bsmtfin_sf_1',
                                                   'total_bsmt_sf',
                                                   '1st_flr_sf', 'gr_liv_area',
                                                   'garage_area']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                         

In [47]:
lr_pipe.score(X_train, y_train), lr_pipe.score(X_test, y_test)



(0.9187163577390317, -4.6252552309704925e+19)

In [48]:
lr_pipe.named_steps['preprocessor'].get_feature_names_out()

array(['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf',
       'gr_liv_area', 'garage_area', 'overall_qual_2', 'overall_qual_3',
       'overall_qual_4', 'overall_qual_5', 'overall_qual_6',
       'overall_qual_7', 'overall_qual_8', 'overall_qual_9',
       'overall_qual_10', 'overall_cond_2', 'overall_cond_3',
       'overall_cond_4', 'overall_cond_5', 'overall_cond_6',
       'overall_cond_7', 'overall_cond_8', 'overall_cond_9',
       'roof_matl_Membran', 'roof_matl_Tar&Grv', 'roof_matl_WdShake',
       'roof_matl_WdShngl', 'mas_vnr_type_BrkFace', 'mas_vnr_type_None',
       'mas_vnr_type_Stone', 'foundation_CBlock', 'foundation_PConc',
       'foundation_Slab', 'foundation_Stone', 'foundation_Wood',
       'exter_qual_3', 'exter_qual_4', 'exter_qual_5', 'exter_cond_2',
       'exter_cond_3', 'exter_cond_4', 'exter_cond_5', 'bsmt_qual_1.0',
       'bsmt_qual_2.0', 'bsmt_qual_3.0', 'bsmt_qual_4.0', 'bsmt_qual_5.0',
       'bsmt_cond_1.0', 'bsmt_cond_2.0', 'bsmt_cond_3.0

In [55]:
pd.Series(lr_pipe.named_steps['classifier'].coef_, index=lr_pipe.named_steps['preprocessor'].get_feature_names_out())

mas_vnr_area       3.886768e+03
bsmtfin_sf_1       6.235480e+03
total_bsmt_sf      7.644873e+03
1st_flr_sf         1.642373e+03
gr_liv_area        2.319474e+04
                       ...     
garage_qual_3.0    1.129755e+16
garage_qual_4.0    1.129755e+16
garage_qual_5.0    1.129755e+16
paved_drive_2     -2.489064e+03
paved_drive_3      8.267927e+03
Length: 128, dtype: float64

In [110]:
from joblib import dump, load
dump(lr_pipe, 'pipe_model.joblib')

['pipe_model.joblib']