## Preprocessing and Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Only empty cells are parsed as missing; pandas' default NA markers (such as
# the literal string "NA") are kept as ordinary values.
read_opts = {'keep_default_na': False, 'na_values': ''}
train = pd.read_csv('../datasets/train_cleaned.csv', **read_opts)
test = pd.read_csv('../datasets/test_cleaned.csv', **read_opts)

In [4]:
# Number of missing values per column, largest first, showing only the
# columns that actually contain missing values.
missing_counts = train.isna().sum().sort_values(ascending=False)
missing_counts[missing_counts > 0]

lot_frontage      330
garage_yr_blt     114
mas_vnr_type       22
mas_vnr_area       22
bsmt_exposure       4
bsmtfin_type_2      2
bsmt_full_bath      2
bsmt_half_bath      2
garage_cond         1
bsmt_qual           1
garage_finish       1
garage_cars         1
garage_area         1
garage_qual         1
total_bsmt_sf       1
bsmtfin_type_1      1
bsmt_cond           1
bsmtfin_sf_1        1
bsmtfin_sf_2        1
bsmt_unf_sf         1
dtype: int64

In [5]:
# Features for the first model: six square-footage / area columns.
baseline_features = [
    'mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf',
    '1st_flr_sf', 'gr_liv_area', 'garage_area',
]
X = train[baseline_features]
y = train['saleprice']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# (rows, columns) of the training split, the held-out split and the test file.
tuple(frame.shape for frame in (X_train, X_test, test))

((1628, 6), (408, 6), (878, 80))

In [7]:
# Fill missing values with each column's most frequent value. The imputer is
# fitted on the training split only and then applied to the held-out split.
si = SimpleImputer(strategy='most_frequent')

# SimpleImputer returns a bare array, so rebuild DataFrames that keep both the
# column names and the original row index; without the index the filled frames
# would no longer line up with y_train / y_test in any pandas operation.
X_train_filled = pd.DataFrame(
    si.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_filled = pd.DataFrame(
    si.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [8]:
# Ordinary least squares on the imputed features; fit() returns the estimator,
# so the bare name on the last line displays it.
lr = LinearRegression().fit(X_train_filled, y_train)
lr

LinearRegression()

In [9]:
# R^2 on the training split and on the held-out split.
lr_train_r2 = lr.score(X_train_filled, y_train)
lr_test_r2 = lr.score(X_test_filled, y_test)
lr_train_r2, lr_test_r2

(0.7491278104750866, 0.7710962807576048)

In [10]:
# RMSE (square root of the mean squared error) on the training split and on
# the held-out split.
lr_train_rmse = mean_squared_error(y_train, lr.predict(X_train_filled))**0.5
lr_test_rmse = mean_squared_error(y_test, lr.predict(X_test_filled))**0.5
lr_train_rmse, lr_test_rmse

(36341.730762700405, 37304.13166141556)

In [11]:
# Mean cross-validated R^2 on the training split.
lr_cv_r2 = cross_val_score(lr, X_train_filled, y_train, scoring='r2')
lr_cv_r2.mean()

0.7408818048031779

**The baseline RMSE is below:**
- This is the error of a naive model that predicts the mean training sale price for every house.

In [12]:
# Naive baseline: predict the mean training sale price for every row.
# np.full is used rather than np.full_like because full_like inherits
# y_train's dtype, which would silently truncate the mean to a whole number
# if y_train is integer-typed.
baseline = np.full(len(y_train), y_train.mean())
mean_squared_error(y_train, baseline)**0.5

72557.00481044303

In [13]:
# Pair each feature name with its fitted coefficient, one row per feature.
pd.DataFrame([(feature, weight) for feature, weight in zip(X_train.columns, lr.coef_)])

Unnamed: 0,0,1
0,mas_vnr_area,49.438406
1,bsmtfin_sf_1,25.748452
2,total_bsmt_sf,47.131638
3,1st_flr_sf,-5.384033
4,gr_liv_area,71.565796
5,garage_area,82.753464


The model above uses only the columns that were already numeric - I wanted to see how the model performs as is, before one-hot encoding the categorical columns. It already does better than the baseline: its training RMSE (about 36,300) is roughly half of the baseline RMSE (about 72,600).

---

### Linear Regression Model

In [14]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

# Compatibility shim: give SimpleImputer a get_feature_names_out method so the
# feature names can be read back from the fitted preprocessor. It is installed
# only when the installed scikit-learn does not already provide the method, so
# a native implementation is never overwritten.
if not hasattr(SimpleImputer, 'get_feature_names_out'):
    SimpleImputer.get_feature_names_out = (lambda self, names=None:
                                           self.feature_names_in_)

# Numeric columns: fill missing values with the column mean, then standardize.
numeric = ['mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'garage_area']
numeric_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('ss', StandardScaler())
    ])

# Columns that are one-hot encoded. Note that this includes count- and
# rating-style columns (e.g. fireplaces, garage_cars, overall_qual), which are
# therefore treated as unordered categories rather than as numbers.
categorical = [
    'overall_qual', 'overall_cond', 'roof_matl', 'mas_vnr_type', 
    'foundation', 'exter_qual', 'exter_cond', 'bsmt_qual',
    'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 
    'bsmtfin_type_2', 'heating_qc', 'central_air', 'electrical',
    'bsmt_full_bath', 'full_bath', 'kitchen_qual', 'totrms_abvgrd',
    'fireplaces', 'fireplace_qu', 'garage_type', 'garage_finish', 
    'garage_cars', 'garage_qual', 'paved_drive'
]

# The keyword that requests a dense array from OneHotEncoder was renamed from
# `sparse` to `sparse_output` in newer scikit-learn releases; pick whichever
# one the installed version accepts so the cell runs on both.
ohe_dense_kwarg = ('sparse_output'
                   if 'sparse_output' in OneHotEncoder().get_params()
                   else 'sparse')

categorical_transformer = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # drop='first' removes one dummy column per feature. With
        # handle_unknown='ignore', a category not seen during fit is encoded
        # as all zeros, i.e. exactly like the dropped first category.
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore',
                              **{ohe_dense_kwarg: False}))
    ])

# Route each column list through its own transformer; any column in neither
# list is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, numeric),
        ('cat', categorical_transformer, categorical)
    ], remainder='passthrough', verbose_feature_names_out=False)

lr_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        # The step holds a regressor despite being named 'classifier'; the
        # name is kept because the fitted model is looked up by this key.
        ('classifier', LinearRegression())
    ])

In [15]:
# Select exactly the columns the preprocessor was built for: the `numeric`
# list followed by the `categorical` list, in that order.
model_features = numeric + categorical
X = train[model_features]
y = train['saleprice']

# Same 80/20 split and seed as for the numeric-only model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# Fit the preprocessing steps and the regression on the training split only.
lr_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  ['mas_vnr_area',
                                                   'bsmtfin_sf_1',
                                                   'total_bsmt_sf',
                                                   '1st_flr_sf', 'gr_liv_area',
                                                   'garage_area']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                         

In [19]:
# R^2 of the full pipeline on the training split and on the held-out split.
pipe_train_r2 = lr_pipe.score(X_train, y_train)
pipe_test_r2 = lr_pipe.score(X_test, y_test)
pipe_train_r2, pipe_test_r2



(0.9187163577390317, -4.6252552309704925e+19)

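The gap between these R² scores shows that the model is badly overfit: it explains about 92% of the variance on the training data, but the hugely negative test score means its predictions on unseen data are far worse than simply predicting the mean.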

In [20]:
# RMSE of the pipeline's predictions on the training split.
pipe_train_pred = lr_pipe.predict(X_train)
mean_squared_error(y_train, pipe_train_pred)**0.5

20686.209771222402

In [93]:
# RMSE of the pipeline's predictions on the held-out split.
pipe_test_pred = lr_pipe.predict(X_test)
mean_squared_error(y_test, pipe_test_pred)**0.5



530271592825172.6

The training RMSE here is much lower than that of both the baseline and the numeric-only model, but the test RMSE is far too large - many orders of magnitude larger than the baseline.

In [None]:
# Names of the columns produced by the fitted preprocessing step.
lr_pipe['preprocessor'].get_feature_names_out()

In [85]:
# Label each fitted regression coefficient with the preprocessed feature it
# belongs to.
fitted_preprocessor = lr_pipe.named_steps['preprocessor']
fitted_regressor = lr_pipe.named_steps['classifier']
model_coefs = pd.Series(
    fitted_regressor.coef_,
    index=fitted_preprocessor.get_feature_names_out(),
)

In [87]:
# Coefficients from most negative to most positive.
model_coefs.sort_values(ascending=True)

fireplaces_4         -1.496641e+17
fireplaces_3         -1.496641e+17
fireplaces_1         -1.496641e+17
fireplaces_2         -1.496641e+17
bsmtfin_type_1_2.0   -1.129340e+17
                          ...     
fireplace_qu_5        1.496641e+17
garage_finish_1.0     1.560329e+17
garage_finish_2.0     1.560329e+17
garage_finish_3.0     1.560329e+17
garage_type_NA        2.009523e+17
Length: 128, dtype: float64

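#### This was my first Linear Regression model with the one-hot-encoded categorical features. Judging by the R² scores it is badly overfit - the train score is wildly higher than the test score. The coefficients are also extremely large (on the order of 1e17), and several dummy columns share coefficients of identical magnitude, which suggests that some of the one-hot-encoded columns are perfectly collinear. This is not a usable model - I will keep working to tune it.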