## Preprocessing and Feature Engineering

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [17]:
# Both files are read with identical parsing rules: pandas' default NA markers
# are switched off and only empty fields are parsed as NaN.
# NOTE(review): presumably this keeps literal category codes such as 'NA'/'None'
# as strings rather than missing values -- confirm against the cleaning notebook.
csv_options = {'keep_default_na': False, 'na_values': ''}

train = pd.read_csv('../datasets/train_cleaned.csv', **csv_options)
test = pd.read_csv('../datasets/test_cleaned.csv', **csv_options)

In [18]:
# Per-column count of missing values in the training data, largest first,
# showing only the columns that actually have gaps.
na_counts = train.isna().sum().sort_values(ascending=False)
na_counts[na_counts > 0]

lot_frontage      330
garage_yr_blt     114
mas_vnr_type       22
mas_vnr_area       22
bsmt_exposure       4
bsmtfin_type_2      2
bsmt_full_bath      2
bsmt_half_bath      2
garage_cond         1
bsmt_qual           1
garage_finish       1
garage_cars         1
garage_area         1
garage_qual         1
total_bsmt_sf       1
bsmtfin_type_1      1
bsmt_cond           1
bsmtfin_sf_1        1
bsmtfin_sf_2        1
bsmt_unf_sf         1
dtype: int64

In [54]:
# Numeric area/size columns used as predictors for this model.
feature_cols = [
    'mas_vnr_area',
    'bsmtfin_sf_1',
    'total_bsmt_sf',
    '1st_flr_sf',
    'gr_liv_area',
    'garage_area',
]

X = train[feature_cols]
y = train['saleprice']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# Sanity check: (rows, columns) of the training split, the held-out split,
# and the separately loaded `test` frame.
tuple(frame.shape for frame in (X_train, X_test, test))

((1628, 6), (408, 6), (878, 80))

In [68]:
# Fill missing feature values with each column's most frequent value.
# The imputer is fit on the training split only, so the held-out split is
# transformed with statistics it did not contribute to.
si = SimpleImputer(strategy='most_frequent')
si.fit(X_train)

# The transformed output is re-wrapped in a DataFrame. Pass the original row
# index as well as the column names: without `index=` the frames would get a
# fresh 0..n-1 index and no longer line up with y_train / y_test in any
# label-based pandas operation (joins, residual calculations, etc.).
X_train_filled = pd.DataFrame(
    si.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_filled = pd.DataFrame(
    si.transform(X_test),
    columns=X_train.columns,
    index=X_test.index,
)

In [69]:
# Standardize the imputed features. The scaler learns its mean and variance
# from the training split only and then applies them to the held-out split.
ss = StandardScaler()

# Row order is unchanged by scaling, so reuse the row labels of the original
# splits: this keeps the scaled frames aligned with y_train / y_test instead
# of resetting them to a fresh 0..n-1 index.
train_scaled = ss.fit_transform(X_train_filled)
X_train_sc = pd.DataFrame(
    train_scaled,
    columns=ss.get_feature_names_out(),
    index=X_train.index,
)

test_scaled = ss.transform(X_test_filled)
X_test_sc = pd.DataFrame(
    test_scaled,
    columns=ss.get_feature_names_out(),
    index=X_test.index,
)

In [70]:
# Ordinary least squares on the standardized training features.
# fit() returns the estimator itself, so it can be bound directly and then
# displayed as the cell output.
lr = LinearRegression().fit(X_train_sc, y_train)
lr

LinearRegression()

In [71]:
# R^2 on the training split and on the held-out split, shown as (train, test).
train_r2 = lr.score(X_train_sc, y_train)
test_r2 = lr.score(X_test_sc, y_test)
train_r2, test_r2

(0.7491278104750865, 0.7710962807576048)

In [72]:
# Root-mean-squared error (square root of the MSE) for each split, shown as
# (train, test) in the same units as the target.
train_rmse = mean_squared_error(y_train, lr.predict(X_train_sc)) ** 0.5
test_rmse = mean_squared_error(y_test, lr.predict(X_test_sc)) ** 0.5
train_rmse, test_rmse

(36341.730762700405, 37304.13166141556)

In [73]:
# Mean cross-validated R^2 on the training split.
#
# The imputer and scaler are bundled with the model and the pipeline is
# cross-validated on the *raw* training features. cross_val_score re-fits the
# whole pipeline inside every fold, so each validation fold is preprocessed
# using statistics from its own training folds only. Scoring pre-imputed /
# pre-scaled data instead would let every validation fold influence the
# preprocessing it is later evaluated with, giving an optimistic estimate.
cv_model = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    StandardScaler(),
    LinearRegression(),
)
cross_val_score(cv_model, X_train, y_train, scoring='r2').mean()

0.7408818048031779

In [74]:
# Naive baseline: predict the mean training target for every held-out row,
# then report its RMSE for comparison with the model's RMSE.
#
# np.full (float by default for a float fill value) is used instead of
# np.full_like(y_test, ...): full_like inherits y_test's dtype, so if the
# target is stored as integers the mean would be silently truncated before
# being used as the prediction.
baseline = np.full(len(y_test), y_train.mean())
mean_squared_error(y_test, baseline) ** 0.5

78574.84257325473