# 02_FeatureEngineering

Handle missing values, encode categorical variables, and prepare features for modeling.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the raw training data once; df_original stays as an untouched snapshot
# while df is the working copy that later cells modify.
TRAIN_CSV = '../data/train.csv'
df_original = pd.read_csv(TRAIN_CSV)
df = df_original.copy()
print(df.shape)

In [None]:
## 1) Target and ID
# Remove the 'Id' column when present (errors='ignore' makes this a no-op otherwise).
df = df.drop(columns='Id', errors='ignore')
# Add the log1p-transformed sale price, which is used as the modelling target.
df = df.assign(SalePrice_log=np.log1p(df['SalePrice']))

# Features are every column except the raw and the log-transformed target.
target_cols = ['SalePrice', 'SalePrice_log']
y = df['SalePrice_log']
X = df.drop(columns=target_cols)
X.shape

In [None]:
## 2) Simple missing value strategy for demonstration
# Split the feature columns by dtype: numeric vs. object (string-like).
numeric_part = X.select_dtypes(include=[np.number])
object_part = X.select_dtypes(include=['object'])
num_cols = numeric_part.columns.tolist()
cat_cols = object_part.columns.tolist()

# Numeric: every missing value becomes 0 (no per-feature or median imputation here).
X[num_cols] = numeric_part.fillna(0)
# Categorical: missing values get the explicit placeholder level 'Missing'.
X[cat_cols] = object_part.fillna('Missing')

# Total count of remaining missing cells across the whole feature frame.
print('Missing after fill:', X.isna().sum().sum())

In [None]:
## 3) Encode categorical features with OrdinalEncoder for tree-based model
from sklearn.preprocessing import OrdinalEncoder

# Simple ordinal (integer-code) encoding is shown here; for large projects
# prefer OneHotEncoder or Target Encoding.
# Categories not seen during fit are encoded as -1 at transform time.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_cat = X.loc[:, cat_cols].copy()
enc.fit(X_cat)
X_cat_enc = pd.DataFrame(enc.transform(X_cat), index=X.index, columns=cat_cols)

# Put both halves on a fresh 0..n-1 index so they line up row by row,
# numeric columns first, encoded categoricals after.
X_num = X.loc[:, num_cols].reset_index(drop=True)
encoded_part = X_cat_enc.reset_index(drop=True)
X_prepared = pd.concat([X_num, encoded_part], axis=1)

X_prepared.shape

In [None]:
## 4) Handle skew for numeric features (log1p where needed)
from scipy.stats import skew

# Skewness is measured on X_num, the numeric block that this cell never modifies.
feature_skew = X_num.apply(lambda col: skew(col.dropna())).sort_values(ascending=False)

# Keep features whose absolute skewness exceeds 0.75 ...
is_skewed = feature_skew.abs() > 0.75
# ... and whose values all lie in log1p's domain (> -1); anything at or below -1
# would turn into NaN / -inf, so such columns are left untransformed.
in_log1p_domain = X_num.min().reindex(feature_skew.index) > -1
skewed_feats = feature_skew[is_skewed & in_log1p_domain].index

# Always transform from X_num rather than from X_prepared itself: this makes the
# cell idempotent, so re-running it cannot apply log1p a second time.
for feat in skewed_feats:
    X_prepared[feat] = np.log1p(X_num[feat])

print('Applied log1p to', len(skewed_feats), 'features')

In [None]:
## 5) Save prepared dataset for modeling
# Attach the log-scale target as a plain array (positional, no index alignment)
# and write features + target to a single CSV without the index column.
prepared_path = '../data/processed_train_prepared.csv'
X_prepared['SalePrice_log'] = y.to_numpy()
X_prepared.to_csv(prepared_path, index=False)
print('Saved processed file:', prepared_path)

**Notes:**

- This notebook shows a simple pipeline. For production, prefer scikit-learn Pipelines with ColumnTransformer.
- Consider different imputations per feature group (mode for categorical, median for numeric), and create new features (total area, age).