In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%config InlineBackend.figure_format = 'retina'

In [None]:
# Hide the top and right axis spines on every figure drawn in this notebook.
plt.rcParams.update({"axes.spines.right": False, "axes.spines.top": False})

In [None]:
RANDOM_SEED = 666

In [None]:
# Load the training CSV (relative path) and preview its first rows.
housing_data = pd.read_csv("../../2/data/train.csv")
housing_data.head()

# Feature Engineering

## Standardization / Normalization

In [None]:
X = housing_data[["1stFlrSF"]]

In [None]:
# Histogram of the raw feature values; the trailing None suppresses the cell's repr output.
fig, ax = plt.subplots()
ax.hist(X, bins=101)
ax.set(title="1st Floor Square Footage Histogram", ylabel="Counts")
None

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# fit() returns the estimator itself, so construction and fitting can be chained.
scaler = StandardScaler().fit(X)

In [None]:
print(f"Mean = {scaler.mean_}, Variance = {scaler.var_}")

In [None]:
X_trans = scaler.transform(X)

In [None]:
# Histogram of the standardized feature values; the trailing None suppresses the cell's repr output.
fig, ax = plt.subplots()
ax.hist(X_trans, bins=101)
ax.set(title="Normalized 1st Floor Square Footage Histogram", ylabel="Counts")
None

### PowerTransformer for Gaussian-ifying Data

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
# Fit a PowerTransformer (default settings) on the same column, transform it,
# and plot the result with the same binning as the earlier histograms.
scaler = PowerTransformer().fit(X)
X_trans = scaler.transform(X)

fig, ax = plt.subplots()
ax.hist(X_trans, bins=101)
ax.set(title="Power-Transformed 1st Floor Square Footage Histogram", ylabel="Counts")
None

# String Features

## Ordinal Encoding

```
ExterCond: Evaluates the present condition of the material on the exterior

       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor
```

In [None]:
X = housing_data[["ExterCond"]]
X

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Categories are listed from worst (Poor) to best (Excellent) so that the
# integer codes produced by the encoder increase with exterior condition.
categories = ["Po", "Fa", "TA", "Gd", "Ex"]
# OrdinalEncoder expects one category list per input column, hence the outer list.
ordinal_encoder = OrdinalEncoder(categories=[categories])

In [None]:
ordinal_encoder = ordinal_encoder.fit(X)

In [None]:
ordinal_encoder.categories_

In [None]:
X_trans = ordinal_encoder.transform(X)
X_trans

In [None]:
ordinal_encoder.inverse_transform(X_trans)

## One Hot Encoding

```
Foundation: Type of foundation

       BrkTil	Brick & Tile
       CBlock	Cinder Block
       PConc	Poured Concrete
       Slab	Slab
       Stone	Stone
       Wood	Wood
```

In [None]:
X = housing_data[["Foundation"]]
X

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Leave the encoder at its default (sparse) output and densify the result
# explicitly. The dense-output keyword was renamed from `sparse` to
# `sparse_output` across scikit-learn releases, so passing either one fails
# on some versions, whereas `.toarray()` works on all of them.
one_hot_encoder = OneHotEncoder(categories="auto")
X_trans = one_hot_encoder.fit_transform(X).toarray()
X_trans

In [None]:
one_hot_encoder.categories_

In [None]:
one_hot_encoder.inverse_transform(X_trans)

You can also do this directly in pandas with `pd.get_dummies`:

In [None]:
pd.get_dummies(X)

## Multiple Columns

In [None]:
X = housing_data[["Foundation", "SaleCondition", "HouseStyle"]]
X

In [None]:
# One-hot encode all three categorical columns at once. The encoder keeps its
# default (sparse) output and the result is densified with `.toarray()`,
# because the `sparse` keyword was renamed to `sparse_output` in newer
# scikit-learn releases and then removed.
one_hot_encoder = OneHotEncoder(categories="auto")
X_trans = one_hot_encoder.fit_transform(X).toarray()
X_trans

In [None]:
one_hot_encoder.categories_

In [None]:
# Map the encoded matrix back to the original category labels. The encoder
# must NOT be re-fitted here: fitting it on the already-encoded 0/1 matrix
# would replace the learned categories and round-trip to 0/1 values instead
# of the original strings.
one_hot_encoder.inverse_transform(X_trans)

# Model Selection

## Train/Test Split

In [None]:
print(f"Full Dataset: {len(housing_data)} samples")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split reproducible.
TEST_SIZE = 0.30
train, test = train_test_split(
    housing_data, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

In [None]:
# Report the size of each split and its share of the full dataset. Both lines
# use the same thousands-separated count format.
print(f"Train Dataset: {len(train):,} samples ({len(train) / len(housing_data):.0%})")
print(f"Test Dataset: {len(test):,} samples ({len(test) / len(housing_data):.0%})")

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
def get_X_y(housing_data):
    """Split a housing frame into numeric features and the sale-price target.

    Parameters
    ----------
    housing_data : pandas.DataFrame
        Frame containing a ``"SalePrice"`` column plus candidate features.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a copy of every ``int64``/``float64`` column
        other than the target (in original column order) and ``y`` is a copy
        of the ``"SalePrice"`` column.
    """
    target = "SalePrice"
    numeric_dtypes = (np.dtype("int64"), np.dtype("float64"))

    # Keep only numeric columns; the target is never used as a feature.
    features = [
        col
        for col in housing_data.columns
        if col != target and housing_data[col].dtype in numeric_dtypes
    ]

    return housing_data[features].copy(), housing_data[target].copy()

In [None]:
X_train, y_train = get_X_y(train)

In [None]:
# Demonstration: fitting on the raw numeric features is expected to fail when
# they contain missing values. The error is caught and displayed so that
# "Restart & Run All" can continue to the imputation step below.
try:
    model.fit(X_train, y_train)
except ValueError as err:
    print(f"LinearRegression could not be fit on the raw features: {err}")

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill missing feature values using SimpleImputer's default strategy; the fill
# values are learned from the training features only.
imputer = SimpleImputer()
X_train_trans = imputer.fit_transform(X_train)

In [None]:
model.fit(X_train_trans, y_train)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Score the fitted model on the same (imputed) data it was trained on.
y_train_pred = model.predict(X_train_trans)
r2_train = r2_score(y_train, y_train_pred)
# ".3" formats the score to three significant digits.
print(f"Training R^2 = {r2_train:.3}")

In [None]:
X_test, y_test = get_X_y(test)

In [None]:
# Demonstration: the test features have not been imputed yet, so predicting on
# them is expected to fail when they contain missing values. The error is
# caught and displayed so that "Restart & Run All" can continue.
try:
    y_test_pred = model.predict(X_test)
except ValueError as err:
    print(f"Could not predict on the raw test features: {err}")

In [None]:
X_test_trans = imputer.transform(X_test)

In [None]:
# Score the model on the held-out test features, imputed with the
# already-fitted imputer.
y_test_pred = model.predict(X_test_trans)
r2_test = r2_score(y_test, y_test_pred)
# ".3" formats the score to three significant digits.
print(f"Test R^2 = {r2_test:.3}")

## Regularization

In [None]:
from sklearn.linear_model import Ridge

In [None]:
def fit_and_evaluate_model(model, train, test):
    """Fit ``model`` on imputed training features and score both splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    train, test
        Frames accepted by ``get_X_y``.

    Returns
    -------
    tuple
        ``(train R^2, test R^2)``.
    """
    features_train, target_train = get_X_y(train)
    features_test, target_test = get_X_y(test)

    # The imputer is fitted on the training split only and reused on the test split.
    imputer = SimpleImputer()
    filled_train = imputer.fit_transform(features_train)
    filled_test = imputer.transform(features_test)

    model.fit(filled_train, target_train)

    score_train = r2_score(target_train, model.predict(filled_train))
    score_test = r2_score(target_test, model.predict(filled_test))
    return score_train, score_test

In [None]:
r2_train, r2_test = fit_and_evaluate_model(
    Ridge(alpha=1_000, random_state=RANDOM_SEED), train, test
)

In [None]:
print(f"Train R^2 = {r2_train:.3}")
print(f"Test R^2 = {r2_test:.3}")

In [None]:
def fit_and_evaluate_model_with_scaling(model, train, test):
    """Scale, impute, fit ``model`` and score it on both splits.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    train, test
        Frames accepted by ``get_X_y``.

    Returns
    -------
    tuple
        ``(train R^2, test R^2)``.
    """
    features_train, target_train = get_X_y(train)
    features_test, target_test = get_X_y(test)

    scaler = StandardScaler()
    imputer = SimpleImputer()

    # Scale first, then impute. Both transformers are fitted on the training
    # split only and then applied, in the same order, to the test split.
    prepared_train = imputer.fit_transform(scaler.fit_transform(features_train))
    prepared_test = imputer.transform(scaler.transform(features_test))

    model.fit(prepared_train, target_train)

    score_train = r2_score(target_train, model.predict(prepared_train))
    score_test = r2_score(target_test, model.predict(prepared_test))
    return score_train, score_test

In [None]:
r2_train, r2_test = fit_and_evaluate_model_with_scaling(
    Ridge(alpha=1_000, random_state=RANDOM_SEED), train, test
)
print(f"Train R^2 = {r2_train:.3}")
print(f"Test R^2 = {r2_test:.3}")

## Cross Validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Show how each fold divides the training data. enumerate() supplies the
# 1-based fold number instead of a hand-maintained counter.
for fold, (train_index, val_index) in enumerate(splitter.split(train), start=1):
    print(f"Fold {fold}:")
    print(
        f"\tTrain Dataset: {len(train_index):,} samples ({len(train_index) / len(train):.0%})"
    )
    print(
        f"\tVal Dataset: {len(val_index)} samples ({len(val_index) / len(train):.0%})"
    )

In [None]:
print(val_index)

In [None]:
# Fit a fresh Ridge model on each fold and collect the per-fold training and
# validation R^2 scores. (The fold counter the loop used to maintain was never
# read, so it has been dropped.)
r2_trains = []
r2_vals = []
for train_index, val_index in splitter.split(train):
    r2_train, r2_val = fit_and_evaluate_model_with_scaling(
        Ridge(alpha=1_000, random_state=RANDOM_SEED),
        train.iloc[train_index],
        train.iloc[val_index],
    )

    r2_trains.append(r2_train)
    r2_vals.append(r2_val)

In [None]:
# Plot per-fold scores. The x positions are derived from the number of
# collected scores, so the plot stays correct if the number of splits changes.
fold_numbers = range(1, len(r2_trains) + 1)

fig, ax = plt.subplots()
ax.plot(fold_numbers, r2_trains, "o--", label="Training")
ax.plot(fold_numbers, r2_vals, "o--", label="Validation")
ax.legend(title="Dataset", bbox_to_anchor=(1, 1))
ax.set_xlabel("Fold")
ax.set_ylabel("$R^{2}$")
ax.set_title("K-Fold $R^{2}$")
# Anchor the y-axis at zero while keeping the automatic upper limit.
ax.set_ylim((0, ax.get_ylim()[1]))
None

## Doing a Hyperparameter Search

In [None]:
alpha_values = np.logspace(-1, 6, 8)
print(alpha_values)

In [None]:
# Manual grid search: for every candidate alpha, run the K-fold loop and
# record the mean training and validation R^2 across folds. (The fold counter
# the inner loop used to maintain was never read, so it has been dropped.)
avg_r2_trains = []
avg_r2_vals = []
for alpha in alpha_values:
    r2_trains = []
    r2_vals = []
    for train_index, val_index in splitter.split(train):
        r2_train, r2_val = fit_and_evaluate_model_with_scaling(
            Ridge(alpha=alpha, random_state=RANDOM_SEED),
            train.iloc[train_index],
            train.iloc[val_index],
        )

        r2_trains.append(r2_train)
        r2_vals.append(r2_val)

    avg_r2_trains.append(np.mean(r2_trains))
    avg_r2_vals.append(np.mean(r2_vals))

In [None]:
# Mean cross-validated scores against regularization strength, on a log x-axis
# because the alpha grid spans several orders of magnitude.
fig, ax = plt.subplots()
for scores, label in ((avg_r2_trains, "Training"), (avg_r2_vals, "Validation")):
    ax.plot(alpha_values, scores, "o--", label=label)
ax.legend(title="Dataset", bbox_to_anchor=(1, 1))
ax.set(
    xlabel="$L^{2}$ Regularization Strength ($\\alpha$)",
    title="Grid Search $R^{2}$",
)
ax.semilogx()
None

# Easy to Tie Yourself Up In Knots

- We picked out features in `get_X_y()`
- To add the scaler, we had to write a new `fit_and_evaluate_model()` function.
- We have to remember to apply the same transformations to the test data, in the same order, as we did to the training data.
- Remember: _everything_ is the model.

# Solution: Pipelines

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Chain scaling, imputation and the Ridge estimator into a single object so
# the same steps run, in the same order, whenever it is fitted or used to predict.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("imputer", SimpleImputer()),
    ("estimator", Ridge()),
]
pipeline = Pipeline(pipeline_steps)
pipeline

In [None]:
# "<step name>__<parameter>" addresses a parameter of a pipeline step; here
# the Ridge estimator's alpha is searched over the same grid as the manual loop.
param_grid = {"estimator__alpha": alpha_values}
# Reuse the shuffled, seeded KFold splitter from the manual cross-validation
# above, so both searches evaluate on identical folds and their results are
# directly comparable (an integer `cv` would use different folds).
grid_search = GridSearchCV(
    pipeline, param_grid, cv=splitter, scoring="r2", return_train_score=True
)

In [None]:
X_train, y_train = get_X_y(train)
X_test, y_test = get_X_y(test)

In [None]:
grid_search = grid_search.fit(X_train, y_train)

In [None]:
results = pd.DataFrame(grid_search.cv_results_)
results

In [None]:
# Mean train/validation scores indexed by alpha, plotted on a log x-axis.
mean_scores = results.set_index("param_estimator__alpha")[
    ["mean_train_score", "mean_test_score"]
]
ax = mean_scores.plot(marker="o", linestyle="--")
ax.semilogx()
None