# Prompt

## Try-it 9.2: Predicting Wages

This activity is meant to summarize your work with regularized regression models. You will combine your earlier work on data preparation and pipelines with what you've learned about grid searches to determine an optimal model. In addition to the prior strategies, this example is an excellent opportunity to use the `TransformedTargetRegressor` estimator in scikit-learn.

## The Data

This dataset is loaded from the OpenML repository. Originally drawn from census data, it contains wage and demographic information on 534 individuals.

From the dataset documentation [here](https://www.openml.org/d/534):

> The Current Population Survey (CPS) is used to supplement census information between census years. These data consist of a random sample of 534 persons from the CPS, with information on wages and other characteristics of the workers, including sex, number of years of education, years of work experience, occupational status, region of residence and union membership.

## Task

Build regression models to predict `WAGE`.  Incorporate the categorical features and transform the target using a logarithm.  Build `Ridge` models and consider some different amounts of regularization.  

After fitting your model, interpret the model and try to understand what features led to higher wages.  Consider using `permutation_importance` that you encountered in module 8.  Discuss your findings in the class forum.

For an in-depth example discussing the perils of interpreting the coefficients, see the scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html).

# Imports

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.compose import (
    make_column_transformer,
    TransformedTargetRegressor,
    make_column_selector,
)
from sklearn.utils import Bunch
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_squared_error

from IPython.display import Image

In [None]:
# Notebook-wide settings: suppress all warnings, lift the cap on the number
# of DataFrame columns pandas displays, and turn on axis grids in matplotlib.
warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
mpl.rcParams["axes.grid"] = True

# Data Load

In [None]:
# Fetch OpenML dataset 534 as pandas objects and keep its `frame` attribute.
wage_dataset = fetch_openml(data_id=534, as_frame=True)
df = wage_dataset.frame

In [None]:
# Keep only rows with 1 < WAGE < 30 (snips 2 outliers).
wage_column = df["WAGE"]
df = df[(wage_column > 1) & (wage_column < 30)]

In [None]:
# Preview the first rows of the outlier-filtered frame.
df.head()

# Feature Naming

In [None]:
# Split the column names into the regression target, the categorical
# columns (to be one-hot encoded) and the remaining non-categorical columns.
target_feature = "WAGE"
is_categorical = df.dtypes == "category"
one_hot_features = df.columns[is_categorical].to_list()
numeric_features = [
    column for column in df.columns[~is_categorical] if column != target_feature
]
display([numeric_features, one_hot_features])

# Assign to Feature and Target Frames

In [None]:
# Target series first, then the feature frame with the target column removed.
y = df[target_feature]
X = df.drop(columns=[target_feature])

In [None]:
# Preview the first rows of the feature frame (target column dropped).
X.head()

# Develop Processing Pipeline

## Polynomial Features

In [None]:
# Numeric branch: standardize the numeric columns, then expand them into
# polynomial features without a bias column. The step names "scaler" and
# "poly" are referenced by the grid-search parameter names, so keep them.
numeric_scaler = make_column_transformer((StandardScaler(), numeric_features))
poly_pipe = Pipeline(
    steps=[
        ("scaler", numeric_scaler),
        ("poly", PolynomialFeatures(include_bias=False)),
    ]
)
poly_pipe.fit(X)

display(poly_pipe)

In [None]:
# pd.DataFrame(
#     poly_pipe.transform(X), columns=poly_pipe.get_feature_names_out()
# )

## One Hot Features

In [None]:
# Categorical branch: one-hot encode the categorical columns, collapsing
# binary categories to a single indicator column (drop="if_binary").
#
# sparse_threshold=0 makes the column transformer always return a dense
# array. With the default threshold the output silently switches to a sparse
# matrix whenever the encoded block's density drops below 0.3, and the
# StandardScaler that follows the feature union in ridge_pipe centers the
# data, which it cannot do on sparse input.
ohe_pipe = Pipeline(
    [
        (
            "ohe",
            make_column_transformer(
                (
                    OneHotEncoder(drop="if_binary"),
                    one_hot_features,
                ),
                sparse_threshold=0,
            ),
        ),
    ]
).fit(X)

display(ohe_pipe)

In [None]:
# pd.DataFrame(ohe_pipe.transform(X), columns=ohe_pipe.get_feature_names_out())

## Union of Features

In [None]:
# Concatenate the outputs of the polynomial branch and the one-hot branch.
# The names "poly_pipe" / "ohe_pipe" appear in the grid-search parameter
# names, so they must stay as they are.
feature_union = FeatureUnion(
    transformer_list=[("poly_pipe", poly_pipe), ("ohe_pipe", ohe_pipe)]
)
feature_union.fit(X)

display(feature_union)

In [None]:
# pd.DataFrame(
#     feature_union.transform(X), columns=feature_union.get_feature_names_out()
# )

## Ridge Pipeline

In [None]:
# Full feature pipeline followed by a second standardization of the combined
# feature matrix and a Ridge regressor with an intercept. Left unfitted here.
ridge_steps = [
    ("feature_union", feature_union),
    ("scaler", StandardScaler()),
    ("ridge", Ridge(fit_intercept=True)),
]
ridge_pipe = Pipeline(steps=ridge_steps)

display(ridge_pipe)

## Transformed Target Regression

In [None]:
# Wrap the ridge pipeline so it is trained on log(y) and its predictions are
# mapped back with exp; then fit this baseline (default hyperparameters) on
# all of the data.
log_target_regressor = TransformedTargetRegressor(
    regressor=ridge_pipe,
    func=np.log,
    inverse_func=np.exp,
)
ttr_pipe = Pipeline(steps=[("ttr", log_target_regressor)])
ttr_pipe.fit(X, y)

ttr_pipe

# Grid Search over Hyperparameters

In [None]:
# Split row positions (not rows) 75% / 25% into train and dev index lists,
# seeded for reproducibility.
row_positions = range(len(df))
train_inds, dev_inds = train_test_split(
    row_positions,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Candidate hyperparameters: polynomial degree 1-3 and 51 ridge penalties
# spaced logarithmically from 1e-5 to 1e5.
degree_list = range(1, 4)
alpha_list = np.logspace(-5, 5, num=51)
param_grid = {
    "ttr__regressor__feature_union__poly_pipe__poly__degree": degree_list,
    "ttr__regressor__ridge__alpha": alpha_list,
}

# A single predefined train/dev split is used in place of k-fold CV; scores
# are negated MSE, so larger is better.
single_split = [(train_inds, dev_inds)]
grid_search = GridSearchCV(
    ttr_pipe,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=single_split,
)
grid_search.fit(X, y)

In [None]:
# Show the hyperparameter combination with the best dev-set score.
grid_search.best_params_

In [None]:
# GridSearchCV was constructed with the default refit=True, so after the
# search `best_estimator_` has already been refitted with the best
# hyperparameters on the full (X, y) passed to `grid_search.fit`. Fitting it
# again would repeat the same deterministic work.
ttr_pipe_best = grid_search.best_estimator_
ttr_pipe_best

# Plot of Model Error vs. Parameters

In [None]:
# One row per hyperparameter combination with its dev-set score. Column
# names are shortened to the text after the last "__" (e.g. "degree",
# "alpha").
cv_results = grid_search.cv_results_
results_df = pd.DataFrame(cv_results["params"]).assign(
    mean_test_score=cv_results["mean_test_score"]
)
results_df = results_df.rename(columns=lambda name: name.rsplit("__", 1)[-1])
results_df

In [None]:
# Look the winning values up by their parameter names rather than by their
# position in `best_params_`, so adding or renaming a grid parameter cannot
# silently swap or misassign them.
best_poly_degree = grid_search.best_params_[
    "ttr__regressor__feature_union__poly_pipe__poly__degree"
]
best_alpha = grid_search.best_params_["ttr__regressor__ridge__alpha"]

In [None]:
# x-axis: log10 of the inverse penalty, so regularization weakens to the
# right. y-axis: MSE, recovered by negating the "neg_mean_squared_error"
# score. One line per polynomial degree.
log10_inverse_alpha = np.log10(1.0 / results_df["alpha"])
dev_mse = -results_df["mean_test_score"]
plot_title = (
    "Model Performance vs. Ridge Alpha, Colored by Polynomial Degree<br>Best Alpha = %.2f, Best Poly Deg = %d"
    % (best_alpha, best_poly_degree)
)
axis_labels = {
    "x": "log10(1/alpha)",
    "y": "MSE",
    "degree": "Polynomial Degree",
}

fig = px.line(
    results_df,
    x=log10_inverse_alpha,
    y=dev_mse,
    color="degree",
    labels=axis_labels,
    title=plot_title,
)
fig.update_layout(title_x=0.5)

# Render to a static PNG so the figure is embedded in the notebook output.
Image(fig.to_image(format="png", width=1200, scale=2))

# Scatter Plot of Predicted vs. Truth Data

In [None]:
# Predictions on the full data set from the baseline pipeline and from the
# grid-search winner, in that order.
y_pred, y_pred_best = (model.predict(X) for model in (ttr_pipe, ttr_pipe_best))

In [None]:
# MSE of each pipeline over the full data set. Both calls use the documented
# (y_true, y_pred) argument order, and the best pipeline's predictions are
# reused from `y_pred_best` instead of being recomputed.
mse_basic = mean_squared_error(y, y_pred)  # was - 18.416299035907095
mse_best = mean_squared_error(y, y_pred_best)
[mse_basic, mse_best]

In [None]:
# Overlay predicted-vs-true wage markers for the baseline and the tuned
# pipeline; each legend entry carries that model's MSE.
traces = (
    (y_pred, "Basic Pipeline, MSE = %.4f" % mse_basic),
    (y_pred_best, "Best Pipeline, MSE = %.4f" % mse_best),
)

fig = go.Figure()
for predictions, legend_name in traces:
    fig.add_trace(
        go.Scatter(x=y, y=predictions, mode="markers", name=legend_name)
    )

fig.update_layout(
    title="Prediction vs. Truth for Basic and Best Pipelines, Colored by Model Type",
    xaxis_title="Wage - Truth",
    yaxis_title="Wage - Predicted",
    title_x=0.5,
)

# Render to a static PNG so the figure is embedded in the notebook output.
Image(fig.to_image(format="png", width=1200, scale=2))

# Permutation Importance

In [None]:
def make_permutation_importance_DataFrame(
    model,
    X: pd.DataFrame,
    y: pd.Series,
    permutation_importance_kwargs,
) -> pd.DataFrame:
    """Return per-repeat permutation importances as a DataFrame.

    Runs ``permutation_importance(model, X, y, **permutation_importance_kwargs)``
    and lays the raw importances out with one row per repeat and one column
    per column of ``X``. Columns are ordered from lowest to highest mean
    importance, so the last column is the most important feature.
    """
    result = permutation_importance(model, X, y, **permutation_importance_kwargs)

    # `importances` is indexed (feature, repeat); transpose to (repeat, feature).
    per_repeat = pd.DataFrame(result.importances.T, columns=X.columns)

    # Feature names sorted by ascending mean importance.
    ascending = np.argsort(result.importances_mean)
    ordered_features = X.columns[ascending].tolist()
    return per_repeat[ordered_features]

In [None]:
# Importance per feature
df_pi = make_permutation_importance_DataFrame(
    ttr_pipe_best,
    X,
    y,
    {"random_state": 42, "n_repeats": 50},
)

df_pi.head()

In [None]:
# Mean importance per feature, computed once. `df_pi` columns are ordered
# from lowest to highest mean importance, so the last entry is the top
# feature. `.iloc[-1]` selects it by position; indexing the label-indexed
# Series with a bare `[-1]` relies on a deprecated positional fallback.
mean_importance = df_pi.mean()
top_feature = df_pi.columns[-1]
relative_importance = mean_importance / mean_importance.iloc[-1] * 100.0

Image(
    px.bar(
        data_frame=relative_importance,
        color=df_pi.std(),
        orientation="h",
        title="Permutation Importance per Feature<br>Considering %d Shuffles per Feature"
        % len(df_pi),
        labels={
            "value": "Permutation Importance, as a Percentage of %s Importance"
            % top_feature,
            "index": "Feature",
            "color": "Standard Deviation",
        },
    )
    .update_layout(title_x=0.5)
    .update_xaxes(tickvals=list(range(5, 105, 5)), range=[0, 100])
    .to_image(format="png", width=1200, scale=2)
)