# Week 4 Exercise

Let's put our examples of classifying Chinstrap vs. the rest into a full ML workflow.
Below, look for chunks of text giving you instructions on filling in the missing code.

In [None]:
### Step 0: Setup ----

# If needed in a fresh Colab:
# !pip install seaborn mlxtend statsmodels

import numpy as np # all things math
import pandas as pd # data frames, our workhorse for data

# libraries for graphics
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn packages
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline

# for feature selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm


In [None]:
### Step 1: Preprocessing ----

# The target will be Chinstrap vs. every other species, which keeps
# ROC/AUC a clean binary logistic problem.

# No manual one-hot encoding happens here: the encoder lives inside the
# preprocessing pipeline, so it is fit only on the training data.

# Load PalmerPenguins from seaborn
penguins = sns.load_dataset("penguins")

# Column dtypes and non-null counts
penguins.info()

# Per-column count of missing values (header line, then the counts)
print("\nMissing values:", penguins.isna().sum(), sep="\n")


In [None]:
# Set up features and target, can do here or in feature engineering

# Outcome: species, converted to a 0/1 indicator further down
target = "species"

# Categorical predictor (missing values are imputed later in the pipeline)
cat_features = ["sex"]

# Numeric predictors (missing values are imputed later in the pipeline)
num_features = [
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]

# Drop rows where the target is missing (just to be safe)
penguins = penguins.dropna(subset=[target])

# Design matrix: numeric columns first, then the categorical column
X = penguins.loc[:, num_features + cat_features]

# One-vs-rest binary target: Chinstrap = 1, any other species = 0
y = penguins[target].eq("Chinstrap").astype(int)

print("\nX summary statistics:", X.describe(), sep="\n")

print("\ny summary statistics:", y.describe(), sep="\n")


In [None]:
### 2. Train/Test Split ----

# Hold out 30% of the rows for testing. Stratifying on y keeps the share of
# Chinstrap rows the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.30,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)




In [None]:
### 3. Exploratory Data Analysis ----

# Explore the training split only; the test split is left untouched.
train_eda = X_train.copy()

# y_train is 1 for Chinstrap and 0 for every other species, so the legend
# labels must say exactly that (not "Gentoo"/"Adelie").
train_eda["species"] = np.where(y_train == 1, "Chinstrap", "Other")

# Drop rows with missing values in the plotted columns so seaborn doesn't complain
sns.pairplot(
    data=train_eda.dropna(subset=num_features + ["species"]),
    vars=num_features,
    hue="species",
    diag_kind="kde",
    corner=True # speed up plotting time by only looking at one triangle
)
plt.show()



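Below, build pipelines for numeric and categorical variables that:

1. Impute missing values, using `KNNImputer` for numeric variables and `SimpleImputer` with the most frequent value for categorical variables.
2. Standardize numeric variables.
3. Encode categorical variables as binary indicators using `OneHotEncoder`.

Use `Pipeline` and `ColumnTransformer` to put this together. Name the categorical entry of the `ColumnTransformer` `"cat"` and the encoder step inside its pipeline `"onehot"`, because the feature-name cell further down looks them up by those names. Fit the preprocessor on the training set only, then apply it to both the training and test sets.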

In [None]:
### 4. Feature Engineering ----

# Exercise cell: the three `[]` placeholders below must be replaced before this
# cell will run (a plain list has no fit_transform/transform methods).
#
# The finished preprocessor should:
#   - apply KNNImputer + StandardScaler to the numeric features
#   - apply SimpleImputer (most frequent) + OneHotEncoder to sex
# We’ll use this preprocessor once to create the design matrix for modeling.

# Numeric transformer: a Pipeline of KNNImputer followed by StandardScaler
numeric_transformer = [] # remove brackets and fill in the blank

# Categorical transformer: a Pipeline of most-frequent imputation followed by
# one-hot encoding. The feature-name cell below looks the encoder up via
# named_steps["onehot"], so name that step "onehot".
categorical_transformer = [] # remove brackets and fill in the blank

# ColumnTransformer to apply each transformer to the right columns.
# Pass a list of (name, transformer, columns) tuples specifying which
# transformation applies to which columns. The feature-name cell below reads
# named_transformers_["cat"], so name the categorical entry "cat"; it also
# builds the name list as numeric-then-categorical, so list the numeric entry first.
preprocessor = [] # remove brackets and fill in the blank

# Fit the preprocessor on the training data only, and transform it
X_train_proc = preprocessor.fit_transform(X_train)

# Reuse the already-fitted preprocessor on the test data (transform only, no refit)
X_test_proc  = preprocessor.transform(X_test)

print("Processed train shape:", X_train_proc.shape)
print("Processed test shape:", X_test_proc.shape)



In [None]:
# Recover the column names of the processed matrix
# (needed for stepwise selection and statsmodels later).

# Numeric names are reused unchanged; this assumes the numeric transformer
# keeps one output column per input column, in the same order.
num_feature_names = num_features

# The categorical names have to come from the fitted encoder, because one-hot
# encoding generates new indicator columns. The encoder is the "onehot" step
# of the "cat" transformer inside the preprocessor.
cat_pipeline = preprocessor.named_transformers_["cat"]
ohe = cat_pipeline.named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(cat_features)

# Numeric names first, then categorical (assumes the preprocessor emits
# its columns in that order).
feature_names = np.concatenate((num_feature_names, cat_feature_names))
print("Feature names after preprocessing:", feature_names, sep="\n")


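Use `StratifiedKFold` to create a cross-validation object with 10 folds, stratified by the outcome variable. Set `shuffle=True` and `random_state=42`; scikit-learn raises an error if `random_state` is set while `shuffle` is left at its default of `False`.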

In [None]:
### 5. Modeling ----

# We will:
# 1. Fit a “kitchen sink” logistic model using all processed features.
# 2. Use stepwise (SFS) to select a subset of features.
# 3. Evaluate both models using 10-fold cross-validation AUC on the training set.


# Define CV sets
# Exercise placeholder: `cv` should end up holding the 10-fold stratified
# splitter described in the instructions; the empty list is only a stand-in.
cv = [] # remove brackets and fill in the blank


Fill out the arguments of `cross_val_score` to perform 10-fold cross validation on your logistic regression model. This is classification, so use `'roc_auc'` as the evaluation metric.

In [None]:
# Kitchen sink model: logistic regression on every preprocessed feature
log_reg_full = LogisticRegression(
    solver="lbfgs",
    max_iter=2000
)

# Exercise placeholder: replace `[]` with a cross_val_score(...) call that uses
# 'roc_auc' scoring. Until then the .mean() call below fails, because a plain
# list has no such method.
scores_full = [] # remove brackets and fill in the blank

print("Full model CV AUC scores:", scores_full)
print("Full model mean AUC:", scores_full.mean())

In [None]:
# Stepwise selection using SFS (on preprocessed training data).
# Candidate subsets are scored by cross-validated ROC AUC of a logistic
# regression configured like the full model.
sfs = SFS(
    LogisticRegression(max_iter=2000, solver="lbfgs"),
    k_features="best",  # let SFS choose the best number of features
    forward=True,
    floating=True,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
sfs = sfs.fit(X_train_proc, y_train)

# Column positions of the chosen subset, then their names
selected_idx = [*sfs.k_feature_idx_]
selected_feature_names = feature_names[selected_idx]

print("\nSelected feature indices:", selected_idx)
print("Selected feature names:", selected_feature_names)



In [None]:
# Keep only the columns chosen by stepwise selection, using the same
# column positions for the train and test matrices
X_train_step, X_test_step = (
    matrix[:, selected_idx] for matrix in (X_train_proc, X_test_proc)
)


Similarly, use `cross_val_score` to perform 10-fold cross validation on the model selected by `SFS`, above. Note that we have subset the X matrices (test and train) based on the selected variables in the chunk above.

In [None]:
# Evaluate the stepwise model on the 10-fold CV set
# Same estimator settings as the full model, so only the feature set differs
log_reg_step = LogisticRegression(
    solver="lbfgs",
    max_iter=2000
)

# Exercise placeholder: replace `[]` with a cross_val_score(...) call on the
# stepwise-selected training columns. Until then the .mean() call below fails,
# because a plain list has no such method.
scores_step = [] # remove brackets and fill in the blank

print("\nStepwise model CV AUC scores:", scores_step)
print("Stepwise model mean AUC:", scores_step.mean())


In [None]:
### 6. Evaluation on the test set ----

# For each model: refit on all training rows, then take the predicted
# probability of class 1 (Chinstrap) for every test row.
y_prob_full = log_reg_full.fit(X_train_proc, y_train).predict_proba(X_test_proc)[:, 1]
y_prob_step = log_reg_step.fit(X_train_step, y_train).predict_proba(X_test_step)[:, 1]

# Area under the ROC curve on the held-out test set
auc_full_test = roc_auc_score(y_test, y_prob_full)
auc_step_test = roc_auc_score(y_test, y_prob_step)

print(f"\nTest AUC - Full model:     {auc_full_test:.3f}")
print(f"Test AUC - Stepwise model: {auc_step_test:.3f}")


In [None]:
# Plot both ROC curves on one set of axes
fpr_full, tpr_full, _ = roc_curve(y_test, y_prob_full)
fpr_step, tpr_step, _ = roc_curve(y_test, y_prob_step)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr_full, tpr_full, label=f"Full model (AUC={auc_full_test:.3f})")
ax.plot(fpr_step, tpr_step, label=f"Stepwise model (AUC={auc_step_test:.3f})")
# Dashed diagonal: reference line for a classifier that guesses at random
ax.plot([0, 1], [0, 1], "k--", label="Random")

ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curves on Test Set",
)
ax.legend()
plt.show()


In [None]:
### Bonus: Refit the stepwise model in statsmodels for inference ----
# 1. Take the preprocessed training data
# 2. Keep only the selected columns
# 3. Fit a statsmodels.Logit model for inference (coefficients, standard errors, etc.)

# Wrap the selected columns in DataFrames so the summary shows feature names.
# Reusing the original row index keeps the rows aligned with y_train / y_test.
X_train_step_df = pd.DataFrame(
    data=X_train_step,
    index=X_train.index,
    columns=selected_feature_names,
)

X_test_step_df = pd.DataFrame(
    data=X_test_step,
    index=X_test.index,
    columns=selected_feature_names,
)

# statsmodels does not add an intercept on its own, so add a constant column.
# X_test_sm is prepared here but not used in this cell.
X_train_sm = sm.add_constant(X_train_step_df)
X_test_sm = sm.add_constant(X_test_step_df, has_constant="add")

# y_train is already numeric 0/1 and shares its index with X_train_sm
logit_sm = sm.Logit(endog=y_train, exog=X_train_sm).fit()
print(logit_sm.summary())
