# Intro

- [Kaggle](https://www.kaggle.com/)
- [Stack Overflow](https://stackoverflow.com/)
- [`scikit-learn`](https://scikit-learn.org/stable/)

# Setup

In [1]:
# !pip install numpy==1.24.3
# !pip install pandas==2.1.4
# !pip install scikit-learn==1.3.2

In [None]:
import numpy as np  # For numerical computation when a dataframe isn't available
import pandas as pd  # For reading/manipulating data
from sklearn.impute import SimpleImputer  # For imputing missing values
from sklearn.linear_model import LogisticRegression  # Simple classifier
from sklearn.model_selection import train_test_split  # Split train data into train/val
from sklearn.preprocessing import MinMaxScaler  # Simple preprocessing step

In [None]:
# Titanic data set
# Both CSV files are read from the current working directory and indexed by
# PassengerId. `train` carries the "Survived" target column (separated out in
# the next cell); `test` is loaded the same way.
train = pd.read_csv("train.csv", index_col="PassengerId")
test = pd.read_csv("test.csv", index_col="PassengerId")

In [None]:
# Pull the target out of the labelled frame; everything else is a feature.
y = train["Survived"]
X = train.drop(columns=["Survived"])

In [None]:
# Split into train/val data.
# stratify=y keeps the proportion of each Survived class the same in both
# parts, and random_state=0 makes the split reproducible. No test_size is
# given, so scikit-learn's documented default (25% held out) applies.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)

In [None]:
# Keep only the float-typed columns; the same column list is applied to both
# splits so train and validation stay aligned.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float, X_val_float = (frame[cont_cols] for frame in (X_train, X_val))

In [None]:
# Scale data.
# Each transformer is fitted on the training split only and then applied
# unchanged to the validation split, so no validation statistics leak in.
# Scaling before imputing works because MinMaxScaler is documented to ignore
# NaNs when fitting and to keep them as NaN when transforming.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_float)
X_val_scaled = scaler.transform(X_val_float)

# Impute data
# SimpleImputer() with no arguments uses its default strategy (column mean).
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_val_imputed = imputer.transform(X_val_scaled)

In [None]:
# Train a logistic-regression baseline on the scaled + imputed training data,
# then report its accuracy on the validation split (the cell's output).
clf = LogisticRegression(random_state=0).fit(X_train_imputed, y_train)
clf.score(X_val_imputed, y_val)

# [`Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Same scale -> impute -> classify chain as the manual steps above, bundled
# into a single estimator that is fitted and scored in one go.
pipe = Pipeline(
    [
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train_float, y_train)
pipe.score(X_val_float, y_val)

In [None]:
# HTML representation of the pipe.
pipe

# Expanding Feature Space

In [None]:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder

In [None]:
X_train.nunique().sort_values()

In [None]:
# Columns treated as categorical/ordinal; the same subset is taken from both splits.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc, X_val_enc = (frame[cat_ord_cols] for frame in (X_train, X_val))

In [None]:
# One-hot-encode categorical/ordinal features.
# drop="first" removes one indicator column per feature, sparse_output=False
# returns a dense ndarray (needed for np.hstack later), and max_categories=5
# caps the number of output columns per feature by grouping infrequent categories.
# NOTE(review): handle_unknown is left at its default ("error"), so a category
# that appears only in the validation data would make transform() raise — confirm
# this is acceptable for the data at hand.
ohe = OneHotEncoder(drop="first", sparse_output=False, max_categories=5)
X_train_ohe = ohe.fit_transform(X_train_enc)
X_val_ohe = ohe.transform(X_val_enc)

In [None]:
# Target encode categorical/ordinal features.
# TargetEncoder needs y at fit time. Per the scikit-learn documentation,
# fit_transform() uses an internal cross-fitting scheme, so its output for the
# training data is NOT the same as fit().transform(); random_state=0 keeps that
# cross-fitting reproducible. Validation data is encoded with plain transform().
tgt = TargetEncoder(random_state=0)
X_train_tgt = tgt.fit_transform(X_train_enc, y_train)
X_val_tgt = tgt.transform(X_val_enc)

In [None]:
# Place the one-hot columns and the target-encoded columns side by side
# (column-wise concatenation, one-hot block first).
X_train_feat_union = np.concatenate((X_train_ohe, X_train_tgt), axis=1)
X_val_feat_union = np.concatenate((X_val_ohe, X_val_tgt), axis=1)

In [None]:
# Refit the existing classifier on the combined encodings and report
# validation accuracy (the cell's output).
clf.fit(X_train_feat_union, y_train).score(X_val_feat_union, y_val)

# [`FeatureUnion`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)

In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
# FeatureUnion applies every listed transformer to the same input and
# concatenates their outputs column-wise in list order — the same layout as the
# manual np.hstack((one-hot, target-encoded)) built earlier. The encoders are
# configured identically to the standalone ones so the results can be compared.
cat_ord = FeatureUnion(
    transformer_list=[
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ],
)
# y_train is passed because the target encoder needs the labels when fitting.
_train = cat_ord.fit_transform(X_train_enc, y_train)
_val = cat_ord.transform(X_val_enc)

In [None]:
np.allclose(a=X_train_feat_union, b=_train)

In [None]:
np.allclose(a=X_val_feat_union, b=_val)

In [None]:
# Chain the categorical/ordinal feature union into a classifier, fit it on the
# training split and report validation accuracy.
pipe = Pipeline(
    [
        ("cat_ord", cat_ord),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train_enc, y_train)
pipe.score(X_val_enc, y_val)

In [None]:
pipe

# Transform Feature Subsets

In [None]:
# Limit to features with dtype float.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float = X_train[cont_cols]
X_val_float = X_val[cont_cols]

# Continuous feature pipeline.
# This pipeline has no final estimator: it is a pure transformer (scale, then
# impute) that is fitted on the training split and reused on the validation split.
cont_pipe = Pipeline(
    steps=[
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
    ],
)
X_train_cont = cont_pipe.fit_transform(X_train_float)
X_val_cont = cont_pipe.transform(X_val_float)

In [None]:
# Limit to categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc = X_train[cat_ord_cols]
X_val_enc = X_val[cat_ord_cols]

# Categorical/ordinal feature union.
# Each encoder sees the same input columns; their outputs are concatenated
# column-wise (one-hot block first, then the target-encoded block).
cat_ord = FeatureUnion(
    transformer_list=[
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ],
)
# y_train is required here because the target encoder is fitted on the labels.
X_train_feat_union = cat_ord.fit_transform(X_train_enc, y_train)
X_val_feat_union = cat_ord.transform(X_val_enc)

In [None]:
# Column-wise concatenation: continuous features first, then the encoded
# categorical/ordinal features.
X_train_join = np.concatenate((X_train_cont, X_train_feat_union), axis=1)
X_val_join = np.concatenate((X_val_cont, X_val_feat_union), axis=1)

In [None]:
# Refit the existing classifier on the joined feature matrix and report
# validation accuracy (the cell's output).
clf.fit(X_train_join, y_train).score(X_val_join, y_val)

# [`ColumnTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# ColumnTransformer routes each column subset to its own transformer and
# concatenates the results in the order listed — continuous block first, then
# the categorical/ordinal block — matching the manual np.hstack layout above.
# remainder="drop" discards every column not named in either subset.
col_trf = ColumnTransformer(
    transformers=[
        ("cont_pipe", cont_pipe, cont_cols),
        ("cat_ord", cat_ord, cat_ord_cols),
    ],
    remainder="drop",
)
# The full feature frame is passed in; column selection now happens inside col_trf.
_train = col_trf.fit_transform(X_train, y_train)
_val = col_trf.transform(X_val)

In [None]:
np.allclose(a=X_train_join, b=_train)

In [None]:
np.allclose(a=X_val_join, b=_val)

In [None]:
# End-to-end estimator: column-wise preprocessing followed by the classifier,
# fitted directly on the raw training frame and scored on the validation frame.
pipe = Pipeline(
    [
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train, y_train)
pipe.score(X_val, y_val)

In [None]:
pipe

# Custom Transformer

In [None]:
X_train.Name.sample(n=5)

In [None]:
import re

def get_title(
    text: str,
    title_pattern: str = r"Mrs?|Miss|Master",
) -> str | None:
    """Get a passenger's title if present.
    
    If more than one title found, return the least number of characters.
    
    The defalut title_pattern will detect:
    - Mr
    - Mrs
    - Miss
    - Master
    """
    possible_titles: set[str] = set(re.findall(pattern=title_pattern, string=text))
    title: list[str] = sorted(possible_titles, key=len)
    if title:
        return title.pop(0)

In [None]:
# Assert function extracts expected title.
assert get_title("Turpin, Mr. William John Robert") == "Mr"

In [None]:
# Assert function returns None if no title is present.
assert get_title("Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)") is None

In [None]:
# Assert function returns title with least number of characters.
assert get_title("Mr. and Mrs. Smith") == "Mr"

In [None]:
X_train.Name.apply(get_title).value_counts(dropna=False)

In [None]:
# Derive a Title column from each passenger's Name (None where no title matches).
X_train_title = X_train.assign(Title=X_train["Name"].map(get_title))
X_val_title = X_val.assign(Title=X_val["Name"].map(get_title))

In [None]:
from sklearn.impute import KNNImputer

# Standalone KNNImputer with default settings.
# NOTE(review): the pipelines in the following cells construct their own
# KNNImputer() instead of using this variable — confirm whether it is still needed.
knn_imputer = KNNImputer()

In [None]:
# One-hot-encode title and join with age.
# Age is passed through untouched (missing values included). drop=[None] drops
# the indicator column for the "no title" category of the single Title feature,
# so passengers without a title are encoded as all zeros.
age_trf = ColumnTransformer(
    transformers=[
        ("age", "passthrough", ["Age"]),
        ("ohe", OneHotEncoder(drop=[None], sparse_output=False), ["Title"]),
    ],
    remainder="drop",
)

# Impute missing values (age) using title and age
# set_output(transform="pandas") makes the pipeline return a DataFrame, so the
# imputed rows can later be looked up by passenger index with .loc.
knn_impute_pipe = Pipeline(
    steps=[
        ("age_trf", age_trf),
        ("knn_impute", KNNImputer()),
    ],
).set_output(transform="pandas")

In [None]:
# For every title, pick the first passenger (by index) whose age is missing.
title_missing_age_sample = (
    X_train_title[X_train_title["Age"].isna()]
    .groupby("Title")
    .apply(lambda group: group.index[0])
)

# Impute age two ways — plain column mean vs. KNN on age + title — and show the
# results for the sampled passengers side by side.
imputer = SimpleImputer().set_output(transform="pandas")
_mean = imputer.fit_transform(X_train_title[["Age"]]).loc[title_missing_age_sample]
_knn = knn_impute_pipe.fit_transform(X_train_title).loc[title_missing_age_sample]
_impute = pd.concat([_mean, _knn], axis=1, keys=["Mean", "KNN"])
_impute

# [`FunctionTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# Vectorize get_title so it accepts array-like input (applied element-wise).
# otypes=[str] pins the output to a string dtype, so a missing title is always
# returned as the string "None". Without it, np.vectorize infers the output
# dtype from the first element only: if that first name had no title, the
# result would be an object array holding real None values, and the downstream
# OneHotEncoder(drop=["None"]) would no longer find its "None" category.
get_title_vec = np.vectorize(get_title, otypes=[str])

In [None]:
# Convert get_title into an sklearn transformer
# FunctionTransformer wraps the vectorized function so it can be used as a
# pipeline step; extra keyword arguments for get_title (e.g. title_pattern)
# can be supplied later through its kw_args parameter.
title_func = FunctionTransformer(func=get_title_vec)

In [None]:
title_func

In [None]:
# Pipeline to get titles, then one-hot-encode.
# drop=["None"] removes the indicator column for names without a title; the
# category is the *string* "None" because the vectorized get_title stringifies
# a missing title (see the note where get_title_vec is defined).
title_pipe = Pipeline(
    steps=[
        ("title_func", title_func),
        ("ohe", OneHotEncoder(drop=["None"], sparse_output=False)),
    ],
)

In [None]:
title_pipe

In [None]:
# ColumnTransformer that one-hot-encodes the title extracted from Name and
# passes Age through unchanged. Output columns follow the listed order: title
# indicators first, then Age. All other columns are dropped.
age_title_trf = ColumnTransformer(
    transformers=[
        ("title_pipe", title_pipe, ["Name"]),
        ("age", "passthrough", ["Age"])
    ],
    remainder="drop",
)

In [None]:
age_title_trf

In [None]:
# Pipeline to impute age given ages of neighbors with given titles.
# Step 1 builds a [title indicators..., Age] matrix; step 2 fills the missing
# Age values with a default-configured KNNImputer over that matrix.
age_pipe = Pipeline(
    steps=[
        ("age_title_trf", age_title_trf),
        ("impute_knn", KNNImputer()),
    ],
)

In [None]:
age_pipe

In [None]:
# Final preprocessing: Fare goes through the scale + impute pipeline, Age is
# imputed with help from the title extracted from Name (age_pipe receives both
# columns and selects them by name internally), and the categorical/ordinal
# columns go through the one-hot + target-encoding union. Everything else is dropped.
col_trf = ColumnTransformer(
    transformers=[
        ("fare", cont_pipe, ["Fare"]),
        ("age", age_pipe, ["Age", "Name"]),
        ("cat_ord", cat_ord, cat_ord_cols),
    ],
    remainder="drop",
)

In [None]:
col_trf

In [None]:
# Full model: column-wise preprocessing followed by logistic regression.
# The pipeline is only constructed here; it is fitted and scored in the
# following cells.
pipe = Pipeline(
    steps=[
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
)

In [None]:
pipe.fit(X_train, y_train)

In [None]:
pipe.score(X_val, y_val)

In [None]:
from copy import deepcopy

# Work on an independent copy so the original transformer keeps its defaults,
# then narrow the copy's pattern to Mr/Mrs only via kw_args.
title_func_ = deepcopy(title_func)
title_func_.set_params(kw_args={"title_pattern": "Mrs?"})
# Titles extracted with the default pattern vs. the narrowed pattern.
_old = title_func.fit_transform(X_train["Name"])
_new = title_func_.fit_transform(X_train["Name"])

In [None]:
np.unique(_old, return_counts=True)

In [None]:
np.unique(_new, return_counts=True)