# Intro

- [Kaggle](https://www.kaggle.com/)
- [Stack Overflow](https://stackoverflow.com/)
- [`scikit-learn`](https://scikit-learn.org/stable/)

# Setup

In [1]:
# %pip install numpy==1.24.3
# %pip install pandas==2.1.4
# %pip install scikit-learn==1.3.2

In [2]:
import numpy as np  # For numerical computation when a dataframe isn't available
import pandas as pd  # For reading/manipulating data
from sklearn.impute import SimpleImputer  # For imputing missing values
from sklearn.linear_model import LogisticRegression  # Simple classifier
from sklearn.model_selection import train_test_split  # Split train data into train/val
from sklearn.preprocessing import MinMaxScaler  # Simple preprocessing step

In [3]:
# Load the Titanic train/test CSVs, indexing rows by PassengerId.
train, test = (
    pd.read_csv(csv_name, index_col="PassengerId")
    for csv_name in ("train.csv", "test.csv")
)

In [4]:
# Pull the target column out of the training frame; every other column is a feature.
y = train["Survived"]
X = train.drop(columns=["Survived"])

In [5]:
# Split into train/val data.
# No test_size/train_size is passed, so scikit-learn's default 25% hold-out applies.
# stratify=y keeps the Survived class ratio the same in both splits and
# random_state=0 makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)

In [6]:
# Keep only the float-typed columns, using the training split to decide which those are.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float, X_val_float = (split[cont_cols] for split in (X_train, X_val))

In [7]:
# Scale data.
# MinMaxScaler disregards NaNs when fitting and passes them through in transform,
# which is why scaling can run before imputation here.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_float)
# Validation data is only transformed, using the min/max learned from the training data.
X_val_scaled = scaler.transform(X_val_float)

# Impute data
# SimpleImputer() defaults to strategy="mean"; the fill values are learned from the
# scaled training data and reused for the validation data.
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_val_imputed = imputer.transform(X_val_scaled)

In [8]:
# Fit the model and score it against the val data.
# For a classifier, `score` returns the mean accuracy on the given data.
clf = LogisticRegression(random_state=0)
clf.fit(X_train_imputed, y_train)
clf.score(X_val_imputed, y_val)

0.6457399103139013

# [`Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# Same scale -> impute -> classify workflow as in Setup, expressed as one estimator.
# fit() fits each step in order on the training data; score() pushes the validation
# data through the fitted steps before scoring, so the accuracy matches the manual version.
pipe = Pipeline(
    steps=[
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
        ("clf", LogisticRegression(random_state=0)),
    ],
)
pipe.fit(X_train_float, y_train)
pipe.score(X_val_float, y_val)

0.6457399103139013

In [11]:
# HTML representation of the pipe.
pipe

In [12]:
# Dictionary mapping of names to steps
pipe.named_steps

{'scale': MinMaxScaler(),
 'impute': SimpleImputer(),
 'clf': LogisticRegression(random_state=0)}

In [13]:
# Steps as a list of tuples
pipe.steps

[('scale', MinMaxScaler()),
 ('impute', SimpleImputer()),
 ('clf', LogisticRegression(random_state=0))]

In [14]:
# Available parameters for all steps in the pipe.
pipe.get_params()

{'memory': None,
 'steps': [('scale', MinMaxScaler()),
  ('impute', SimpleImputer()),
  ('clf', LogisticRegression(random_state=0))],
 'verbose': False,
 'scale': MinMaxScaler(),
 'impute': SimpleImputer(),
 'clf': LogisticRegression(random_state=0),
 'scale__clip': False,
 'scale__copy': True,
 'scale__feature_range': (0, 1),
 'impute__add_indicator': False,
 'impute__copy': True,
 'impute__fill_value': None,
 'impute__keep_empty_features': False,
 'impute__missing_values': nan,
 'impute__strategy': 'mean',
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__l1_ratio': None,
 'clf__max_iter': 100,
 'clf__multi_class': 'auto',
 'clf__n_jobs': None,
 'clf__penalty': 'l2',
 'clf__random_state': 0,
 'clf__solver': 'lbfgs',
 'clf__tol': 0.0001,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [15]:
# Modify parameters of all three steps (MinMaxScaler, SimpleImputer and LogisticRegression)
# using the `<step name>__<parameter>` keys listed by get_params().
pipe.set_params(**{"scale__clip": True, "impute__strategy": "median", "clf__fit_intercept": False})

In [16]:
# Store params in a separate file.
# Written with json.dump instead of a shell `echo`: POSIX shells strip the double
# quotes from the echoed text, which leaves params.json unparseable by json.loads.
import json

with open("params.json", mode="w") as f:
    json.dump(
        {"scale__clip": False, "impute__strategy": "mean", "clf__fit_intercept": True},
        f,
    )

In [17]:
import json


# Read the stored parameters back and apply them to the pipeline.
with open("params.json", "r") as params_file:
    params = json.load(params_file)

pipe.set_params(**params)

# Expanding Feature Space

In [18]:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder

In [19]:
X_train.nunique().sort_values()

Sex           2
Pclass        3
Embarked      3
SibSp         7
Parch         7
Age          82
Cabin       120
Fare        216
Ticket      537
Name        668
dtype: int64

In [20]:
# Columns treated as categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc, X_val_enc = X_train[cat_ord_cols], X_val[cat_ord_cols]

In [21]:
# One-hot-encode categorical/ordinal features.
# - drop="first" removes one indicator column per feature.
# - sparse_output=False returns a dense NumPy array instead of a sparse matrix.
# - max_categories=5 caps the output categories per feature; for features with
#   more unique values (SibSp and Parch have 7 each) the infrequent ones are grouped.
ohe = OneHotEncoder(drop="first", sparse_output=False, max_categories=5)
X_train_ohe = ohe.fit_transform(X_train_enc)
X_val_ohe = ohe.transform(X_val_enc)

In [22]:
# Target encode categorical/ordinal features.
# TargetEncoder.fit_transform uses an internal cross-fitting scheme on the training
# data, so its result differs from fit(...).transform(...); random_state=0 makes the
# fold assignment reproducible.
tgt = TargetEncoder(random_state=0)
X_train_tgt = tgt.fit_transform(X_train_enc, y_train)
# Validation rows are encoded with the encodings learned from the whole training split.
X_val_tgt = tgt.transform(X_val_enc)

In [23]:
# Place the one-hot-encoded columns and the target-encoded columns side by side.
X_train_feat_union = np.concatenate([X_train_ohe, X_train_tgt], axis=1)
X_val_feat_union = np.concatenate([X_val_ohe, X_val_tgt], axis=1)

In [24]:
# Refit the `clf` object created in Setup on the combined encodings (this replaces
# what it learned from the float-only features), then score on the val data.
clf.fit(X_train_feat_union, y_train)
clf.score(X_val_feat_union, y_val)

0.7847533632286996

# [`FeatureUnion`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)

In [25]:
from sklearn.pipeline import FeatureUnion

In [26]:
# FeatureUnion applies every listed transformer to the same input and joins their
# outputs column-wise, in the order listed.
feat_union = FeatureUnion(
    transformer_list=[
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ],
)
# y_train is supplied because the target encoder needs the target to fit.
# `_train` / `_val` are scratch names, only used for the equality checks that follow.
_train = feat_union.fit_transform(X_train_enc, y_train)
_val = feat_union.transform(X_val_enc)

In [27]:
np.allclose(a=X_train_feat_union, b=_train)

True

In [28]:
np.allclose(a=X_val_feat_union, b=_val)

True

In [29]:
# Rebind `pipe` to a new pipeline: the feature union feeds the classifier directly.
# The `feat_union` object itself (not a copy) is the first step, so pipe.fit refits it.
pipe = Pipeline(
    steps=[
        ("feat_union", feat_union),
        ("clf", LogisticRegression(random_state=0)),
    ],
)
pipe.fit(X_train_enc, y_train)
pipe.score(X_val_enc, y_val)

0.7847533632286996

In [30]:
pipe

# Transform Feature Subsets

In [31]:
# Limit to features with dtype float.
cont_cols = X_train.select_dtypes(include="float").columns
X_train_float, X_val_float = X_train[cont_cols], X_val[cont_cols]

# Preprocessing-only pipeline (no final estimator) for the continuous features.
cont_pipe = Pipeline([("scale", MinMaxScaler()), ("impute", SimpleImputer())])
X_train_cont = cont_pipe.fit_transform(X_train_float)
X_val_cont = cont_pipe.transform(X_val_float)

In [32]:
# Limit to categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc, X_val_enc = X_train[cat_ord_cols], X_val[cat_ord_cols]

# Union of two encodings (one-hot and target) of the same categorical/ordinal columns.
feat_union = FeatureUnion(
    [
        ("ohe", OneHotEncoder(drop="first", sparse_output=False, max_categories=5)),
        ("tgt", TargetEncoder(random_state=0)),
    ]
)
X_train_feat_union = feat_union.fit_transform(X_train_enc, y_train)
X_val_feat_union = feat_union.transform(X_val_enc)

In [33]:
# Put the continuous features first, followed by the categorical/ordinal encodings.
X_train_join = np.concatenate([X_train_cont, X_train_feat_union], axis=1)
X_val_join = np.concatenate([X_val_cont, X_val_feat_union], axis=1)

In [34]:
# Refit the same `clf` object on the joined continuous + categorical/ordinal
# features and score it on the val data.
clf.fit(X_train_join, y_train)
clf.score(X_val_join, y_val)

0.8026905829596412

# [`ColumnTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [35]:
from sklearn.compose import ColumnTransformer

In [36]:
# ColumnTransformer routes column subsets of one frame to different transformers:
# `cont_cols` go through `cont_pipe`, `cat_ord_cols` through `feat_union`.
# Outputs are joined in the order listed; remainder="drop" discards all other columns.
col_trf = ColumnTransformer(
    transformers=[
        ("cont", cont_pipe, cont_cols),
        ("cat_ord", feat_union, cat_ord_cols),
    ],
    remainder="drop",
)
# Works on the full X_train / X_val frames; no manual column selection needed.
# `_train` / `_val` are scratch names, only used for the equality checks that follow.
_train = col_trf.fit_transform(X_train, y_train)
_val = col_trf.transform(X_val)

In [37]:
np.allclose(a=X_train_join, b=_train)

True

In [38]:
np.allclose(a=X_val_join, b=_val)

True

In [39]:
# End-to-end model: column-wise preprocessing followed by the classifier,
# fitted and scored directly on the unprocessed X_train / X_val frames.
pipe = Pipeline(
    steps=[
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
)
pipe.fit(X_train, y_train)
pipe.score(X_val, y_val)

0.8026905829596412

In [40]:
pipe

# Custom Transformer

In [41]:
X_train.Name.sample(n=5)

PassengerId
286        Stankovic, Mr. Ivan
518          Ryan, Mr. Patrick
381      Bidois, Miss. Rosalie
230    Lefebre, Miss. Mathilde
17        Rice, Master. Eugene
Name: Name, dtype: object

In [42]:
X_train.Name.str.split(pat=r"[\,\.]?\s", regex=True).sample(n=5)

PassengerId
764       [Carter, Mrs, William, Ernest, (Lucile, Polk)]
532                                  [Toufik, Mr, Nakli]
342                    [Fortune, Miss, Alice, Elizabeth]
174                         [Sivola, Mr, Antti, Wilhelm]
166    [Goldsmith, Master, Frank, John, William, "Fra...
Name: Name, dtype: object

In [43]:
import re

def get_title(
    text: str,
    title_pattern: str = r"Mrs?|Miss|Master",
) -> str | None:
    """Get a passenger's title if present.
    
    If more than one title found, return the least number of characters.
    
    The defalut title_pattern will detect:
    - Mr
    - Mrs
    - Miss
    - Master
    """
    possible_titles: set[str] = set(re.findall(pattern=title_pattern, string=text))
    title: list[str] = sorted(possible_titles, key=len)
    if title:
        return title.pop(0)

In [44]:
# Assert function extracts expected title.
assert get_title("Turpin, Mr. William John Robert") == "Mr"

In [45]:
# Assert function returns None if no title is present.
assert get_title("Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)") is None

In [46]:
# Assert function returns title with least number of characters.
assert get_title("Mr. and Mrs. Smith") == "Mr"

In [47]:
X_train.Name.apply(get_title).value_counts(dropna=False)

Name
Mr        384
Miss      133
Mrs        98
Master     33
None       20
Name: count, dtype: int64

In [48]:
# Add a Title column, extracted from Name, to copies of the train and val frames.
X_train_title = X_train.assign(Title=lambda frame: frame["Name"].apply(get_title))
X_val_title = X_val.assign(Title=lambda frame: frame["Name"].apply(get_title))

In [80]:
from sklearn.impute import KNNImputer

In [81]:
# NOTE(review): this instance is not referenced by the cells that follow here; the
# pipeline defined in the next cell constructs its own KNNImputer(). Confirm whether it is needed.
knn_imputer = KNNImputer()

In [96]:
# Keep Age as-is and add a one-hot encoding of Title, so the KNN imputer below can
# take Title into account when choosing neighbours for a missing Age.
# drop=[None] drops the indicator column for the missing-title category (None).
age_trf = ColumnTransformer(
    transformers=[
        ("age", "passthrough", ["Age"]),
        ("ohe", OneHotEncoder(drop=[None], sparse_output=False), ["Title"]),
    ],
    remainder="drop",
)

# Chain the column preparation with the KNN imputer. set_output(transform="pandas")
# makes the result a DataFrame, so rows can still be looked up by PassengerId.
knn_impute_pipe = Pipeline(
    steps=[
        ("age_trf", age_trf),
        ("knn_impute", KNNImputer()),
    ],
).set_output(transform="pandas")

In [112]:
X_train_title.loc[
    X_train_title.Age.isnull()
    & X_train_title.Title.eq("Master")
]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
177,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S,Master
160,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S,Master
710,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C,Master
66,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C,Master


In [113]:
# Baseline for comparison: mean-impute Age on its own and look at passenger 177,
# one of the "Master" rows with a missing Age listed above.
# NOTE: this rebinds the name `imputer` that the Setup section used for the scaled features.
imputer = SimpleImputer().set_output(transform="pandas")
imputer.fit_transform(X_train_title[["Age"]]).loc[177]

Age    29.796842
Name: 177, dtype: float64

In [114]:
knn_impute_pipe.fit_transform(X_train_title).loc[177]

age__Age             4.8
ohe__Title_Master    1.0
ohe__Title_Miss      0.0
ohe__Title_Mr        0.0
ohe__Title_Mrs       0.0
Name: 177, dtype: float64

In [115]:
pipe

In [60]:
# One-hot-encode title.
# Same column routing as the earlier ColumnTransformer, plus a third branch that
# one-hot-encodes Title; drop=[None] drops the indicator for rows with no title.
col_trf = ColumnTransformer(
    transformers=[
        ("cont", cont_pipe, cont_cols),
        ("cat_ord", feat_union, cat_ord_cols),
        ("title", OneHotEncoder(drop=[None], sparse_output=False), ["Title"])
    ],
    remainder="drop",
)

In [61]:
# Fit and score the full pipeline on the frames that include the Title column.
# NOTE(review): the recorded score below is identical to the score recorded for the
# pipeline without Title. The execution counts in this part of the notebook are also
# out of order, so re-run from a fresh kernel to confirm the number.
pipe = Pipeline(
    steps=[
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ],
)
pipe.fit(X_train_title, y_train)
pipe.score(X_val_title, y_val)

0.8026905829596412

In [62]:
pipe

# [`FunctionTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)

In [None]:
from sklearn.preprocessing import FunctionTransformer