# Intro

- [Kaggle](https://www.kaggle.com/)
- [Stack Overflow](https://stackoverflow.com/)
- [`scikit-learn`](https://scikit-learn.org/stable/)

# Setup

In [1]:
# Pinned dependencies for this notebook. Uncomment to install; `%pip` (rather than
# `!pip`) installs into the environment of the running kernel.
# %pip install numpy==1.24.3
# %pip install pandas==2.1.4
# %pip install scikit-learn==1.3.2

In [2]:
import numpy as np  # For numerical computation when a dataframe isn't available
import pandas as pd  # For reading/manipulating data
from sklearn.impute import SimpleImputer  # For imputing missing values
from sklearn.linear_model import LogisticRegression  # Simple classifier
from sklearn.model_selection import train_test_split  # Split train data into train/val
from sklearn.preprocessing import MinMaxScaler  # Simple preprocessing step

In [3]:
# Load the Titanic train/test CSVs, indexed by passenger id.
train, test = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("train.csv", "test.csv")
)

In [4]:
# The "Survived" column is the target; every other column is a feature.
y = train["Survived"]
X = train.drop(columns="Survived")

In [5]:
# Hold out a validation split, stratified on the target so both splits keep the
# same class balance; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

In [6]:
# Keep only the float-typed features; the validation split reuses the exact
# column set selected from the training split.
X_train_float = X_train.select_dtypes(include="float")
X_val_float = X_val.loc[:, X_train_float.columns]

In [7]:
# Fit the scaler on the training floats only, then apply it to both splits.
scaler = MinMaxScaler().fit(X_train_float)
X_train_scaled = scaler.transform(X_train_float)
X_val_scaled = scaler.transform(X_val_float)

# Impute missing values using statistics learned from the scaled training split.
imputer = SimpleImputer().fit(X_train_scaled)
X_train_imputed = imputer.transform(X_train_scaled)
X_val_imputed = imputer.transform(X_val_scaled)

In [8]:
# Train the baseline classifier on the preprocessed training split, then report
# its score on the validation split.
clf = LogisticRegression(random_state=0).fit(X_train_imputed, y_train)
clf.score(X_val_imputed, y_val)

0.6457399103139013

# [`Pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# Same scale -> impute -> classify flow as the manual steps, bundled into a single
# estimator that is fit on the raw float features.
pipe = Pipeline(
    [
        ("scale", MinMaxScaler()),
        ("impute", SimpleImputer()),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train_float, y_train)
pipe.score(X_val_float, y_val)

0.6457399103139013

In [11]:
# Display the pipeline; as the last expression of the cell it is rendered with its
# HTML representation.
pipe

In [12]:
# Look up steps by name: a mapping of step name -> estimator.
pipe.named_steps

{'scale': MinMaxScaler(),
 'impute': SimpleImputer(),
 'clf': LogisticRegression(random_state=0)}

In [13]:
# The same steps as an ordered list of (name, estimator) tuples.
pipe.steps

[('scale', MinMaxScaler()),
 ('impute', SimpleImputer()),
 ('clf', LogisticRegression(random_state=0))]

In [14]:
# All parameters of the pipeline and of every step. Step parameters are addressed
# as `<step name>__<parameter>` (e.g. `scale__clip`, `clf__C`).
pipe.get_params()

{'memory': None,
 'steps': [('scale', MinMaxScaler()),
  ('impute', SimpleImputer()),
  ('clf', LogisticRegression(random_state=0))],
 'verbose': False,
 'scale': MinMaxScaler(),
 'impute': SimpleImputer(),
 'clf': LogisticRegression(random_state=0),
 'scale__clip': False,
 'scale__copy': True,
 'scale__feature_range': (0, 1),
 'impute__add_indicator': False,
 'impute__copy': True,
 'impute__fill_value': None,
 'impute__keep_empty_features': False,
 'impute__missing_values': nan,
 'impute__strategy': 'mean',
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__l1_ratio': None,
 'clf__max_iter': 100,
 'clf__multi_class': 'auto',
 'clf__n_jobs': None,
 'clf__penalty': 'l2',
 'clf__random_state': 0,
 'clf__solver': 'lbfgs',
 'clf__tol': 0.0001,
 'clf__verbose': 0,
 'clf__warm_start': False}

In [15]:
# Modify parameters of all three steps (MinMaxScaler, SimpleImputer and
# LogisticRegression) using the `<step name>__<parameter>` keys.
pipe.set_params(**{"scale__clip": True, "impute__strategy": "median", "clf__fit_intercept": False})

In [16]:
# Store params in a separate file.
# Written with `json.dump` instead of a shell `!echo`: IPython treats `{...}` on
# shell lines as Python interpolation, and POSIX shells strip the double quotes,
# so the echoed text is not reliably valid JSON across platforms.
import json

with open("params.json", mode="w") as f:
    json.dump(
        {"scale__clip": False, "impute__strategy": "mean", "clf__fit_intercept": True},
        f,
    )

In [17]:
import json

# Read the stored parameters back and apply them to the pipeline.
with open("params.json") as f:
    params = json.load(f)

pipe.set_params(**params)

# Expanding Feature Space

In [18]:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder

In [19]:
# Count distinct values per feature (ascending) to see which columns are
# low-cardinality enough to encode.
X_train.nunique().sort_values()

Sex           2
Pclass        3
Embarked      3
SibSp         7
Parch         7
Age          82
Cabin       120
Fare        216
Ticket      537
Name        668
dtype: int64

In [20]:
# One-hot-encode the low-cardinality categorical/ordinal features.
cat_ord_cols = ["Sex", "Pclass", "Embarked", "SibSp", "Parch"]
X_train_enc, X_val_enc = X_train[cat_ord_cols], X_val[cat_ord_cols]

# Learn the categories from the training split only, then encode both splits.
ohe = OneHotEncoder(max_categories=5, drop="first", sparse_output=False).fit(X_train_enc)
X_train_ohe = ohe.transform(X_train_enc)
X_val_ohe = ohe.transform(X_val_enc)

In [23]:
# Target encode categorical/ordinal features
# NOTE: keep `fit_transform` on the training split. Per the scikit-learn docs,
# TargetEncoder.fit_transform(X, y) uses cross fitting and is not equivalent to
# fit(X, y).transform(X); plain `transform` is used only for the validation split.
tgt = TargetEncoder(random_state=0)
X_train_tgt = tgt.fit_transform(X_train_enc, y_train)
X_val_tgt = tgt.transform(X_val_enc)

In [24]:
# Place the one-hot columns and the target-encoded columns side by side
# (column-wise concatenation, one-hot block first).
X_train_feat_union = np.concatenate((X_train_ohe, X_train_tgt), axis=1)
X_val_feat_union = np.concatenate((X_val_ohe, X_val_tgt), axis=1)

In [25]:
# Refit the existing classifier on the combined encodings and score it on validation.
clf.fit(X_train_feat_union, y_train).score(X_val_feat_union, y_val)

0.7847533632286996

# [`FeatureUnion`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)

In [26]:
from sklearn.pipeline import FeatureUnion

In [None]:
# Declarative version of the manual stacking: each transformer sees the same input
# and the outputs are concatenated in list order (one-hot first, then target).
feat_union = FeatureUnion(
    [
        ("ohe", OneHotEncoder(max_categories=5, drop="first", sparse_output=False)),
        ("tgt", TargetEncoder(random_state=0)),
    ]
)

In [38]:
# Shape of the FeatureUnion output on the training split.
feat_union.fit_transform(X_train_enc, y_train).shape

(668, 19)

In [33]:
# Shape of the manually stacked array, for comparison with the FeatureUnion output.
X_train_feat_union.shape

(668, 19)

In [37]:
# First row of the manually stacked features.
X_train_feat_union[0]

array([1.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17978439,
       0.62047407, 0.33518245, 0.52637007, 0.34886092])

In [39]:
# First row produced by FeatureUnion; it should match the manually stacked row.
feat_union.fit_transform(X_train_enc, y_train)[0]

array([1.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17978439,
       0.62047407, 0.33518245, 0.52637007, 0.34886092])

In [40]:
# Chain the feature union with the classifier so encoding and fitting happen in
# one estimator.
pipe = Pipeline(
    [
        ("feat_union", feat_union),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train_enc, y_train)
pipe.score(X_val_enc, y_val)

0.7847533632286996

In [41]:
# Display the pipeline (feature union feeding the classifier).
pipe

# Transform Feature Subsets

In [32]:
from sklearn.feature_selection import f_classif

# NOTE(review): the array-style indexing below (`X_train[:, idx]`) expects NumPy
# arrays, but the last visible assignment of X_train / X_val (the train/val split
# cell) produces DataFrames. The cell that rebuilt them for this section appears to
# be missing; restore it above so that Restart & Run All works.

# Separate significant from non-significant features (ANOVA F-test, p < 0.05).
f_vals, p_vals = f_classif(X_train, y_train)
sig_feats = np.nonzero(p_vals < 0.05)[0]
# Sort explicitly: set iteration order is not guaranteed, and this index list is
# reused later as a column specification.
non_sig_feats = sorted(set(range(X_train.shape[1])).difference(sig_feats))
X_train_sig = X_train[:, sig_feats]
X_train_nonsig = X_train[:, non_sig_feats]
X_val_sig = X_val[:, sig_feats]
X_val_nonsig = X_val[:, non_sig_feats]

In [33]:
from sklearn.decomposition import PCA

# Reduce the non-significant feature space.
# Per the scikit-learn docs, a float `n_components` asks PCA to keep as many
# components as needed to reach that fraction of explained variance.
pca = PCA(n_components=0.90, random_state=0)
X_train_nonsig_pca = pca.fit_transform(X_train_nonsig)
X_val_nonsig_pca = pca.transform(X_val_nonsig)

In [62]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder

In [30]:
# Cluster the training samples into four groups, then label the validation samples
# with the fitted centroids. Labels are kept as single-column 2-D arrays so they
# can be encoded and stacked with other features.
kmeans = KMeans(n_clusters=4, n_init="auto", random_state=0)
X_train_kmeans = kmeans.fit_predict(X_train)[:, np.newaxis]
X_val_kmeans = kmeans.predict(X_val)[:, np.newaxis]

In [31]:
# Turn the cluster labels into indicator columns, dropping the first category.
ohe = OneHotEncoder(sparse_output=False, drop="first").fit(X_train_kmeans)
X_train_kmeans_ohe = ohe.transform(X_train_kmeans)
X_val_kmeans_ohe = ohe.transform(X_val_kmeans)

In [34]:
# Column-wise concatenation: significant features, then PCA components of the
# non-significant features, then the one-hot cluster indicators.
X_train_feat_union = np.concatenate(
    (X_train_sig, X_train_nonsig_pca, X_train_kmeans_ohe), axis=1
)
X_val_feat_union = np.concatenate(
    (X_val_sig, X_val_nonsig_pca, X_val_kmeans_ohe), axis=1
)

In [35]:
# Refit the classifier on the stacked feature subsets and score it on validation.
clf.fit(X_train_feat_union, y_train).score(X_val_feat_union, y_val)

0.96

# [`ColumnTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [37]:
from sklearn.compose import ColumnTransformer

In [None]:
# Sketch of the column-wise split: pass the significant columns through untouched
# and run PCA on the remaining ones. The object is only displayed, not assigned.
ColumnTransformer(
    transformers=[
        ("sig_feats", "passthrough", sig_feats),
        ("pca", PCA(n_components=0.90, random_state=0), non_sig_feats),
    ],
)

In [58]:
# Append the cluster label as the last column so the transformer can address it by
# position (-1).
X_train_and_kmeans = np.concatenate((X_train, X_train_kmeans), axis=1)
X_val_and_kmeans = np.concatenate((X_val, X_val_kmeans), axis=1)

# One-hot-encode the cluster column, run PCA on the non-significant columns, and
# pass every other column through unchanged.
col_trf = ColumnTransformer(
    [
        ("ohe", OneHotEncoder(drop="first", sparse_output=False), [-1]),
        ("pca", PCA(n_components=0.90, random_state=0), non_sig_feats),
    ],
    remainder="passthrough",
)
col_trf.fit_transform(X_train_and_kmeans)

array([[ 0.        ,  0.        ,  1.        , ..., -1.79716462,
         0.34907941, -2.5249973 ],
       [ 0.        ,  1.        ,  0.        , ...,  0.33003511,
        -1.15952991, -0.65959457],
       [ 1.        ,  0.        ,  0.        , ...,  0.79667211,
         1.37963563,  0.17920256],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.39039269,
        -1.17912247, -1.48998451],
       [ 0.        ,  0.        ,  0.        , ...,  1.26056885,
        -2.4424848 , -3.0576637 ],
       [ 0.        ,  0.        ,  1.        , ...,  0.07912172,
        -0.18646711, -0.95821122]])

In [59]:
# Chain the column transformer with the classifier and evaluate on the validation
# split (both inputs carry the appended cluster column).
pipe = Pipeline(
    [
        ("col_trf", col_trf),
        ("clf", LogisticRegression(random_state=0)),
    ]
).fit(X_train_and_kmeans, y_train)
pipe.score(X_val_and_kmeans, y_val)

0.96

In [60]:
# Display the pipeline (column transformer feeding the classifier).
pipe

# [`FunctionTransformer`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)

In [61]:
from sklearn.preprocessing import FunctionTransformer

In [65]:
# Wrap the predict method of the KMeans model fitted earlier so that cluster
# assignment can be used through the transformer interface.
func_trf = FunctionTransformer(func=kmeans.predict)

In [66]:
# Cluster labels for the training split. The result is 1-D, unlike the reshaped
# single-column arrays built when clustering manually.
func_trf.fit_transform(X_train)

array([3, 2, 1, 3, 2, 0, 0, 3, 2, 1, 2, 1, 3, 2, 1, 2, 3, 1, 2, 2, 3, 0,
       3, 1, 1, 2, 1, 3, 3, 1, 1, 3, 3, 0, 3, 3, 0, 2, 2, 2, 1, 0, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 0, 1, 2, 3, 3, 2, 0, 1, 3, 2, 1, 3, 2, 1, 1,
       3, 3, 3, 3, 0, 1, 0, 0, 3])

In [67]:
# Cluster labels for the validation split, from the same wrapped predict.
func_trf.transform(X_val)

array([3, 3, 3, 0, 2, 3, 2, 1, 3, 2, 3, 0, 3, 0, 2, 2, 1, 2, 2, 0, 2, 0,
       1, 1, 1])