# Getting started with Pipelines

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Any

In [2]:
# Notebook-wide settings, gathered in one place so they are easy to tune.
URL_TO_DATA = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Split sizes: TEST_SIZE is taken from the full data set, VALID_SIZE from what
# remains afterwards (0.25 x 0.8 = 0.2 of the full data).
TEST_SIZE = 0.2
VALID_SIZE = 0.25

# Seed passed to every train_test_split call.
RANDOM_STATE = 42

# Strategy handed to SimpleImputer for the numeric columns.
NUMERIC_TRANSFORMER_REPLACEMENT = "median"

In [3]:
# In case of CERTIFICATE_VERIFY_FAILED run "Install Certificates.command".
# See also https://stackoverflow.com/questions/50236117/scraping-ssl-certificate-verify-failed-error-for-http-en-wikipedia-org
df = pd.read_csv(filepath_or_buffer=URL_TO_DATA, index_col=0)

# Family size = siblings/spouses + parents/children + the passenger themself.
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

# The title is the run of letters directly in front of the first "." in the
# name (e.g. "Anderson, Mr. Harry" -> "Mr"). The pattern is a raw string so
# that "\." is not read as an (invalid) string escape, and expand=False makes
# extract() return a Series rather than a one-column DataFrame.
df["Title"] = df["Name"].str.extract(r"([A-Za-z]+)\.", expand=False)

In [4]:
# Separate the predictors from the target column.
X, y = df.drop(columns=["Survived"]), df["Survived"]

# Step 1: hold out TEST_SIZE of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Step 2: carve the validation set out of the remaining rows.
# VALID_SIZE applies to what is left after step 1: 0.25 x 0.8 = 0.2.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
)

In [5]:
# Only the last expression of a cell is rendered automatically, so the dtypes
# are shown explicitly; otherwise their value is computed and thrown away.
display(X_train.dtypes)
X_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
461,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,1,Mr
302,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q,3,Mr
386,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.5,,S,1,Mr
321,3,"Dennis, Mr. Samuel",male,22.0,0,0,A/5 21172,7.25,,S,1,Mr
346,2,"Brown, Miss. Amelia ""Mildred""",female,24.0,0,0,248733,13.0,F33,S,1,Miss


In [6]:
# Columns routed to the numeric pipeline (imputation + scaling).
numeric_features = ["Age", "Fare"]

# Columns routed to the plain one-hot encoder.
# NOTE(review): "Title" is listed here as well as in corrector_features, so the
# uncorrected title is one-hot encoded next to TitleCorrector's output —
# confirm this is intended.
categorical_features = [
    "Pclass",
    "Sex",
    "SibSp",
    "Parch",
    "Embarked",
    "Title",
    "FamilySize",
]
discretized_features = ["FamilySize"]
corrector_features = ["Title"]

# Bin edges and labels for FamilySize; currently only referenced by the
# commented-out Discretizer. np.inf is used because the np.Inf alias no longer
# exists in NumPy 2.0.
BINS = [0, 1, 2, 4, np.inf]
LABELS = ["ALONE", "SMALL", "MED", "LARGE"]

# Single source of truth for the title clean-up: problematic title -> title to
# use instead. Keeping each pair on one line prevents the two lists below from
# drifting out of alignment.
TITLE_CORRECTIONS = {
    "Mlle": "Miss",
    "Mme": "Miss",
    "Ms": "Miss",
    "Dr": "Mr",
    "Major": "Mr",
    "Lady": "Mrs",
    "Countess": "Mrs",
    "Jonkheer": "Other",
    "Col": "Other",
    "Rev": "Other",
    "Capt": "Mr",
    "Sir": "Mr",
    "Don": "Mr",
}

# Parallel lists in matching order, as expected by TitleCorrector.
KNOWN_PROBLEMS = list(TITLE_CORRECTIONS.keys())
KNOWN_CORRECTIONS = list(TITLE_CORRECTIONS.values())

In [55]:
class TitleCorrector(BaseEstimator, TransformerMixin):
    """
    Replace problematic titles with their corrections and one-hot encode them.

    The one-hot encoder is fitted once, in ``fit``, on the corrected training
    data and is only *applied* in ``transform``. Every data set passed through
    a fitted instance therefore gets the same columns in the same order;
    titles that were not present during fitting are encoded as all zeros
    (``handle_unknown="ignore"``).

    Parameters
    ----------
    known_problems : list
        Values to be replaced.
    known_corrections : list
        Replacement for each entry of ``known_problems``, in matching order.
    """

    def __init__(self, known_problems: List, known_corrections: List):
        self.known_corrections = known_corrections
        self.known_problems = known_problems
        self.enc = OneHotEncoder(handle_unknown="ignore")

    def _correct(self, X):
        """Return a copy of ``X`` with every known problem value replaced."""
        return X.replace(self.known_problems, self.known_corrections)

    def fit(self, X, y=None) -> object:
        """
        Learn the one-hot categories from the corrected version of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored. Returns
        ``self``.
        """
        self.enc.fit(self._correct(X))
        # Make the output names available right after fitting.
        self.feature_names = self.enc.get_feature_names_out()
        return self

    def transform(self, X, one_hot_encode: bool = True) -> object:
        """
        Correct the values of ``X`` and, by default, one-hot encode them.

        With ``one_hot_encode=False`` the corrected data is returned without
        encoding. Otherwise the encoder fitted in ``fit`` is applied; it is
        never refitted here, so the output layout does not depend on which
        categories happen to occur in ``X``.
        """
        X_corrected = self._correct(X)

        if not one_hot_encode:
            # No encoding: the output columns are the input columns.
            self.feature_names = X_corrected.columns.tolist()
            return X_corrected

        self.feature_names = self.enc.get_feature_names_out()
        return self.enc.transform(X_corrected)

    def get_feature_names_out(self, input_features=None) -> list:
        """Return the names of the columns produced by this transformer."""
        return self.feature_names

In [56]:
# class Discretizer(BaseEstimator, TransformerMixin):
#     """
#     Use transformer to discretize numeric data. Interface to pandas:`~pandas.cut`

#     """

#     def __init__(self, bins: Any, labels: Any = None, **kwargs):

#         # self.column = column
#         self.bins = bins
#         self.labels = labels
#         self.kwargs = kwargs
#         self.enc = OneHotEncoder(handle_unknown="ignore")

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         print(str(X.shape))
#         X['test'] = np.where(X['FamilySize'] > 1, "ALONE", "NOT")
#        # print(str(temp.shape))
#         return self.enc.fit_transform([X['test']])

In [57]:
# fmt: off
# Numeric columns: fill missing values with the configured statistic first,
# then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy=NUMERIC_TRANSFORMER_REPLACEMENT)),
        ("scaler",  StandardScaler()),
    ]
)
numeric_transformer
# fmt: on

In [58]:
# fmt: off
# Route each group of columns to its own transformer. The order of the entries
# determines the order of the output columns.
preprocessor = ColumnTransformer(
    [
        # ("discretize", Discretizer(bins=BINS, labels=LABELS), discretized_features),
        (
            "correct",
            TitleCorrector(known_problems=KNOWN_PROBLEMS, known_corrections=KNOWN_CORRECTIONS),
            corrector_features,
        ),
        (
            "num",
            numeric_transformer,
            numeric_features,
        ),
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore"),
            categorical_features,
        ),
    ]
)
preprocessor
# fmt: on

In [61]:
# Wrap the preprocessor on its own so its output can be inspected without a model.
preprocess = Pipeline(steps=[("preprocessor", preprocessor)])

# The pipeline has a single step, so fitting/transforming the pipeline is the
# same as fitting/transforming that step directly.
df_to_inspect = pd.DataFrame.sparse.from_spmatrix(preprocess.fit_transform(X_train))

# Label the columns with the names reported by the fitted preprocessor.
df_to_inspect.columns = preprocess.named_steps["preprocessor"].get_feature_names_out()
# df_to_inspect.head()

['Title_Master' 'Title_Miss' 'Title_Mr' 'Title_Mrs' 'Title_Other']


In [None]:
# Full model: preprocessing followed by a random forest. The forest is seeded
# with RANDOM_STATE so that re-running the notebook reproduces the same model
# and the same scores.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
    ]
)

In [None]:
clf.fit(X_train, y_train)

# Label each score with the data set it was computed on; the two lines were
# previously indistinguishable in the output.
print("test score: %.3f" % clf.score(X_test, y_test))
print("validation score: %.3f" % clf.score(X_val, y_val))

Acknowledgement:
- Gunes Evitan's Kaggle Notebook on [Titanic - Advanced Feature Engineering Tutorial](https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial/notebook)
- Ashwini Swain's Kaggle Notebook on [EDA To Prediction(DieTanic)](https://www.kaggle.com/ash316/eda-to-prediction-dietanic)
- Pedro Morales's sklearn Tutorial on [Column Transformer with Mixed Types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html?highlight=standardscaler)

In [34]:
# Show the output column names reported by the "preprocessor" step.
preprocess.named_steps["preprocessor"].get_feature_names_out()

['Title']


array(['correct__Title', 'num__Age', 'num__Fare', 'onehot__Pclass_1',
       'onehot__Pclass_2', 'onehot__Pclass_3', 'onehot__Sex_female',
       'onehot__Sex_male', 'onehot__SibSp_0', 'onehot__SibSp_1',
       'onehot__SibSp_2', 'onehot__SibSp_3', 'onehot__SibSp_4',
       'onehot__SibSp_5', 'onehot__SibSp_8', 'onehot__Parch_0',
       'onehot__Parch_1', 'onehot__Parch_2', 'onehot__Parch_3',
       'onehot__Parch_4', 'onehot__Parch_5', 'onehot__Parch_6',
       'onehot__Embarked_C', 'onehot__Embarked_Q', 'onehot__Embarked_S',
       'onehot__Embarked_nan', 'onehot__Title_Capt', 'onehot__Title_Col',
       'onehot__Title_Countess', 'onehot__Title_Dr', 'onehot__Title_Lady',
       'onehot__Title_Master', 'onehot__Title_Miss', 'onehot__Title_Mlle',
       'onehot__Title_Mme', 'onehot__Title_Mr', 'onehot__Title_Mrs',
       'onehot__Title_Ms', 'onehot__Title_Rev', 'onehot__FamilySize_1',
       'onehot__FamilySize_2', 'onehot__FamilySize_3',
       'onehot__FamilySize_4', 'onehot__FamilyS

___


In [60]:
# Show the output column names reported by the "preprocessor" step.
# NOTE(review): same expression as other inspection cells in this notebook —
# looks like a leftover scratch cell; confirm whether it is still needed.
preprocess["preprocessor"].get_feature_names_out()

['Title_Master' 'Title_Miss' 'Title_Mr' 'Title_Mrs' 'Title_Other']


array(['correct__Title_Master', 'correct__Title_Miss',
       'correct__Title_Mr', 'correct__Title_Mrs', 'correct__Title_Other',
       'num__Age', 'num__Fare', 'onehot__Pclass_1', 'onehot__Pclass_2',
       'onehot__Pclass_3', 'onehot__Sex_female', 'onehot__Sex_male',
       'onehot__SibSp_0', 'onehot__SibSp_1', 'onehot__SibSp_2',
       'onehot__SibSp_3', 'onehot__SibSp_4', 'onehot__SibSp_5',
       'onehot__SibSp_8', 'onehot__Parch_0', 'onehot__Parch_1',
       'onehot__Parch_2', 'onehot__Parch_3', 'onehot__Parch_4',
       'onehot__Parch_5', 'onehot__Parch_6', 'onehot__Embarked_C',
       'onehot__Embarked_Q', 'onehot__Embarked_S', 'onehot__Embarked_nan',
       'onehot__Title_Capt', 'onehot__Title_Col',
       'onehot__Title_Countess', 'onehot__Title_Dr', 'onehot__Title_Lady',
       'onehot__Title_Master', 'onehot__Title_Miss', 'onehot__Title_Mlle',
       'onehot__Title_Mme', 'onehot__Title_Mr', 'onehot__Title_Mrs',
       'onehot__Title_Ms', 'onehot__Title_Rev', 'onehot__Family

# Show the output column names reported by the "preprocessor" step.
# NOTE(review): same expression as other inspection cells in this notebook —
# looks like a leftover scratch cell; confirm whether it is still needed.
preprocess['preprocessor'].get_feature_names_out()