# Getting started with pilines

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Any

In [2]:
from tkinter import NUMERIC


URL_TO_DATA = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
TEST_SIZE = 0.2
VALID_SIZE = 0.25
RANDOM_STATE = 42
NUMERIC_TRANSFORMER_REPLACEMENT = "median"

In [17]:
df = pd.read_csv(filepath_or_buffer=URL_TO_DATA, index_col=0)

df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

In [18]:
y = df["Survived"]
X = df.drop(columns=["Survived"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VALID_SIZE, random_state=RANDOM_STATE
)  # 0.25 x 0.8 = 0.2

In [16]:
X_train.dtypes

X_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
461,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,1,True
302,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q,3,False
386,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.5,,S,1,True
321,3,"Dennis, Mr. Samuel",male,22.0,0,0,A/5 21172,7.25,,S,1,True
346,2,"Brown, Miss. Amelia ""Mildred""",female,24.0,0,0,248733,13.0,F33,S,1,True


In [41]:
numeric_features = ["Age", "Fare"]
categorical_features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
discretized_features = ["FamilySize"]
BINS = [0, 1, 2, 4, np.Inf]

LABLES = ['ALONE', 'SMALL',  'MED', 'LARGE']

In [45]:
class Discretizer(BaseEstimator, TransformerMixin):
    """
    Use transformer to discretize numeric data. Interface to pandas:`~pandas.cut`

    """

    def __init__(self, bins: Any, labels: Any = None, **kwargs):

        #self.column = column
        self.bins = bins
        self.labels = labels
        self.kwargs = kwargs

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.cut(
            x=X, bins=self.bins, labels=self.labels, **self.kwargs
        )



In [43]:
numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy=NUMERIC_TRANSFORMER_REPLACEMENT),
        ),
        ("scaler", StandardScaler()),
    ]
)
numeric_transformer

In [46]:
preprocessor = ColumnTransformer(
    transformers=[
        ("dis", Discretizer(bins = BINS, labels = LABLES), discretized_features),
        ("num", numeric_transformer, numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)
preprocessor

In [9]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)
clf

In [12]:
# Group passengers by ticket number
same_ticket = df_total.groupby("Ticket")["PassengerId"].nunique().reset_index()
same_ticket.rename(columns={"PassengerId": "passenger count"}, inplace=True)

# Create a new variable 'travel size'
travel_group_2ppl = same_ticket[same_ticket["passenger count"] == 2]
travel_group_3ppl = same_ticket[same_ticket["passenger count"] == 3]
travel_group_4ppl = same_ticket[same_ticket["passenger count"] == 4]
travel_group_5ppl = same_ticket[same_ticket["passenger count"] >= 5]

df_total["travel size"] = "a. alone"
df_total.loc[
    df_total["Ticket"].isin(travel_group_2ppl["Ticket"]), "travel size"
] = "b. 2 ppl"
df_total.loc[
    df_total["Ticket"].isin(travel_group_3ppl["Ticket"]), "travel size"
] = "c. 3 ppl"
df_total.loc[
    df_total["Ticket"].isin(travel_group_4ppl["Ticket"]), "travel size"
] = "d. 4 ppl"
df_total.loc[
    df_total["Ticket"].isin(travel_group_5ppl["Ticket"]), "travel size"
] = "e. 5+ ppl"

# Survival rate by travel size
df_total.groupby(["travel size"])["Survived"].describe()

NameError: name 'df_total' is not defined

Acknowledgement:
- Gunes Evitan's Kaggle Nootebook on [Titanic - Advanced Feature Engineering Tutorial](https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial/notebook)
- Petro Morales's sklearn Tutorial on [Column Transformer with Mixed Types](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html?highlight=standardscaler)