In [42]:
import numpy as np
import pandas as pd


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [44]:
# Load the raw training table; the path is relative to the notebook's folder.
df = pd.read_csv(filepath_or_buffer="../dataset/train.csv")

In [45]:
# Eyeball ten randomly drawn rows of the freshly loaded frame.
df.sample(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
645,646,1,1,"Harper, Mr. Henry Sleeper",male,48.0,1,0,PC 17572,76.7292,D33,C
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
343,344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,244361,13.0,,S
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0,,C
547,548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0,,S
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S


In [46]:
# Discard the columns that are not used as model features downstream.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell idempotent: running it again after the columns are already gone is
# a no-op instead of a KeyError. `axis` is omitted because `columns=` already
# says which axis the labels belong to.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [47]:
# Spot-check a single random row to confirm which columns remain.
df.sample(n=1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
316,1,2,female,24.0,1,0,26.0,S


In [48]:
# Separate the target column from the predictors.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [49]:
# Hold out 30% of the rows for evaluation. The split is seeded so that
# "Restart & Run All" reproduces the same train/test partition (and therefore
# the same fitted pipeline inputs) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [50]:
# Show one random training row; its column order is what the positional
# indices in the transformers below refer to.
X_train.sample(n=1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
143,3,male,19.0,0,0,6.75,Q


In [51]:
# Step 1 - fill in missing values. Positions refer to the columns of X_train:
# index 2 is Age (mean imputation) and index 6 is Embarked (most frequent value).
# The output is reordered: the imputed columns come first, in the order listed
# here, and every untouched column is appended after them via the passthrough.
trf1 = ColumnTransformer(
    transformers=[
        ("impute_age", SimpleImputer(strategy="mean"), [2]),
        ("impute_embarked", SimpleImputer(strategy="most_frequent"), [6]),
    ],
    remainder="passthrough",
)

In [52]:
# Step 2 - one-hot encode the categorical columns.
# trf1 reorders its output to [Age, Embarked] followed by the passthrough
# remainder [Pclass, Sex, SibSp, Parch, Fare]. The categorical columns are
# therefore at positions 1 (Embarked) and 3 (Sex). Position 6 is Fare, a
# continuous value that must not be one-hot encoded.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    _ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older releases only accept the (later removed) `sparse` keyword.
    _ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

trf2 = ColumnTransformer(
    [
        ("ohe_sex_embarked", _ohe, [1, 3])
    ], remainder="passthrough"
)

In [53]:
# Step 3 - rescale the first ten columns of the incoming matrix to [0, 1].
# NOTE(review): remainder="drop" is ColumnTransformer's default and is spelled
# out here on purpose - any column at position 10 or beyond is silently
# discarded, so the hard-coded slice must match the width produced by trf2.
trf3 = ColumnTransformer(
    transformers=[("scale", MinMaxScaler(), slice(0, 10))],
    remainder="drop",
)

In [54]:
# Step 4 - keep the eight features with the highest chi-squared score
# against the target.
trf4 = SelectKBest(chi2, k=8)

In [55]:
# Step 5 - the final estimator. The tree is seeded so that repeated fits on
# the same data yield the same model instead of varying from run to run.
trf5 = DecisionTreeClassifier(random_state=42)

In [56]:
# Chain the stages in execution order:
# impute -> one-hot encode -> scale -> select features -> classify.
pipe = Pipeline(
    steps=[
        ("trf1", trf1),
        ("trf2", trf2),
        ("trf3", trf3),
        ("trf4", trf4),
        ("trf5", trf5),
    ]
)

In [57]:
# Fit every preprocessing stage and the classifier on the training split only.
pipe.fit(X_train, y=y_train)

