In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw passenger data from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="Titanic-Dataset.csv")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Drop the columns that are not used as model features.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError because the
# columns were already removed by a previous execution.
df = df.drop(columns=["Name", "PassengerId", "Ticket", "Cabin"], errors="ignore")

In [None]:
# Step 1: train/test split.
# Separate the target column from the features, then hold out 20% of the rows
# for testing; random_state pins the shuffle so the split is reproducible.
features = df.drop(columns=["Survived"])
target = df["Survived"]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the first five rows of the training features.
x_train.head(n=5)

In [None]:
# Peek at eight training labels; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
y_train.sample(8, random_state=42)

In [None]:
# Imputation transformer.
# Column 2 (Age, per the step name) is filled with SimpleImputer's default
# strategy; column 6 (Embarked) is filled with its most frequent value.
# All other columns pass through unchanged and are appended AFTER the two
# imputed columns, so the output column order differs from the input order.
age_imputer = SimpleImputer()
embarked_imputer = SimpleImputer(strategy="most_frequent")

trf1 = ColumnTransformer(
    transformers=[
        ("impute_age", age_imputer, [2]),
        ("impute_embarked", embarked_imputer, [6]),
    ],
    remainder="passthrough",
)

In [None]:
# One-hot encoder for the two categorical columns (Sex and Embarked).
#
# This step receives the OUTPUT of trf1, not the original frame. trf1 emits its
# transformed columns first and the passthrough columns afterwards, i.e.
#   [Age, Embarked, <input col 0>, Sex, <input col 3>, <input col 4>, <input col 5>]
# so Embarked is at index 1 and Sex (input index 1) has moved to index 3.
# The previous indices [1, 6] encoded Embarked plus the last passthrough column
# (Fare in the standard Titanic layout -- TODO confirm against the CSV) and left
# Sex as a raw string that was later silently dropped by the scaling step.
trf2 = ColumnTransformer(
    [
        (
            "ohe_sex_embarked",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            [1, 3],
        )
    ],
    remainder="passthrough",
)

In [None]:
# Scaling: min-max scale the first ten columns of the incoming array.
# No `remainder` is given, so any column outside slice(0, 10) is dropped
# (ColumnTransformer's default remainder behaviour).
trf3 = ColumnTransformer(
    transformers=[
        ("scale", MinMaxScaler(), slice(0, 10)),
    ]
)

In [None]:
# Feature selection: keep the five features with the highest chi-squared score
# against the target.
trf4 = SelectKBest(chi2, k=5)

In [None]:
# Final estimator (only instantiated here; it is trained when the pipeline is fit).
# random_state is fixed, matching the seed used for the train/test split, so the
# fitted tree, the reported accuracy and the exported pickle are reproducible
# across runs instead of varying with the tree's internal random tie-breaking.
trf5 = DecisionTreeClassifier(random_state=42)

Create Pipeline

In [None]:
# Chain the five stages with explicit step names.
# NOTE: the next cell rebinds `pipe` via make_pipeline, whose auto-generated
# step names (not these "trfN" names) are what the later cells refer to.
named_stages = [
    ("trf1", trf1),
    ("trf2", trf2),
    ("trf3", trf3),
    ("trf4", trf4),
    ("trf5", trf5),
]
pipe = Pipeline(steps=named_stages)

In [None]:
# Alternate syntax: make_pipeline builds the same chain but names the steps
# automatically (e.g. "columntransformer-1", "decisiontreeclassifier"), which is
# what the exploration and grid-search cells below rely on.
stages = (trf1, trf2, trf3, trf4, trf5)
pipe = make_pipeline(*stages)

In [None]:
# Step 1: imputation -- run the first stage on its own and check the output shape.
x_trf1 = trf1.fit_transform(X=x_train)
print(x_trf1.shape)

In [None]:
# Step 2: one-hot encoding -- feed the imputed array through the encoder stage
# and check how many columns come out.
x_trf2 = trf2.fit_transform(X=x_trf1)
print(x_trf2.shape)

In [None]:
# Step 3: scaling -- feed the encoded array through the scaler stage and check
# the resulting shape.
x_trf3 = trf3.fit_transform(X=x_trf2)
print(x_trf3.shape)

In [None]:
# Display pipeline: render estimators as diagrams when they are shown as cell output.
from sklearn import set_config

set_config(display="diagram")

In [None]:
# Train: fit every stage of the pipeline on the training split. The fitted
# pipeline is the cell's last expression, so it is displayed as output.
pipe.fit(X=x_train, y=y_train)

Explore the Pipeline

In [None]:
# Look inside the fitted pipeline: take the first ColumnTransformer step, pick
# its second fitted transformer (the "impute_embarked" imputer) and show the
# fill value it learned.
first_step = pipe.named_steps["columntransformer-1"]
fitted_embarked_imputer = first_step.transformers_[1][1]
fitted_embarked_imputer.statistics_

In [None]:
# List the auto-generated step names of the pipeline.
step_names = pipe.named_steps.keys()
print(step_names)

In [None]:
# Predict labels for the held-out test features with the fitted pipeline.
y_pred = pipe.predict(X=x_test)

In [None]:
# Display the predicted labels for the test split.
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

GridSearch using Pipeline

In [None]:
# GridSearchCV search space: candidate tree depths 1 through 5, plus None.
# The "decisiontreeclassifier__" prefix routes the parameter to the pipeline
# step of that name.
candidate_depths = list(range(1, 6)) + [None]
params = {"decisiontreeclassifier__max_depth": candidate_depths}

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over `params`, using the whole pipeline as the
# estimator and accuracy as the selection metric.
search_settings = {
    "estimator": pipe,
    "param_grid": params,
    "cv": 5,
    "scoring": "accuracy",
}
grid = GridSearchCV(**search_settings)

grid.fit(x_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
search_summary = (
    ("Best Parameters:", grid.best_params_),
    ("Best Score:", grid.best_score_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Sanity-check the dimensions of the training features and labels.
for split in (x_train, y_train):
    print(split.shape)

In [None]:
# List every parameter name the pipeline exposes (usable as param_grid keys).
pipeline_params = pipe.get_params()
print(pipeline_params.keys())

Exporting the Pipeline

In [None]:
import pickle

# Serialize the fitted pipeline to "pipe.pkl". The context manager guarantees
# the file handle is flushed and closed even if pickling raises; the previous
# bare open() call left the handle to be closed whenever it was garbage collected.
# NOTE: this exports `pipe` as fitted earlier in the notebook, not the tuned
# estimator found by the grid search.
with open("pipe.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)