# **Selecting best model in Pipeline**

To select the best model when using multiple models in a pipeline, you can use techniques like cross-validation and evaluation metrics to compare their performance. Here's an example of how to accomplish this on the Titanic dataset:


In [4]:
# import all the necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# load the data 
titanic_data = sns.load_dataset('titanic')

# selecting the features and targets or labels from the dataset
X = titanic_data[['pclass', 'sex', 'age', 'fare', 'embarked']]       # These are the features we are going to use to predict the target 
y = titanic_data['survived']                                         # This is the target we are going to predict 

# spliting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42)

# creating the list of models which we want to use to predict the target 
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('SVM', SVC(random_state=42))
]

best_model = None 
best_accuracy = 0

# creating the pipeline for all the models by using for loop
for name, model in models:
    # creating the pipeline for each model:
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # cross validation score for each model
    score = cross_val_score(pipeline, X_train , y_train, cv=5, scoring='accuracy')

    # mean of the cross validation score
    mean_score = score.mean()

    # fit the model 
    pipeline.fit(X_train, y_train)

    # predict the target
    y_pred = pipeline.predict(X_test)

    # accuracy of the model 
    accuracy = accuracy_score(y_test, y_pred)

    print('Model:', name)
    print('Cross Validation Score:', mean_score)
    print('Accuracy:', accuracy)
    print()
    # if the accuracy of the model is greater than the best accuracy then update the best accuracy and best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

    # print the best model 
print('Best Model:', best_model)  

Model: Random Forest
Cross Validation Score: 0.7991529597163399
Accuracy: 0.8379888268156425

Model: Gradient Boosting
Cross Validation Score: 0.8061952132374668
Accuracy: 0.7988826815642458

Model: XGBoost
Cross Validation Score: 0.8076233625529401
Accuracy: 0.7932960893854749

Model: Logistic Regression
Cross Validation Score: 0.7977839062346105
Accuracy: 0.8100558659217877

Model: SVM
Cross Validation Score: 0.8160248202501723
Accuracy: 0.8044692737430168

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
