In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [4]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
x=df[['pclass','sex','age','fare','embarked']]
y=df['survived']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
# create a list of models
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
]

best_model = None
best_accuracy=0.0

# iterate over the models and evaluate their performance
for name, model in models:
  # create a pipeline for each model
  pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('model', model)
  ])
  # Perform cross-validation
  scores = cross_val_score(pipeline, x_train, y_train, cv=5)
  # calculate mean accuracy
  mean_accuracy=scores.mean()
  # Fit the pipeline on the training data
  pipeline.fit(x_train, y_train)
  # Predict the target variable on the test data
  y_pred = pipeline.predict(x_test)

  # calculate accuracy score
  accuracy=accuracy_score(y_test, y_pred)

  # Print performance metrics
  print("Model: ",name)
  print("Cross-validation Accuracy: ",mean_accuracy)
  print("Test Accuracy: ",accuracy)
  print()

  # Check if the current model has the best accuracy
  if accuracy>best_accuracy:
    best_accuracy=accuracy
    best_model=pipeline

# Print the best model
print("Best Model: ",best_model)

Model:  Random Forest
Cross-validation Accuracy:  0.7991529597163399
Test Accuracy:  0.8379888268156425

Model:  Gradient Boosting
Cross-validation Accuracy:  0.8061952132374668
Test Accuracy:  0.7988826815642458

Model:  XGBoost
Cross-validation Accuracy:  0.8076233625529401
Test Accuracy:  0.7932960893854749

Best Model:  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', RandomForestClassifier(random_state=42))])
