# 1. Import Libraries

In [18]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import joblib

# 2. Load Dataset and Data Cleaning

In [2]:
# Fetch seaborn's bundled "titanic" example dataset as a DataFrame
titanic = sns.load_dataset(name='titanic')

In [3]:
# Drop columns that won't be used in the model.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError for columns that have already been removed.
titanic = titanic.drop(columns=['deck', 'embark_town', 'alive'], errors='ignore')

# 3. Split Dataset

In [4]:
# Target is the 'survived' column; every remaining column is a candidate feature
y = titanic['survived']
X = titanic.drop('survived', axis='columns')

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Define Preprocessing Steps

In [6]:
# Numeric columns: fill missing values with the column median, then standardize
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [7]:
# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode. handle_unknown='ignore' lets the encoder accept levels at
# predict time that were not present when it was fitted.
categorical_features = [
    'embarked',
    'sex',
    'class',
    'who',
    'adult_male',
    'sibsp',
    'parch',
]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [8]:
# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 5. Define the Pipeline with Multiple Models

In [9]:
# Chain preprocessing with a classifier step. The 'classifier' step is the slot
# that the parameter grid replaces with each candidate model.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

# 6. Define Hyperparameter Grid and Perform Grid Search

In [10]:
# Define a parameter grid for GridSearchCV.
# Each dict is searched independently: its 'classifier' entry replaces the
# pipeline's classifier step, and the 'classifier__*' keys tune that model.
param_grid = [
    {
        # Seed the forest so cross-validation scores (and therefore the
        # selected model) are reproducible from one run to the next.
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_features': [None, 'sqrt', 'log2']
    },
    {
        'classifier': [SVC()],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__C': [0.1, 1, 10]
    },
    {
        'classifier': [LogisticRegression()],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.1, 1, 10],
        # liblinear is pinned because it supports both the l1 and l2 penalties
        'classifier__solver': ['liblinear']
    }
]

In [11]:
# Exhaustively evaluate every grid entry with 5-fold cross-validation,
# scoring by accuracy and using all available CPU cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# 7. Evaluate the Best Model

In [12]:
# Get the best model from grid search: the full pipeline (preprocessor +
# classifier) configured with the top-scoring parameter combination.
best_model = grid_search.best_estimator_
# Bare expression so the notebook renders the pipeline's repr as the cell output
best_model

In [13]:
# Make predictions on the test set.
# X_test is passed as raw feature columns; best_model is the whole pipeline,
# so its preprocessing step is applied before the classifier predicts.
y_pred = best_model.predict(X_test)

In [15]:
# Compare held-out predictions with the true labels, then report the winning
# parameter combination and the accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Best Model: {grid_search.best_params_}")
print(f"Accuracy: {accuracy * 100:.2f}%")

Best Model: {'classifier': SVC(), 'classifier__C': 1, 'classifier__kernel': 'rbf'}
Accuracy: 82.12%


In [17]:
# Display the DataFrame after the unused columns were dropped
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,True


In [20]:
# Save the model.
# best_model is the complete pipeline (ColumnTransformer with its imputers,
# scaler and one-hot encoder, plus the selected classifier), so this single
# file is everything needed to score raw rows later. The notebook never builds
# separate label encoders, so there is nothing else to persist.
joblib.dump(best_model, 'titanic_model.pkl');