# Titanic Model V3 - Stacking
Build a stacking-classifier ensemble that combines Logistic Regression, Random Forest, and XGBoost base models through a Logistic Regression meta-model.

## Section 1: Import Libraries

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime

## Section 2: Load Data and Feature Engineering (reuse V3 logic)

In [11]:
# Read the train and test CSVs (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

# Feature Engineering
def extract_title(df):
    """Add ``Title`` and ``Title_Grouped`` columns derived from ``Name``.

    ``Title`` is the first run of ASCII letters in ``Name`` that is preceded
    by a space and followed by a period. ``Title_Grouped`` rewrites 'Mlle'
    and 'Ms' to 'Miss', 'Mme' to 'Mrs', and the listed uncommon honorifics to
    'Rare'; any title that is not listed is kept unchanged.

    The frame is modified in place and the same object is returned.
    """
    female_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    rare_titles = (
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
        'Rev', 'Sir', 'Jonkheer', 'Dona',
    )
    title_map = {**female_aliases, **dict.fromkeys(rare_titles, 'Rare')}

    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = titles
    df['Title_Grouped'] = titles.replace(title_map)
    return df

def add_family_features(df):
    """Add ``FamilySize``, ``IsAlone`` and ``FarePerPerson`` columns to ``df``.

    * ``FamilySize``    -- ``SibSp + Parch + 1`` (the passenger counts as one).
    * ``IsAlone``       -- 1 when ``FamilySize`` equals 1, otherwise 0.
    * ``FarePerPerson`` -- ``Fare`` divided by ``FamilySize``; a missing
      ``Fare`` therefore yields a missing ``FarePerPerson``.

    The frame is modified in place and the same object is returned.
    """
    family_size = df['SibSp'].add(df['Parch']).add(1)
    df['FamilySize'] = family_size
    df['IsAlone'] = family_size.eq(1).astype(int)
    df['FarePerPerson'] = df['Fare'].div(family_size)
    return df

def engineer_features(df):
    """Apply title extraction, then the family-size features, and return the frame."""
    return add_family_features(extract_title(df))

train, test = engineer_features(train), engineer_features(test)

# Impute Age
# Median age per grouped title, computed from the training rows only.
title_age_medians = train['Age'].groupby(train['Title_Grouped']).median()

def impute_age(df, medians=None):
    """Fill missing ``Age`` values with the median age of the row's title group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age`` and ``Title_Grouped`` columns. Modified in place.
    medians : pandas.Series or mapping, optional
        Median age keyed by ``Title_Grouped`` value. Defaults to the
        notebook-level ``title_age_medians``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Age`` filled where a median is known.

    Notes
    -----
    A row whose title has no entry in ``medians`` (a title not seen in the
    training data, or a missing title) keeps ``Age`` as NaN instead of
    raising ``KeyError``. The numeric branch of the preprocessing pipeline
    median-imputes ``Age``, so such rows are still handled downstream.
    """
    if medians is None:
        medians = title_age_medians

    # Vectorised lookup: unmatched titles map to NaN rather than raising,
    # and an empty frame is handled without special-casing.
    age_by_title = df['Title_Grouped'].map(medians)
    df['Age'] = df['Age'].fillna(age_by_title)
    return df

train = impute_age(train)
test = impute_age(test)

# Embarked is filled with the most frequent training value. It is not part of
# `features` below, so this does not affect the model inputs.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

# Fill missing test fares with the *training* median, in line with Age, which
# is also imputed from training statistics. FarePerPerson was derived inside
# engineer_features() before this fill, so it is still NaN wherever Fare was
# missing; rebuild it from the filled Fare so the two columns agree.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())
test['FarePerPerson'] = test['Fare'] / test['FamilySize']

# Select Features
# NOTE(review): 'SibSp' and 'Parch' are selected here but are not listed in
# numeric_features / categorical_features in the preprocessing cell, so the
# ColumnTransformer (default remainder='drop') discards them -- confirm this
# is intentional (FamilySize already combines both).
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Title_Grouped', 'FamilySize', 'IsAlone', 'FarePerPerson'
]

X = train[features]
y = train['Survived']
X_test_final = test[features]

## Section 3: Preprocessing Pipeline

In [12]:
# Columns routed to each branch of the preprocessor.
numeric_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
categorical_features = ['Pclass', 'Sex', 'Title_Grouped', 'IsAlone']

# Numeric branch: median-impute any remaining gaps, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on a category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Each entry is (name, transformer, columns it is applied to).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## Section 4: Define and Build StackingClassifier Pipeline

In [16]:
# Full model: shared preprocessing followed by a stacked ensemble whose base
# learners (logistic regression, random forest, XGBoost) feed a logistic
# regression meta-learner. Every estimator is seeded with random_state=42.
stacking_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (
        'stacking',
        StackingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=42, max_iter=1000)),
                ('rf', RandomForestClassifier(
                    random_state=42,
                    n_estimators=100,
                    max_depth=5,
                )),
                ('xgb', XGBClassifier(
                    random_state=42,
                    n_estimators=100,
                    max_depth=3,
                    learning_rate=0.1,
                    eval_metric='logloss',
                )),
            ],
            final_estimator=LogisticRegression(random_state=42, max_iter=1000),
            n_jobs=-1,
            cv=5,
        ),
    ),
])

## Section 5: Cross-Validation Accuracy

In [17]:
# 5-fold cross-validated accuracy of the whole pipeline (preprocessing included).
scores = cross_val_score(
    estimator=stacking_clf,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)
print(f"Cross-validation Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

Cross-validation Accuracy: 0.8271 ± 0.0243


## Section 6: Train on Full Data and Predict Test Set

In [19]:
# Refit the full pipeline on every training row, then predict the test set.
stacking_clf.fit(X, y)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': stacking_clf.predict(X_test_final)
})
# Stamp the file with the actual run date. The previous f-string had no
# placeholder and a hard-coded date, so every re-run claimed the same day.
filename = f"../output/titanic_submission_v3-c_stacking_ensemble_{datetime.now():%Y-%m-%d}.csv"
submission.to_csv(filename, index=False)
print(f"✅ Submission saved as: {filename}")

✅ Submission saved as: ../output/titanic_submission_v3-c_stacking_ensemble_2025-07-31.csv
