# Titanic Model V3 - Voting
Build a VotingClassifier ensemble using the best-performing base models from V3-a.

## Section 1: Import Libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime

## Section 2: Load Data and Feature Engineering (reuse V3-a logic)

In [12]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Feature Engineering
def extract_title(df):
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title_Grouped'] = df['Title'].replace({
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare',
        'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare',
        'Rev': 'Rare', 'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    })
    return df

def add_family_features(df):
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    return df

def engineer_features(df):
    df = extract_title(df)
    df = add_family_features(df)
    return df

train = engineer_features(train)
test = engineer_features(test)

# Impute Age
title_age_medians = train.groupby('Title_Grouped')['Age'].median()

def impute_age(df):
    df['Age'] = df.apply(
        lambda row: title_age_medians[row['Title_Grouped']] if pd.isnull(row['Age']) else row['Age'], axis=1
    )
    return df

train = impute_age(train)
test = impute_age(test)

train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Select Features
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Title_Grouped', 'FamilySize', 'IsAlone', 'FarePerPerson'
]

X = train[features]
y = train['Survived']
X_test_final = test[features]

## Section 3: Preprocessing Pipeline (shared by all base models)

In [13]:
numeric_features = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
categorical_features = ['Pclass', 'Sex', 'Title_Grouped', 'IsAlone']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

## Section 4: Define Base Models

In [14]:
clf1 = LogisticRegression(max_iter=1000, random_state=42)
clf2 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
clf3 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

## Section 5: Build Voting Classifier

In [15]:
voting_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('voting', VotingClassifier(estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('xgb', clf3)
    ], voting='soft'))
])

## Section 6: Cross-Validation Accuracy

In [16]:
scores = cross_val_score(voting_clf, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-validation Accuracy: 0.8316 ± 0.0179


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Section 7: Train on Full Data and Predict Test Set

In [17]:
voting_clf.fit(X, y)
y_test_pred = voting_clf.predict(X_test_final)

submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': y_test_pred
})

filename = f"../output/titanic_submission_v3-b_voting_ensemble_2025-07-31.csv"
submission.to_csv(filename, index=False)
print(f"✅ Submission saved as: {filename}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Submission saved as: ../output/titanic_submission_v3-b_voting_ensemble_2025-07-31.csv
