# Load Processed Data

In [None]:
# Titanic Survival Prediction - Voting Ensemble with Feature Selection

# 1. Import libraries
import os
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

# 2. Load data
# Both processed CSVs are read relative to the current working directory.
train_csv_path = "./data/train_processed.csv"
test_csv_path = "./data/test_processed.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# 3. Define features and target
# The label column is pulled out as `y`; every remaining column becomes a feature.
target_col = 'Survived'
y = train[target_col]
X = train.drop(columns=[target_col])  # returns a new frame; `train` is not modified

# 4. Encode categorical variables
# Each encoder is fitted on the training column only. The test column is then
# mapped through the learned label -> code table rather than `le.transform`,
# because `transform` raises ValueError on any label it did not see during fit.
# Labels that occur only in the test set (or are missing) are encoded as -1;
# labels known from training get exactly the code `le.transform` would assign.
for col in ['Sex', 'Title']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    # `classes_` is sorted, and a label's position in it is its integer code.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].map(code_by_label).fillna(-1).astype('int64')

# 5. Split training and validation sets
# 80/20 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Feature selection using SelectFromModel
# A random forest ranks the features; with threshold='mean' only features whose
# importance reaches the mean importance are kept.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='mean'
)
selector.fit(X_train, y_train)  # fitted on the training split only

X_train_selected = selector.transform(X_train)
X_val_selected = selector.transform(X_val)
# Align the test frame to the training columns (same set, same order) before
# transforming, so extra columns in `test` or a different column order cannot
# break or silently misalign the selection. A missing column raises KeyError.
X_test_selected = selector.transform(test[X_train.columns])

selected_features = X_train.columns[selector.get_support()]
print("Selected features:", list(selected_features))

# 7. Hyperparameter tuning - RandomForest
# Exhaustive search over 2 x 2 x 2 = 8 combinations, 5-fold CV, scored by accuracy.
rf_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
}
rf_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train_selected, y_train)  # fit() returns the fitted search object itself
best_rf = rf_search.best_estimator_
print("Best RF params:", rf_search.best_params_)

# 8. Hyperparameter tuning - GradientBoosting
# Exhaustive search over 2 x 2 x 2 = 8 combinations, 5-fold CV, scored by accuracy.
gb_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}
gb_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train_selected, y_train)  # fit() returns the fitted search object itself
best_gb = gb_search.best_estimator_
print("Best GB params:", gb_search.best_params_)

# 9. Hyperparameter tuning - XGBoost
# Exhaustive search over 2 x 2 x 2 = 8 combinations, 5-fold CV, scored by accuracy.
xgb_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
}
xgb_base = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=xgb_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train_selected, y_train)  # fit() returns the fitted search object itself
best_xgb = xgb_search.best_estimator_
print("Best XGB params:", xgb_search.best_params_)

# 10. VotingClassifier (soft voting)
# Soft voting combines the predicted class probabilities of the three tuned models.
ensemble_members = [
    ('rf', best_rf),
    ('gb', best_gb),
    ('xgb', best_xgb),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train_selected, y_train)

# 11. Evaluate on validation set
# The hold-out split was not used for feature selection, tuning or fitting above.
val_preds = voting_clf.predict(X_val_selected)
acc = accuracy_score(y_true=y_val, y_pred=val_preds)
print("Validation accuracy:", round(acc, 4))

# 12. Predict on test set and save submission
test_preds = voting_clf.predict(X_test_selected)

# Prefer the real ids when the processed test file still carries them;
# otherwise fall back to the consecutive range starting at 892
# (presumably the first id of the Kaggle Titanic test set - confirm).
if 'PassengerId' in test.columns:
    passenger_ids = test['PassengerId'].to_numpy()
else:
    passenger_ids = np.arange(892, 892 + len(test_preds))

submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_preds
})
submission_path = "./submissions/submission_v6_feature_selected2.csv"
# to_csv does not create missing directories, so make sure the folder exists first.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print("Submission saved to", submission_path)


✅ Selected Features (threshold='mean'): ['Sex', 'Age', 'Fare', 'Title']
Best RF: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best GB: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best XGB: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Validation Accuracy (Feature Selected Voting): 0.8146
✅ Feature Selected Voting submission saved as submission_v6_feature_selected2.csv
