In [5]:
'''
Wrapper Methods: Unlike filter which uses statistical methods to select features,Wrapper treats model as as a black box and evaluate feature combinations based on their impact on model's performance.
Process: Subset generation -> model train on that subset->evaluate performance->Selection based on performance->repeat until desired number of features or performance is achieved.
Strategies in Wrapper methods:
1)Forward selection:start with empty set , select a model, by adding a feature train model and evaluate performance
2)Backward Selection: start with entire dataset, select a model, calc performance step by step remove features and calc performance.
3)Recursive Feature Elimination (RFE):Rank features and eliminate least imp recursively
Feature Importance: It uses the estimator's intrinsic feature importance method (like coefficients for linear models or feature importance for tree-based models) to score every feature.
Disadvantages:Computationally costlier.
'''

"\nWrapper Methods: Unlike filter which uses statistical methods to select features,Wrapper treats model as as a black box and evaluate feature combinations based on their impact on model's performance.\nProcess: Subset generation -> model train on that subset->evaluate performance->Selection based on performance->repeat until desired number of features or performance is achieved.\nStrategies in Wrapper methods:\n1)Forward selection:start with empty set , select a model, by adding a feature train model and evaluate performance \n2)Backward Selection: start with entire dataset, select a model, calc performance step by step remove features and calc performance.\n3)Recursive Feature Elimination (RFE):Rank features and eliminate least imp recursively\nFeature Importance: It uses the estimator's intrinsic feature importance method (like coefficients for linear models or feature importance for tree-based models) to score every feature.\nDisadvantages:Computationally costlier.\n"

In [6]:
'''Dataset and model(estimator) selection and processing'''
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE
import seaborn as sns

# 1. Load the Titanic dataset
try:
    df = sns.load_dataset('titanic')
except:
    # Fallback for environments without seaborn data
    print("Could not load Titanic from seaborn; please ensure it's installed.")
    df = pd.DataFrame()

# 2. Select Titanic-like features and the target
# We use Pclass, Sex, Age, SibSp, Parch, Fare, and Embarked
data = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'survived']].copy()

# 3. Handle Missing Values
# Fill 'Age' with the median
data['age'].fillna(data['age'].median(), inplace=True)
# Fill 'Embarked' with the most frequent value (mode)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

# 4. Handle Categorical Variables (One-Hot Encoding)
data = pd.get_dummies(data, columns=['sex', 'embarked'], drop_first=True)

# 5. Define Features (X) and Target (y)
X = data.drop('survived', axis=1)
y = data['survived']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Initialize the model (estimator)
log_reg = LogisticRegression(solver='liblinear', random_state=42)

print(f"Initial features: {list(X.columns)}")
print(f"Training set shape: {X_train.shape}")
print("\n" + "="*50 + "\n")

Initial features: ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']
Training set shape: (623, 8)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)


In [7]:
print("--- 2. Forward Selection ---")

# Initialize SFS with forward=True
sfs_forward = SFS(
    estimator=log_reg,
    k_features='best', # Find the optimal number of features between 1 and 8
    forward=True,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fit SFS to the training data
sfs_forward = sfs_forward.fit(X_train, y_train)

# Get the best feature subset
selected_features_fs = list(sfs_forward.k_feature_names_)

print(f"Optimal features count: {sfs_forward.k_feature_idx_}")
print(f"Best features: {selected_features_fs}")
print(f"Cross-validation score: {sfs_forward.k_score_:.4f}")

# Train the final model and test
log_reg.fit(X_train[selected_features_fs], y_train)
y_pred_fs = log_reg.predict(X_test[selected_features_fs])
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_fs):.4f}")
print("\n" + "="*50 + "\n")

--- 2. Forward Selection ---
Optimal features count: (0, 1, 2, 5)
Best features: ['pclass', 'age', 'sibsp', 'sex_male']
Cross-validation score: 0.8057
Test Accuracy: 0.7948




In [8]:
print("--- 3. Backward Elimination ---")

# Initialize SFS with forward=False (Backward Elimination)
sfs_backward = SFS(
    estimator=log_reg,
    k_features='best', # Find the optimal number of features
    forward=False,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Fit SBS to the training data
sfs_backward = sfs_backward.fit(X_train, y_train)

# Get the best feature subset
selected_features_bs = list(sfs_backward.k_feature_names_)

print(f"Optimal features count: {sfs_backward.k_feature_idx_}")
print(f"Best features: {selected_features_bs}")
print(f"Cross-validation score: {sfs_backward.k_score_:.4f}")

# Train the final model and test
log_reg.fit(X_train[selected_features_bs], y_train)
y_pred_bs = log_reg.predict(X_test[selected_features_bs])
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_bs):.4f}")
print("\n" + "="*50 + "\n")

--- 3. Backward Elimination ---
Optimal features count: (0, 1, 2, 5)
Best features: ['pclass', 'age', 'sibsp', 'sex_male']
Cross-validation score: 0.8057
Test Accuracy: 0.7948




In [9]:

print("--- 4. Recursive Feature Elimination (RFE) ---")
# We'll aim to select the best 4 features (just an arbitrary choice for RFE)
num_features_rfe = 4

# Initialize RFE
rfe_selector = RFE(
    estimator=log_reg,
    n_features_to_select=num_features_rfe,
    step=1 # Remove 1 feature at each iteration
)

# Fit RFE to the training data
rfe_selector = rfe_selector.fit(X_train, y_train)

# Get the selected features
selected_mask = rfe_selector.support_
selected_features_rfe = X.columns[selected_mask].tolist()

print(f"Best features (k={num_features_rfe}): {selected_features_rfe}")

# Train the final model and test
log_reg.fit(X_train[selected_features_rfe], y_train)
y_pred_rfe = log_reg.predict(X_test[selected_features_rfe])
print(f"Test Accuracy: {accuracy_score(y_test, y_pred_rfe):.4f}")
print("\n" + "="*50 + "\n")

--- 4. Recursive Feature Elimination (RFE) ---
Best features (k=4): ['pclass', 'sex_male', 'embarked_Q', 'embarked_S']
Test Accuracy: 0.7761


