In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.base import BaseEstimator, TransformerMixin

# Example custom transformer for feature engineering
class CustomFeatureEngineering(BaseEstimator, TransformerMixin):
    def __init__(self, drop_columns=None):
        self.drop_columns = drop_columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # Example: Creating a new feature based on existing features
        X['new_feature'] = X['feature1'] * X['feature2']
        
        if self.drop_columns:
            X.drop(self.drop_columns, axis=1, inplace=True)
        
        return X

# Assume X is your feature set and y is your target variable
# For demonstration, let's create a simple DataFrame
data = {
    'age': [25, 30, 35, 40],
    'income': [50000, 60000, 70000, 80000],
    'gender': ['male', 'female', 'female', 'male'],
    'occupation': ['engineer', 'teacher', 'engineer', 'teacher'],
    'feature1': [1, 2, 3, 4],
    'feature2': [10, 20, 30, 40],
    'unnecessary_feature': [5, 6, 7, 8]
}
X = pd.DataFrame(data)
y = [0, 1, 0, 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the steps for the pipeline
numeric_features = ['age', 'income']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['gender', 'occupation']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the custom feature engineering step
feature_engineering = CustomFeatureEngineering(drop_columns=['unnecessary_feature'])

# Define the feature selection step
feature_selection = SelectKBest(score_func=f_classif, k=5)

# Combine all steps into a pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', feature_engineering),
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('classifier', RandomForestClassifier())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Print the feature names after transformation
print(pipeline.named_steps['preprocessor'].get_feature_names_out())


['num__age' 'num__income' 'cat__gender_female' 'cat__gender_male'
 'cat__occupation_engineer' 'cat__occupation_teacher']


  f = msb / msw
