### Handling Missing Values - Imputation within ML Pipelines
**Description**: Implement a machine learning pipeline that includes imputation and a classifier.

In [3]:
import pandas as pd
import logging
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Set up logging
logging.basicConfig(level=logging.INFO)

# Sample dataset
data = pd.DataFrame({
    'age': [25, 30, None, 45, 35],
    'income': [50000, 60000, 55000, None, 52000],
    'city': ['NY', 'LA', 'NY', 'SF', None],
    'purchased': [1, 0, 1, 0, 1]
})

try:
    # Validate required columns
    required_columns = {'age', 'income', 'city', 'purchased'}
    if not required_columns.issubset(data.columns):
        raise ValueError(f"Missing required columns: {required_columns - set(data.columns)}")

    # Split features and target
    X = data.drop('purchased', axis=1)
    y = data['purchased']

    # Define feature types
    numeric_features = ['age', 'income']
    categorical_features = ['city']

    # Pipeline for numerical features: mean imputation + standard scaling
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Pipeline for categorical features: mode imputation + one-hot encoding
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine transformations using ColumnTransformer
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Full pipeline with logistic regression
    clf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression())
    ])

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    clf_pipeline.fit(X_train, y_train)
    logging.info("Pipeline training completed.")

    # Predict and evaluate
    y_pred = clf_pipeline.predict(X_test)
    print("Classification Report:\n", classification_report(y_test, y_pred))

except Exception as e:
    logging.error(f"Error during pipeline execution: {e}")


INFO:root:Pipeline training completed.


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
