In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

In [2]:
# --- 1. LOAD DATA & SPLIT ---
# Iris features/labels with a 70/30 split, stratified on the class labels.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# --- 2. DEFINE PIPELINE STEPS ---
# Each step is a (name, estimator) tuple; steps are applied in list order.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=42, max_iter=200)),
]

# --- 3. FIT & PREDICT ---
# fit() returns the pipeline itself, so construction and training can chain.
model_pipeline = Pipeline(steps=pipeline_steps).fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# --- 4. REPORT ---
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Pipeline Training Complete.")
print(f"Model Steps: {model_pipeline.steps}")
print(f"Test Accuracy: {accuracy:.4f}")

Pipeline Training Complete.
Model Steps: [('scaler', StandardScaler()), ('model', LogisticRegression(max_iter=200, random_state=42))]
Test Accuracy: 0.9111


In [3]:
# Nested pipeline: a pipeline used as a step inside another pipeline (the inner one is called a sub-pipeline).

In [5]:

# --- 1. CREATE MOCK DATA & SPLIT ---
# Six-row toy table: two numeric columns (each with one missing value),
# two categorical columns, and a binary target.
data = {
    'Age': [25, 45, np.nan, 30, 50, 22],
    'Salary': [50000, 75000, 60000, np.nan, 90000, 45000],
    'City': ['NYC', 'London', 'Paris', 'NYC', 'London', 'Paris'],
    'Education': ['High', 'College', 'College', 'High', 'Grad', 'High'],
    'Target': [0, 1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)
X = df.drop(columns=['Target'])
y = df['Target']

# Hold out 40% of the rows so the transformers are fitted on training data only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# --- 2. DEFINE SUB-PIPELINES (Nested Pipelines) ---

# A. Numeric columns: fill missing values with the column mean, then standardize.
numeric_features = ['Age', 'Salary']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# B. Categorical columns: fill missing values with the most frequent value,
#    then one-hot encode (handle_unknown='ignore' is set on the encoder).
categorical_features = ['City', 'Education']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# --- 3. CREATE COLUMN TRANSFORMER (Feature Unification) ---

# Routes each column group to its own sub-pipeline; columns not listed in
# either group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# --- 4. CREATE MAIN PIPELINE ---
# Preprocessing followed by the classifier, wrapped as a single estimator.
main_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=42)),
    ]
)

# --- 5. EXECUTE ---
# A single fit() call trains every nested step.
main_pipeline.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = main_pipeline.predict(X_test)

# Run only the fitted preprocessing step to inspect the transformed feature shape.
transformed_shape = main_pipeline.named_steps['preprocessor'].transform(X_test).shape

print("Nested Pipeline Training and Execution Complete.")
print(f"Original Data Shape: {X_test.shape}")
print(f"Transformed Data Output (Features Scaled & Encoded): {transformed_shape}")

Nested Pipeline Training and Execution Complete.
Original Data Shape: (3, 4)
Transformed Data Output (Features Scaled & Encoded): (3, 8)
