In [None]:
# 🚀 Setup for Google Colab
import sys
if 'google.colab' in sys.modules:
    print("🔧 Setting up for Google Colab...")
    
    # Install required dependencies
    !pip install -q matplotlib seaborn scikit-learn numpy pandas
    
    # Note: SSL framework code will be included in subsequent cells for Colab compatibility
    print("✅ Dependencies installed! SSL framework will be defined in the next cells.")
else:
    print("📝 Running locally - using installed SSL framework")

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yourusername/pyssl/blob/main/notebooks/04_tabular_data_pipeline.ipynb)

# 🏭 Production-Ready SSL Pipelines - Enterprise Scale

This notebook demonstrates how to build production-ready semi-supervised learning pipelines using scikit-learn's Pipeline and ColumnTransformer.

**What you'll learn:**
- Building ML pipelines with SSL integration
- Handling mixed data types (numerical and categorical)
- Proper preprocessing for SSL workflows
- Model serialization and deployment patterns
- Error handling and validation

**Dataset:** Synthetic tabular data modeled on Adult Census Income (generated in the notebook for reproducibility)
**Scenario:** Predict income level with limited labeled data

## 1. Setup & Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
import joblib
import warnings
warnings.filterwarnings('ignore')

# Import our SSL framework
import sys
sys.path.append('../')
from ssl_framework.main import SelfTrainingClassifier
from ssl_framework.strategies import ConfidenceThreshold, TopKFixedCount, AppendAndGrow

# Set style
plt.style.use('default')
sns.set_palette("husl")

print("✅ All imports successful!")

## 2. Load and Explore Dataset

We'll use a synthetic dataset modeled on the Adult Census data, which contains both numerical and categorical features.

In [None]:
# Create synthetic Adult Census-like dataset (for reproducibility)
def generate_adult_census_data(n_samples=10000, random_state=42):
    """
    Build a synthetic table resembling the Adult Census Income data.

    Parameters
    ----------
    n_samples : int, default=10000
        Number of rows to generate.
    random_state : int, default=42
        Seed passed to ``np.random.seed``; note this reseeds NumPy's global RNG.

    Returns
    -------
    pd.DataFrame
        Five numeric columns, six categorical columns and a binary ``income``
        target (1 for the rows whose latent score is above its 76th percentile).
    """
    np.random.seed(random_state)

    def draw(categories, weights):
        # One categorical draw per row from the global RNG
        return np.random.choice(categories, n_samples, p=weights)

    # NOTE: the order of the random draws below fixes the generated values.
    age = np.random.normal(39, 13, n_samples).clip(17, 90).astype(int)
    education_num = draw(
        range(1, 17),
        [0.05, 0.08, 0.12, 0.15, 0.15, 0.12, 0.1, 0.08, 0.05,
         0.03, 0.02, 0.02, 0.01, 0.01, 0.005, 0.005],
    )

    workclass = draw(
        ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
         'Local-gov', 'State-gov', 'Without-pay'],
        [0.7, 0.08, 0.04, 0.03, 0.06, 0.05, 0.04],
    )
    marital_status = draw(
        ['Married-civ-spouse', 'Never-married', 'Divorced',
         'Separated', 'Widowed', 'Married-spouse-absent'],
        [0.46, 0.33, 0.12, 0.03, 0.03, 0.03],
    )
    occupation = draw(
        ['Tech-support', 'Craft-repair', 'Other-service', 'Sales',
         'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
         'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing'],
        [0.08, 0.13, 0.12, 0.12, 0.13, 0.14, 0.06, 0.06, 0.11, 0.05],
    )
    race = draw(
        ['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'],
        [0.85, 0.096, 0.03, 0.01, 0.014],
    )
    sex = draw(['Male', 'Female'], [0.67, 0.33])

    capital_gain = np.random.exponential(500, n_samples).clip(0, 99999).astype(int)
    capital_loss = np.random.exponential(300, n_samples).clip(0, 4356).astype(int)
    hours_per_week = np.random.normal(40, 12, n_samples).clip(1, 99).astype(int)

    native_country = draw(
        ['United-States', 'Mexico', 'Philippines', 'Germany',
         'Canada', 'Puerto-Rico', 'Other'],
        [0.9, 0.02, 0.015, 0.01, 0.01, 0.005, 0.04],
    )

    # Latent income score: weighted feature contributions plus Gaussian noise.
    is_male = (sex == 'Male').astype(int)
    is_married = (marital_status == 'Married-civ-spouse').astype(int)
    noise = np.random.normal(0, 0.5, n_samples)
    income_score = (
        (age - 30) * 0.02
        + education_num * 0.15
        + (hours_per_week - 40) * 0.03
        + np.log1p(capital_gain) * 0.1
        + is_male * 0.3
        + is_married * 0.4
        + noise
    )

    # Rows above the 76th percentile become the positive class (roughly 24%)
    cutoff = np.percentile(income_score, 76)
    income = (income_score > cutoff).astype(int)

    columns = {
        'age': age,
        'workclass': workclass,
        'education_num': education_num,
        'marital_status': marital_status,
        'occupation': occupation,
        'race': race,
        'sex': sex,
        'capital_gain': capital_gain,
        'capital_loss': capital_loss,
        'hours_per_week': hours_per_week,
        'native_country': native_country,
        'income': income,
    }
    return pd.DataFrame(columns)

# Generate dataset
print("📊 Generating Adult Census-like dataset...")
df = generate_adult_census_data(n_samples=8000, random_state=42)

print(f"✅ Dataset created: {df.shape}")
print(f"\n📋 Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Show how many rows fall in each income class, with percentages
print(f"\n🎯 Target Distribution:")
target_counts = df['income'].value_counts()
for income, count in target_counts.items():
    label = '>50K' if income else '<=50K'
    print(f"   {label}: {count} samples ({count/len(df)*100:.1f}%)")

In [None]:
# Quick look at the generated data: first rows and summary statistics
print("🔍 Dataset Sample:")
print(df.head())

print(f"\n📊 Feature Statistics:")
print(df.describe())

# 2x2 grid: three numeric-feature boxplots split by income, plus income share by sex
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

boxplot_panels = [
    ('age', 'Age Distribution by Income'),
    ('education_num', 'Education Level by Income'),
    ('hours_per_week', 'Hours per Week by Income'),
]
# axes.flat walks the grid row by row, so the panels land on
# [0, 0], [0, 1] and [1, 0]
for panel_ax, (column, title) in zip(axes.flat, boxplot_panels):
    df.boxplot(column=column, by='income', ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Income (0: <=50K, 1: >50K)')

# Bottom-right panel: per-sex income shares (each row of the crosstab sums to 1)
sex_ax = axes[1, 1]
income_by_sex = pd.crosstab(df['sex'], df['income'], normalize='index')
income_by_sex.plot(kind='bar', ax=sex_ax)
sex_ax.set_title('Income Distribution by Sex')
sex_ax.set_xlabel('Sex')
sex_ax.legend(['<=50K', '>50K'])
sex_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 3. Data Preprocessing Pipeline

We'll create a robust preprocessing pipeline that handles different data types.

In [None]:
# Target column and feature matrix
y = df['income']
X = df.drop(columns='income')

# Column groups routed to the two preprocessing branches
numeric_features = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
categorical_features = ['workclass', 'marital_status', 'occupation', 'race', 'sex', 'native_country']

print(f"📊 Feature Types:")
print(f"   Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"   Categorical features ({len(categorical_features)}): {categorical_features}")

# Numeric branch: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch; columns not listed are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

print(f"\n✅ Preprocessing pipeline created!")
print(f"   Numeric pipeline: Imputation → Scaling")
print(f"   Categorical pipeline: Imputation → One-Hot Encoding")

## 4. Create SSL-Compatible Pipeline

Now we'll create a pipeline that integrates our SSL classifier with preprocessing.

In [None]:
# Create a custom wrapper to make SSL work with pipelines
class SSLPipelineWrapper:
    """
    Adapter that lets SelfTrainingClassifier act as the final step of an sklearn Pipeline.

    ``fit`` takes only ``(X, y)``, so the already-preprocessed unlabeled rows
    must be handed over beforehand with ``set_unlabeled_data``. When none were
    supplied, ``fit`` trains the base model on the labeled data alone.

    Parameters
    ----------
    base_model : estimator
        Classifier passed to SelfTrainingClassifier as ``base_model``.
    selection_strategy, integration_strategy : object
        Strategy objects passed through to SelfTrainingClassifier.
    max_iter : int, default=10
        Passed through to SelfTrainingClassifier.
    labeling_convergence_threshold : int, default=5
        Passed through to SelfTrainingClassifier.

    Notes
    -----
    The unlabeled matrix is kept on the instance, so it is included when the
    wrapper (or a pipeline containing it) is serialized.
    """
    
    def __init__(self, base_model, selection_strategy, integration_strategy, 
                 max_iter=10, labeling_convergence_threshold=5):
        self.base_model = base_model
        self.selection_strategy = selection_strategy
        self.integration_strategy = integration_strategy
        self.max_iter = max_iter
        self.labeling_convergence_threshold = labeling_convergence_threshold
        self.ssl_model = None
        self.X_unlabeled_processed = None
        # Set only when fit() took the supervised fallback path
        self._fallback_model = None
        
    def set_unlabeled_data(self, X_unlabeled_processed):
        """Store the preprocessed unlabeled rows that the next ``fit`` call will use."""
        self.X_unlabeled_processed = X_unlabeled_processed
        
    def fit(self, X, y):
        """
        Fit on preprocessed labeled data, using stored unlabeled data if present.

        Returns ``self``.
        """
        self._fallback_model = None
        self.ssl_model = SelfTrainingClassifier(
            base_model=self.base_model,
            selection_strategy=self.selection_strategy,
            integration_strategy=self.integration_strategy,
            max_iter=self.max_iter,
            labeling_convergence_threshold=self.labeling_convergence_threshold
        )
        
        if self.X_unlabeled_processed is not None:
            # Use SSL with unlabeled data
            self.ssl_model.fit(X, y, self.X_unlabeled_processed)
        else:
            # Fallback to supervised learning
            print("⚠️  No unlabeled data provided, falling back to supervised learning")
            self.ssl_model.base_model.fit(X, y)
            # The SSL wrapper itself was never fitted; remember the model that
            # was, so prediction does not depend on the wrapper's fitted state.
            self._fallback_model = self.ssl_model.base_model
            
        return self

    def _fitted_predictor(self):
        """Return the object that answers predictions, or raise if not fitted."""
        fallback = getattr(self, '_fallback_model', None)
        if fallback is not None:
            return fallback
        if self.ssl_model is None:
            raise ValueError("Model not fitted yet")
        return self.ssl_model

    def __sklearn_is_fitted__(self):
        """Tell scikit-learn's ``check_is_fitted`` whether ``fit`` has been called."""
        return self.ssl_model is not None
    
    def predict(self, X):
        """Predict class labels for preprocessed ``X``; ValueError if not fitted."""
        return self._fitted_predictor().predict(X)
    
    def predict_proba(self, X):
        """Predict class probabilities for preprocessed ``X``; ValueError if not fitted."""
        return self._fitted_predictor().predict_proba(X)
    
    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X))
    
    @property
    def history_(self):
        """Training history of the SSL model; empty list when there is none."""
        if self.ssl_model is None:
            return []
        # Explicit None check above: an object's truthiness is not a fitted flag
        return getattr(self.ssl_model, 'history_', [])
    
    @property
    def stopping_reason_(self):
        """Stopping reason reported by the SSL model, or 'Not fitted' before ``fit``."""
        if self.ssl_model is None:
            return 'Not fitted'
        return getattr(self.ssl_model, 'stopping_reason_',
                       'Supervised fallback (no unlabeled data)')

print("✅ SSL Pipeline Wrapper created!")

## 5. Data Splitting for SSL

Create proper labeled/unlabeled/validation/test splits for our pipeline.

In [None]:
# First split: separate test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Second split: separate validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, stratify=y_train, random_state=42
)

# Third split: create labeled/unlabeled split
# Simulate having limited labeled data
n_labeled = 200  # Only 200 labeled examples

# Stratified sampling for labeled set; the true labels of the "unlabeled" rows
# are kept only for reporting.
# (train_test_split is already imported in the imports cell.)
X_labeled, X_unlabeled, y_labeled, y_unlabeled_true = train_test_split(
    X_train, y_train, train_size=n_labeled, stratify=y_train, random_state=42
)

print(f"📊 Data Splits:")
print(f"   Labeled: {len(X_labeled)} samples")
print(f"   Unlabeled: {len(X_unlabeled)} samples")
print(f"   Validation: {len(X_val)} samples")
print(f"   Test: {len(X_test)} samples")

# Per-class counts for both training subsets (index 0 -> '<=50K', 1 -> '>50K')
labeled_dist = np.bincount(y_labeled)
unlabeled_dist = np.bincount(y_unlabeled_true)

distribution_reports = [
    ("🏷️ Labeled Set Class Distribution:", labeled_dist),
    ("🌊 Unlabeled Set Class Distribution (true, hidden):", unlabeled_dist),
]
for header, class_counts in distribution_reports:
    print(f"\n{header}")
    total = class_counts.sum()
    for i, count in enumerate(class_counts):
        label = '>50K' if i else '<=50K'
        print(f"   {label}: {count} samples ({count/total*100:.1f}%)")

## 6. Build Complete SSL Pipeline

Now we'll create the complete pipeline that handles preprocessing and SSL training.

In [None]:
# Create complete SSL pipeline
def create_ssl_pipeline(strategy_name, selection_strategy):
    """
    Create a complete SSL pipeline with preprocessing.

    Parameters
    ----------
    strategy_name : str
        Human-readable name of the strategy. Not used when building the
        pipeline; kept so callers can pass it alongside the strategy.
    selection_strategy : object
        Pseudo-label selection strategy handed to the SSL wrapper.

    Returns
    -------
    Pipeline
        Unfitted pipeline with a 'preprocessor' step and a 'classifier' step
        (an SSLPipelineWrapper around LogisticRegression).
    """
    from sklearn.base import clone

    ssl_wrapper = SSLPipelineWrapper(
        base_model=LogisticRegression(random_state=42, max_iter=1000),
        selection_strategy=selection_strategy,
        integration_strategy=AppendAndGrow(),
        max_iter=12,
        labeling_convergence_threshold=5
    )
    
    # Pipeline stores its steps by reference, so give every pipeline its own
    # unfitted copy of the preprocessor; otherwise fitting one pipeline would
    # refit the preprocessor inside all the others.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', ssl_wrapper)
    ])
    
    return pipeline

# Create baseline pipeline (supervised only)
def create_baseline_pipeline():
    """
    Create a baseline supervised pipeline.

    Returns
    -------
    Pipeline
        Unfitted pipeline: preprocessing followed by LogisticRegression.
    """
    from sklearn.base import clone

    # Use a private copy of the preprocessor: Pipeline keeps its steps by
    # reference, so sharing the global instance would let other pipelines
    # refit this pipeline's preprocessing.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', LogisticRegression(random_state=42, max_iter=1000))
    ])
    
    return pipeline

print("✅ Pipeline creation functions ready!")

## 7. Train and Compare Models

Let's train both baseline and SSL pipelines and compare their performance.

In [None]:
print("🔄 Training baseline supervised pipeline...")

# Supervised reference model: trained on the labeled rows only
baseline_pipeline = create_baseline_pipeline()
baseline_pipeline.fit(X_labeled, y_labeled)

# Held-out test metrics, reused later as the comparison point for SSL
baseline_pred = baseline_pipeline.predict(X_test)
baseline_accuracy, baseline_f1 = (
    accuracy_score(y_test, baseline_pred),
    f1_score(y_test, baseline_pred),
)

for line in (
    f"   ✅ Baseline trained",
    f"   Accuracy: {baseline_accuracy:.3f}",
    f"   F1-score: {baseline_f1:.3f}",
):
    print(line)

In [None]:
print("\n🔄 Training SSL pipelines...")

# Test different SSL strategies
ssl_strategies = [
    ('Confidence-0.90', ConfidenceThreshold(threshold=0.90)),
    ('Confidence-0.85', ConfidenceThreshold(threshold=0.85)),
    ('TopK-10', TopKFixedCount(k=10)),
    ('TopK-15', TopKFixedCount(k=15)),
]

ssl_results = []

for strategy_name, selection_strategy in ssl_strategies:
    print(f"\n🧪 Training {strategy_name}...")
    
    # Create SSL pipeline
    ssl_pipeline = create_ssl_pipeline(strategy_name, selection_strategy)
    preprocessor_step = ssl_pipeline.named_steps['preprocessor']
    classifier_step = ssl_pipeline.named_steps['classifier']
    
    # Encode the unlabeled rows with a preprocessor fit on the LABELED rows.
    # ssl_pipeline.fit() below refits the preprocessor on X_labeled, so fitting
    # it on X_unlabeled here would give the unlabeled matrix different scaling
    # and possibly a different one-hot column layout than the labeled matrix.
    preprocessor_step.fit(X_labeled)
    X_unlabeled_processed = preprocessor_step.transform(X_unlabeled)
    
    # Set unlabeled data in the SSL wrapper
    classifier_step.set_unlabeled_data(X_unlabeled_processed)
    
    # Train SSL pipeline
    ssl_pipeline.fit(X_labeled, y_labeled)
    
    # Validation score: used only to choose between strategies
    val_f1 = f1_score(y_val, ssl_pipeline.predict(X_val))
    
    # Test metrics: reported, never used for selection
    ssl_pred = ssl_pipeline.predict(X_test)
    ssl_accuracy = accuracy_score(y_test, ssl_pred)
    ssl_f1 = f1_score(y_test, ssl_pred)
    
    # Relative change versus the supervised baseline, in percent
    acc_improvement = (ssl_accuracy - baseline_accuracy) / baseline_accuracy * 100
    f1_improvement = (ssl_f1 - baseline_f1) / baseline_f1 * 100
    
    result = {
        'strategy': strategy_name,
        'pipeline': ssl_pipeline,
        'accuracy': ssl_accuracy,
        'f1_score': ssl_f1,
        'val_f1_score': val_f1,
        'predictions': ssl_pred,
        'acc_improvement': acc_improvement,
        'f1_improvement': f1_improvement,
        'history': classifier_step.history_
    }
    
    ssl_results.append(result)
    
    # The '+' format flag prints the sign itself, so no literal '+' is prepended
    print(f"   ✅ {strategy_name} trained")
    print(f"   Validation F1-score: {val_f1:.3f}")
    print(f"   Accuracy: {ssl_accuracy:.3f} ({acc_improvement:+.1f}%)")
    print(f"   F1-score: {ssl_f1:.3f} ({f1_improvement:+.1f}%)")

# Pick the best strategy on the validation split so the test metrics stay unbiased
best_ssl = max(ssl_results, key=lambda x: x['val_f1_score'])
print(f"\n🏆 Best SSL Strategy: {best_ssl['strategy']}")
print(f"   F1-score improvement: {best_ssl['f1_improvement']:+.1f}%")

## 8. Pipeline Performance Analysis

Let's analyze the performance of our production pipelines.

In [None]:
# One row per method: the supervised baseline first, then each SSL strategy
all_results = [{
    'Method': 'Baseline (Supervised)',
    'Accuracy': baseline_accuracy,
    'F1-Score': baseline_f1,
    'Improvement': 0.0
}]
all_results.extend(
    {
        'Method': r['strategy'],
        'Accuracy': r['accuracy'],
        'F1-Score': r['f1_score'],
        'Improvement': r['f1_improvement']
    }
    for r in ssl_results
)

results_df = pd.DataFrame(all_results)

# Three side-by-side bar charts: accuracy, F1 and relative F1 improvement
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
bar_positions = range(len(results_df))
method_labels = results_df['Method']

# Panels 1 and 2 share a layout; only the metric, title and bar colour differ
metric_panels = [
    (axes[0], 'Accuracy', 'Pipeline Accuracy Comparison', {}),
    (axes[1], 'F1-Score', 'Pipeline F1-Score Comparison', {'color': 'orange'}),
]
for panel_ax, metric, title, bar_style in metric_panels:
    bars = panel_ax.bar(bar_positions, results_df[metric], alpha=0.7, **bar_style)
    bars[0].set_color('red')  # Baseline in red
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_ylabel(metric)
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(method_labels, rotation=45, ha='right')
    # Value label just above each bar
    for x_pos, value in enumerate(results_df[metric]):
        panel_ax.text(x_pos, value + 0.005, f'{value:.3f}', ha='center', va='bottom', fontsize=9)

# Panel 3: improvement in percent; green when positive, orange otherwise
improvement_ax = axes[2]
colors = ['red'] + ['green' if x > 0 else 'orange' for x in results_df['Improvement'][1:]]
bars = improvement_ax.bar(bar_positions, results_df['Improvement'], alpha=0.7, color=colors)
improvement_ax.set_title('F1-Score Improvement over Baseline', fontsize=14, fontweight='bold')
improvement_ax.set_ylabel('Improvement (%)')
improvement_ax.set_xticks(bar_positions)
improvement_ax.set_xticklabels(method_labels, rotation=45, ha='right')
improvement_ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)
for x_pos, value in enumerate(results_df['Improvement']):
    improvement_ax.text(x_pos, value + 0.3, f'{value:.1f}%', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("\n📊 Pipeline Results Summary:")
print(results_df.round(3))

## 9. Model Serialization & Deployment

Let's save our best pipeline for production deployment.

In [None]:
# Persist the winning pipeline; the file name is derived from the strategy name
best_pipeline = best_ssl['pipeline']
strategy_slug = best_ssl["strategy"].lower().replace("-", "_")
model_filename = f'best_ssl_pipeline_{strategy_slug}.joblib'

print(f"💾 Saving best pipeline: {best_ssl['strategy']}")
joblib.dump(best_pipeline, model_filename)
print(f"   ✅ Saved to: {model_filename}")

# Sidecar JSON describing how the model was produced and how it scored
feature_groups = {
    'numeric': numeric_features,
    'categorical': categorical_features
}
metadata = {
    'model_type': 'SSL Pipeline',
    'strategy': best_ssl['strategy'],
    'accuracy': best_ssl['accuracy'],
    'f1_score': best_ssl['f1_score'],
    'improvement_over_baseline': best_ssl['f1_improvement'],
    'training_samples': len(X_labeled),
    'unlabeled_samples': len(X_unlabeled),
    'features': feature_groups,
    'preprocessing': 'StandardScaler + OneHotEncoder',
    'base_model': 'LogisticRegression'
}

import json
metadata_filename = model_filename.replace('.joblib', '_metadata.json')
with open(metadata_filename, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"   ✅ Metadata saved to: {metadata_filename}")

# Round-trip check: the reloaded pipeline must predict like the in-memory one
print(f"\n🔄 Testing model loading...")
loaded_pipeline = joblib.load(model_filename)
smoke_rows = X_test[:5]
loaded_pred = loaded_pipeline.predict(smoke_rows)
original_pred = best_pipeline.predict(smoke_rows)
predictions_match = np.array_equal(original_pred, loaded_pred)

print(f"   Original predictions: {original_pred}")
print(f"   Loaded predictions: {loaded_pred}")
print(f"   ✅ Model loading successful: {predictions_match}")

## 10. Production Inference Example

Let's create a production-ready inference function.

In [None]:
def predict_income(model_path, input_data):
    """
    Load a saved pipeline and predict the income class for the given rows.

    Parameters
    ----------
    model_path : str
        Path to the joblib file holding the saved model.
    input_data : dict or pd.DataFrame
        A single record as a dict (wrapped into a one-row frame) or a frame of
        records; a frame is copied before use.

    Returns
    -------
    dict
        ``{'status': 'success', 'predictions': [...]}`` with one entry per row
        (label, both class probabilities and the larger of the two as
        confidence), or ``{'status': 'error', 'message': str}`` if anything in
        loading or prediction raised.
    """
    try:
        model = joblib.load(model_path)

        # Normalize the input to a DataFrame
        if isinstance(input_data, dict):
            frame = pd.DataFrame([input_data])
        else:
            frame = input_data.copy()

        labels = model.predict(frame)
        class_probs = model.predict_proba(frame)

        # Column 0 is reported as the low-income probability, column 1 as high
        predictions = [
            {
                'prediction': '>50K' if label == 1 else '<=50K',
                'probability_low_income': row_probs[0],
                'probability_high_income': row_probs[1],
                'confidence': max(row_probs)
            }
            for label, row_probs in zip(labels, class_probs)
        ]
    except Exception as e:
        # Any failure is reported in-band rather than raised
        return {
            'status': 'error',
            'message': str(e)
        }

    return {
        'status': 'success',
        'predictions': predictions
    }

# Test the inference function
print("🧪 Testing production inference function...")

# Example input
sample_inputs = [
    {
        'age': 39,
        'workclass': 'State-gov',
        'education_num': 13,
        'marital_status': 'Never-married',
        'occupation': 'Adm-clerical',
        'race': 'White',
        'sex': 'Male',
        'capital_gain': 2174,
        'capital_loss': 0,
        'hours_per_week': 40,
        'native_country': 'United-States'
    },
    {
        'age': 50,
        'workclass': 'Self-emp-not-inc',
        'education_num': 13,
        'marital_status': 'Married-civ-spouse',
        'occupation': 'Exec-managerial',
        'race': 'White',
        'sex': 'Male',
        'capital_gain': 0,
        'capital_loss': 0,
        'hours_per_week': 13,
        'native_country': 'United-States'
    }
]

# predict_income reports failures in its return value instead of raising,
# so count them here to avoid announcing success after an error.
failed_samples = 0

for i, sample in enumerate(sample_inputs):
    result = predict_income(model_filename, sample)
    
    if result['status'] == 'success':
        pred = result['predictions'][0]
        print(f"\n   Sample {i+1}:")
        print(f"     Age: {sample['age']}, Education: {sample['education_num']}, Hours/week: {sample['hours_per_week']}")
        print(f"     Prediction: {pred['prediction']}")
        print(f"     Confidence: {pred['confidence']:.3f}")
    else:
        failed_samples += 1
        print(f"   Error: {result['message']}")

if failed_samples == 0:
    print(f"\n✅ Production inference function working correctly!")
else:
    print(f"\n❌ Production inference failed for {failed_samples} of {len(sample_inputs)} samples")

## 11. Pipeline Monitoring & Validation

Let's create some monitoring functions for production deployment.

In [None]:
def validate_pipeline_performance(model_path, X_test, y_test, threshold_accuracy=0.8):
    """
    Score a saved pipeline on held-out data and compare accuracy to a threshold.

    Parameters
    ----------
    model_path : str
        Path to the joblib file holding the saved pipeline.
    X_test, y_test
        Evaluation features and true labels.
    threshold_accuracy : float, default=0.8
        Minimum accuracy for ``performance_ok`` to be true.

    Returns
    -------
    dict
        On success: ``status``, ``performance_ok``, ``metrics`` (accuracy, F1,
        mean top-class probability, share of rows whose top-class probability
        is below 0.7) and the threshold used. On any exception:
        ``{'status': 'error', 'message': str}``.
    """
    try:
        pipeline = joblib.load(model_path)

        y_pred = pipeline.predict(X_test)
        y_proba = pipeline.predict_proba(X_test)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        meets_threshold = acc >= threshold_accuracy

        # Confidence of a row = probability of its most likely class
        top_class_prob = np.max(y_proba, axis=1)
        metric_values = {
            'accuracy': acc,
            'f1_score': f1,
            'mean_confidence': np.mean(top_class_prob),
            'low_confidence_ratio': np.mean(top_class_prob < 0.7)
        }
    except Exception as e:
        # Failures are reported in-band rather than raised
        return {
            'status': 'error',
            'message': str(e)
        }

    return {
        'status': 'success',
        'performance_ok': meets_threshold,
        'metrics': metric_values,
        'threshold_accuracy': threshold_accuracy
    }

def check_feature_drift(X_new, X_reference, threshold=0.1):
    """
    Simple per-feature drift detection using two-sample statistical tests.

    Numeric columns are compared with the two-sample Kolmogorov-Smirnov test;
    every other column is compared with a chi-square test on its category
    counts. A feature is flagged when the test's p-value is below ``threshold``.
    Columns are matched by position, and missing values are ignored.

    Parameters
    ----------
    X_new : array-like or pd.DataFrame, shape (n_new, n_features)
        Recent data to check.
    X_reference : array-like or pd.DataFrame, shape (n_reference, n_features)
        Reference data with the same column order as ``X_new``.
    threshold : float, default=0.1
        p-value below which a feature counts as drifted.

    Returns
    -------
    dict
        ``drift_detected`` (bool), ``features_with_drift`` (int) and
        ``drift_details`` (list of bool, one per column).

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of columns.
    """
    from scipy import stats

    # Work on frames so each column carries its own dtype; infer_objects lets
    # numeric columns of an object array be recognized as numeric.
    new_frame = pd.DataFrame(X_new).infer_objects()
    reference_frame = pd.DataFrame(X_reference).infer_objects()

    if new_frame.shape[1] != reference_frame.shape[1]:
        raise ValueError(
            f"X_new has {new_frame.shape[1]} columns but X_reference has "
            f"{reference_frame.shape[1]}"
        )

    def _is_numeric(column):
        # Booleans are treated as categories, not as numbers
        return (pd.api.types.is_numeric_dtype(column)
                and not pd.api.types.is_bool_dtype(column))

    drift_detected = []

    for i in range(new_frame.shape[1]):
        new_col = new_frame.iloc[:, i].dropna()
        ref_col = reference_frame.iloc[:, i].dropna()

        if new_col.empty or ref_col.empty:
            # Nothing to compare against; report no drift for this column
            drift_detected.append(False)
            continue

        if _is_numeric(new_col) and _is_numeric(ref_col):
            # Use Kolmogorov-Smirnov test
            _, p_value = stats.ks_2samp(ref_col.to_numpy(), new_col.to_numpy())
        else:
            # Chi-square test on the categories-by-sample contingency table;
            # a category absent from one sample counts as 0 there.
            counts = pd.concat(
                [ref_col.astype(str).value_counts(),
                 new_col.astype(str).value_counts()],
                axis=1, keys=['reference', 'new']
            ).fillna(0)
            p_value = stats.chi2_contingency(counts.to_numpy())[1]

        drift_detected.append(bool(p_value < threshold))

    return {
        'drift_detected': any(drift_detected),
        'features_with_drift': sum(drift_detected),
        'drift_details': drift_detected
    }

# Run the saved pipeline through the validation helper and report the outcome
print("🔍 Testing pipeline validation...")

validation_result = validate_pipeline_performance(model_filename, X_test, y_test)

if validation_result['status'] != 'success':
    print(f"   Error: {validation_result['message']}")
else:
    metrics = validation_result['metrics']
    passed = validation_result['performance_ok']

    print(f"\n📊 Pipeline Validation Results:")
    print(f"   Performance OK: {passed}")
    # All four metrics are printed with three decimals
    metric_captions = (
        ('Accuracy', 'accuracy'),
        ('F1-Score', 'f1_score'),
        ('Mean Confidence', 'mean_confidence'),
        ('Low Confidence Ratio', 'low_confidence_ratio'),
    )
    for caption, key in metric_captions:
        print(f"   {caption}: {metrics[key]:.3f}")
    
    verdict = (f"   ✅ Pipeline meets performance threshold" if passed
               else f"   ❌ Pipeline below performance threshold")
    print(verdict)

print(f"\n✅ Monitoring functions ready for production!")

## 12. Key Production Insights

Based on our comprehensive pipeline development, here are the key insights:

In [None]:
# Calculate final insights
best_improvement = best_ssl['f1_improvement']
# Size of the unlabeled pool, i.e. the most pseudo-labels that could be added
total_pseudo_labels = len(X_unlabeled)
# assumes each history entry is a dict with a 'new_labels_count' key — TODO confirm against SelfTrainingClassifier
labels_added = sum(h['new_labels_count'] for h in best_ssl['history'])
utilization_rate = labels_added / total_pseudo_labels * 100

print("🏭 PRODUCTION SSL PIPELINE INSIGHTS")
print("=" * 45)

print(f"\n📈 Performance Results:")
print(f"   • Best strategy: {best_ssl['strategy']}")
# The '+' format flag prints the sign itself, so a negative change reads "-1.2%" rather than "+-1.2%"
print(f"   • F1-score improvement: {best_improvement:+.1f}%")
print(f"   • Final accuracy: {best_ssl['accuracy']:.3f}")
print(f"   • Training efficiency: {utilization_rate:.1f}% of unlabeled data used")

print(f"\n🔧 Pipeline Architecture Benefits:")
print(f"   • Handles mixed data types automatically")
print(f"   • Integrated preprocessing and SSL training")
print(f"   • Serializable for production deployment")
print(f"   • Compatible with scikit-learn ecosystem")

print(f"\n💡 Production Best Practices:")
print(f"   • Use ColumnTransformer for mixed data types")
print(f"   • Implement proper error handling and validation")
print(f"   • Save both model and metadata for tracking")
print(f"   • Monitor performance and feature drift")
print(f"   • Validate model performance against thresholds")

print(f"\n🚨 Key Considerations:")
print(f"   • SSL requires careful data splitting (labeled/unlabeled/test)")
print(f"   • Preprocessing must be applied consistently")
print(f"   • Monitor pseudo-label quality in production")
print(f"   • Consider retraining when performance degrades")

print(f"\n📊 Data Efficiency Gains:")
print(f"   • Labeled data used: {len(X_labeled)} samples")
print(f"   • Unlabeled data leveraged: {len(X_unlabeled)} samples")
print(f"   • Effective training data: ~{len(X_labeled) + labels_added} samples")
print(f"   • Data efficiency multiplier: {(len(X_labeled) + labels_added) / len(X_labeled):.1f}x")

print(f"\n🎯 When to Use SSL Pipelines:")
print(f"   • Limited labeled data (< 1000 samples)")
print(f"   • Abundant unlabeled data available")
print(f"   • Labeling costs are high")
print(f"   • Domain expertise required for labeling")
print(f"   • Continuous learning scenarios")

print(f"\n⚡ Quick Deployment Checklist:")
print(f"   ✅ Model saved with joblib")
print(f"   ✅ Metadata documented")
print(f"   ✅ Inference function created")
print(f"   ✅ Validation functions implemented")
print(f"   ✅ Error handling included")
print(f"   ✅ Performance monitoring ready")

## 🔗 Next Steps

Congratulations! You've built a complete production-ready SSL pipeline.

### 🏗️ What You've Built:
- **Complete preprocessing pipeline** handling mixed data types
- **SSL integration** with scikit-learn Pipeline
- **Model serialization** for deployment
- **Production inference** functions
- **Performance monitoring** and validation
- **Error handling** and robustness

### 🎯 Production Deployment Options:

1. **Batch Processing**: Load model, process files, save predictions
2. **REST API**: Flask/FastAPI service with `predict_income()` function
3. **Streaming**: Process data streams with real-time SSL updates
4. **MLOps Pipeline**: Integration with MLflow, Kubeflow, or similar

### 📚 Advanced Topics to Explore:
- **`05_hyperparameter_tuning.ipynb`** - Optimize your pipeline parameters
- **`06_production_patterns.ipynb`** - Advanced deployment patterns
- **Model versioning** and A/B testing strategies
- **Automated retraining** pipelines

### 🚀 Ready for Enterprise Scale:
Your SSL pipeline is now ready for:
- **High-volume predictions** with consistent preprocessing
- **Continuous learning** from new unlabeled data
- **Monitoring and alerting** for model performance
- **Scalable deployment** across multiple environments

**Happy deploying! 🏭✨**