# ML Training Pipeline Example - Customer Churn Prediction

**Execution:** `uvx --with "flowerpower[rq],pandas>=2.0.0,scikit-learn>=1.3.0,matplotlib,seaborn" jupyter lab`

This notebook demonstrates how to use FlowerPower for ML workflows, using customer churn prediction as the worked example.

## Quick Start

In [2]:
import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Put the `src` directory found three levels above the working directory at
# the front of sys.path, so it is searched before any other location.
src_dir = Path().absolute().parents[2] / "src"
sys.path.insert(0, str(src_dir))

from flowerpower.flowerpower import FlowerPowerProject

# Load the project from the current directory (".")
project = FlowerPowerProject.load(".")


In [4]:

# Announce what this cell is about to run.
print("🤖 FlowerPower ML Training Pipeline")
print("===================================")
project_name = project.pipeline_manager.project_cfg.name
print(f"📁 Project: {project_name}")
print("🎯 Pipeline: customer_churn")
print("⚡ Quick execution mode")

# Execute the pipeline once, asking only for the evaluation report output.
quick_inputs = {"training_date": datetime.now().isoformat()}
result = project.pipeline_manager.run(
    "customer_churn",
    inputs=quick_inputs,
    final_vars=["model_evaluation_report"],
)

print("✅ ML pipeline completed successfully!")
if "model_evaluation_report" in result:
    evaluation = result["model_evaluation_report"]
    quick_metrics = evaluation["model_metrics"]
    print(f"📊 Model accuracy: {quick_metrics['accuracy']:.3f}")
    print(f"🎯 F1 score: {quick_metrics['f1_score']:.3f}")
    print(f"📄 Full report: {evaluation['report_file']}")

🤖 FlowerPower ML Training Pipeline
📁 Project: ml-training-pipeline
🎯 Pipeline: customer_churn
⚡ Quick execution mode


ModuleNotFoundError: No module named 'sklearn'

## 1. Data Exploration

Explore the customer data that our ML pipeline will process:

In [None]:
# Load the customer CSV (when present) and print/plot a quick overview of it.
# `display` is the rich-output helper that Jupyter injects into the notebook
# namespace; it is not imported explicitly here.
data_file = "data/customer_data.csv"
section_rule = "\n" + "=" * 50

if Path(data_file).exists():
    df = pd.read_csv(data_file)
    print(f"📊 Dataset shape: {df.shape}")
    print(f"📈 Features: {df.columns.tolist()}")
    print("🎯 Target variable: 'churn' (if available)")

    print(section_rule)
    print("Dataset Overview:")
    display(df.head())

    print(section_rule)
    print("Dataset Info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in display() only added a stray "None" to the cell output.
    df.info()

    print(section_rule)
    print("Statistical Summary:")
    display(df.describe())

    # The churn breakdown is only possible when the target column exists.
    if 'churn' in df.columns:
        print(section_rule)
        print("Churn Distribution:")
        churn_counts = df['churn'].value_counts()
        print(churn_counts)

        # Left panel: absolute counts; right panel: share of each class.
        plt.figure(figsize=(8, 6))
        plt.subplot(1, 2, 1)
        churn_counts.plot(kind='bar')
        plt.title('Churn Distribution')
        plt.xlabel('Churn')
        plt.ylabel('Count')

        plt.subplot(1, 2, 2)
        churn_counts.plot(kind='pie', autopct='%1.1f%%')
        plt.title('Churn Percentage')
        plt.ylabel('')  # drop the series name pandas puts on the pie's y-axis

        plt.tight_layout()
        plt.show()
else:
    print(f"⚠️ Data file not found: {data_file}")
    print("💡 The pipeline will generate synthetic data during execution")

## 2. Pipeline Components Analysis

In [None]:
# Run the pipeline with explicit split/model settings and request several
# outputs at once so they can be inspected below (and in later cells).
model_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}
detailed_inputs = {
    "training_date": datetime.now().isoformat(),
    "test_size": 0.2,
    "random_state": 42,
    "model_params": model_params,
}
requested_outputs = [
    "trained_model",
    "model_predictions",
    "model_evaluation_report",
    "customer_features",
]
detailed_result = project.pipeline_manager.run(
    "customer_churn",
    inputs=detailed_inputs,
    final_vars=requested_outputs,
)

print("🔍 Detailed Pipeline Analysis")
print("=============================")

# Summarise the training step, if the pipeline returned it.
if "trained_model" in detailed_result:
    model_info = detailed_result["trained_model"]
    print("\n🤖 Model Training:")
    print(f"   • Algorithm: {model_info['model_type']}")
    print(f"   • Training time: {model_info['training_time']:.2f}s")
    print(f"   • Model file: {model_info['model_file']}")

# Summarise the prediction step, if the pipeline returned it.
if "model_predictions" in detailed_result:
    predictions = detailed_result["model_predictions"]
    print("\n🎯 Predictions:")
    print(f"   • Prediction count: {predictions['prediction_count']}")
    print(f"   • Churn predicted: {predictions['churn_predictions']}")
    print(f"   • No churn predicted: {predictions['no_churn_predictions']}")

## 3. Model Performance Visualization

Visualize the model's performance with key metrics:

In [None]:
# Print the evaluation metrics from `detailed_result` and draw a 2x2 dashboard.
# NOTE: only the metrics bar chart uses pipeline output; the confusion matrix,
# feature importances and training curves below are hard-coded sample values.
if "model_evaluation_report" not in detailed_result:
    print("⚠️ No evaluation results available")
else:
    evaluation = detailed_result["model_evaluation_report"]
    metrics = evaluation["model_metrics"]

    print("📈 Model Performance Metrics")
    print("============================")

    # (display label, key in `metrics`) pairs shared by the printout and chart
    metric_labels = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1 Score', 'f1_score'),
    )
    for label, key in metric_labels:
        print(f"{label}: {metrics[key]:.3f}")

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    (ax_scores, ax_confusion), (ax_importance, ax_history) = axes

    # Top-left: one bar per metric, annotated with its value
    metric_names = [label for label, _ in metric_labels]
    metric_values = [metrics[key] for _, key in metric_labels]
    bar_colors = ['skyblue', 'lightgreen', 'lightcoral', 'gold']

    ax_scores.bar(metric_names, metric_values, color=bar_colors)
    ax_scores.set_title('Model Performance Metrics')
    ax_scores.set_ylabel('Score')
    ax_scores.set_ylim(0, 1)
    for position, score in enumerate(metric_values):
        ax_scores.text(position, score + 0.01, f'{score:.3f}', ha='center')

    # Top-right: confusion matrix (simulated for demonstration)
    conf_matrix = [[85, 15], [20, 80]]
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax_confusion)
    ax_confusion.set_title('Confusion Matrix')
    ax_confusion.set_xlabel('Predicted')
    ax_confusion.set_ylabel('Actual')

    # Bottom-left: feature importance (simulated)
    feature_names = ['tenure', 'monthly_charges', 'total_charges', 'contract_type', 'payment_method']
    importance_scores = [0.25, 0.20, 0.18, 0.15, 0.12]

    ax_importance.barh(feature_names, importance_scores, color='lightblue')
    ax_importance.set_title('Feature Importance')
    ax_importance.set_xlabel('Importance Score')

    # Bottom-right: training history (simulated)
    epochs = list(range(1, 11))
    train_acc = [0.65, 0.70, 0.75, 0.78, 0.80, 0.82, 0.83, 0.84, 0.84, 0.85]
    val_acc = [0.63, 0.68, 0.72, 0.74, 0.76, 0.77, 0.78, 0.78, 0.79, 0.79]

    ax_history.plot(epochs, train_acc, 'b-', label='Training Accuracy', marker='o')
    ax_history.plot(epochs, val_acc, 'r-', label='Validation Accuracy', marker='s')
    ax_history.set_title('Training Progress')
    ax_history.set_xlabel('Epoch')
    ax_history.set_ylabel('Accuracy')
    ax_history.legend()
    ax_history.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Closing summary for the evaluation run
    completed_at = evaluation['evaluation_metadata']['completed_at']
    print(f"\n📄 Evaluation Report: {evaluation['report_file']}")
    print(f"⏱️ Evaluation time: {completed_at}")
    print(f"🎯 Model ready for deployment: {metrics['accuracy'] > 0.75}")

## 4. Experiment with Different Configurations

Run the pipeline with different hyperparameters:

In [None]:
# Run the pipeline once per configuration below, collect the metrics of every
# run that produced an evaluation report, then tabulate and plot them.
default_config = {"test_size": 0.2, "random_state": 42}
large_test_config = {
    "test_size": 0.3,
    "random_state": 42,
    "model_params": {"n_estimators": 150, "max_depth": 8},
}
feature_focus_config = {
    "test_size": 0.2,
    "random_state": 123,
    "feature_selection": True,
    "scale_features": True,
}
experiments = [
    {"name": "Default", "config": default_config},
    {"name": "Large Test Set", "config": large_test_config},
    {"name": "Feature Engineering Focus", "config": feature_focus_config},
]

experiment_results = []

print("🧪 Running ML Experiments")
print("==========================")

for exp in experiments:
    print(f"\n🔄 Running {exp['name']} experiment...")

    # Copy the experiment's settings and stamp them with the current time,
    # leaving the definition in `experiments` untouched.
    config = {**exp['config'], 'training_date': datetime.now().isoformat()}

    # Best-effort: a failing experiment is reported and the loop moves on.
    try:
        result = project.pipeline_manager.run(
            "customer_churn",
            inputs=config,
            final_vars=["model_evaluation_report"],
        )

        if "model_evaluation_report" not in result:
            print("   ❌ Experiment failed")
        else:
            metrics = result["model_evaluation_report"]["model_metrics"]
            experiment_results.append({
                "name": exp['name'],
                "accuracy": metrics['accuracy'],
                "f1_score": metrics['f1_score'],
                "precision": metrics['precision'],
                "recall": metrics['recall'],
            })

            print(f"   ✅ Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}")

    except Exception as run_error:
        print(f"   ❌ Error: {run_error}")

# Compare the experiments that completed; skipped entirely when none did.
if experiment_results:
    print("\n📊 Experiment Comparison")
    print("========================")

    results_df = pd.DataFrame(experiment_results)
    display(results_df)

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    accuracy_ax, all_metrics_ax = axes

    # Left: accuracy per experiment, each bar annotated with its value
    accuracy_ax.bar(results_df['name'], results_df['accuracy'], color='skyblue')
    accuracy_ax.set_title('Accuracy Comparison')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.set_ylim(0, 1)
    for bar_index, accuracy in enumerate(results_df['accuracy']):
        accuracy_ax.text(bar_index, accuracy + 0.01, f'{accuracy:.3f}', ha='center')

    # Right: grouped bars, one group per experiment and one bar per metric
    metrics_cols = ['accuracy', 'precision', 'recall', 'f1_score']
    group_positions = range(len(results_df))
    bar_width = 0.2

    for metric_index, metric_name in enumerate(metrics_cols):
        shifted = [pos + bar_width * metric_index for pos in group_positions]
        all_metrics_ax.bar(shifted, results_df[metric_name],
                           bar_width, label=metric_name.title())

    all_metrics_ax.set_title('All Metrics Comparison')
    all_metrics_ax.set_ylabel('Score')
    # Centre each tick under its group of four bars
    all_metrics_ax.set_xticks([pos + bar_width * 1.5 for pos in group_positions])
    all_metrics_ax.set_xticklabels(results_df['name'])
    all_metrics_ax.legend()
    all_metrics_ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

    # The winner is the row with the highest F1 score
    best_row_label = results_df['f1_score'].idxmax()
    best_exp = results_df.loc[best_row_label]
    print(f"\n🏆 Best performing experiment: {best_exp['name']}")
    print(f"   📈 F1 Score: {best_exp['f1_score']:.3f}")
    print(f"   🎯 Accuracy: {best_exp['accuracy']:.3f}")

## 5. Background Job Queue Example

Enqueue a one-off training job and schedule recurring retraining for background execution using FlowerPower's JobQueueManager (both steps need a running Redis instance):

In [None]:
# Show two background-execution entry points of the pipeline manager:
# a one-off enqueue and a cron-based schedule. Each attempt is wrapped
# separately so a failure in one does not prevent the other from running.
print("🚀 Job Queue ML Training Example")
print("=================================")

# --- One-off training job -------------------------------------------------
try:
    enqueue_inputs = {
        "training_date": datetime.now().isoformat(),
        "test_size": 0.25,
        "model_params": {
            "n_estimators": 200,
            "max_depth": 10,
        },
    }
    job = project.pipeline_manager.enqueue(
        "customer_churn",
        inputs=enqueue_inputs,
        final_vars=["model_evaluation_report"],
        queue_name="ml_training",
    )

    print("✅ Training job enqueued!")
    print(f"🔧 Job ID: {job.id}")
    print(f"📋 Queue: {job.origin}")
    print("\n🚀 To process this job, start a worker:")
    print("   flowerpower job-queue start-worker --queue-names ml_training")

except Exception as queue_error:
    print(f"❌ Job queue error: {queue_error}")
    print("💡 This requires Redis to be running for background jobs")

# --- Recurring retraining -------------------------------------------------
try:
    # NOTE(review): training_date is evaluated once, when this cell runs;
    # presumably every scheduled run receives that same value - confirm.
    schedule_inputs = {
        "training_date": datetime.now().isoformat(),
        "test_size": 0.2,
        "retrain_model": True,
    }
    scheduled_job = project.pipeline_manager.schedule(
        "customer_churn",
        cron="0 2 * * 1",  # Every Monday at 2 AM
        inputs=schedule_inputs,
        final_vars=["model_evaluation_report"],
        queue_name="ml_training",
    )

    print("\n📅 Scheduled weekly model retraining!")
    print(f"🔧 Job ID: {scheduled_job.id}")
    print("⏰ Schedule: Every Monday at 2:00 AM")
    print("\n🚀 To process scheduled jobs, start a worker with scheduler:")
    print("   flowerpower job-queue start-worker --with-scheduler")

except Exception as schedule_error:
    print(f"❌ Scheduling error: {schedule_error}")
    print("💡 This requires Redis to be running for scheduled jobs")

## 6. Model Export and Results Analysis

Export trained models and analyze results for deployment:

In [None]:
# Run the pipeline with the export-related flags set, report what came back,
# and write a one-row CSV summary under outputs/.
export_result = project.pipeline_manager.run(
    "customer_churn",
    inputs={
        "training_date": datetime.now().isoformat(),
        "export_model": True,
        "save_predictions": True,
        "generate_report": True
    },
    final_vars=[
        "trained_model",
        "model_predictions",
        "model_evaluation_report"
    ]
)

print("📦 Model Export and Analysis")
print("=============================")

# Model export information
if "trained_model" in export_result:
    model_info = export_result["trained_model"]
    print("\n🤖 Trained Model:")
    print(f"   • Model type: {model_info['model_type']}")
    print(f"   • Model file: {model_info['model_file']}")
    print(f"   • Training time: {model_info['training_time']:.2f}s")
    print(f"   • Model size: {model_info.get('model_size', 'N/A')}")

# Predictions export
if "model_predictions" in export_result:
    predictions_info = export_result["model_predictions"]
    prediction_count = predictions_info['prediction_count']
    print("\n🎯 Predictions:")
    print(f"   • Predictions file: {predictions_info['predictions_file']}")
    print(f"   • Prediction count: {prediction_count}")
    # An empty prediction set would otherwise raise ZeroDivisionError here.
    if prediction_count:
        churn_rate = f"{predictions_info['churn_predictions'] / prediction_count:.1%}"
    else:
        churn_rate = "N/A"
    print(f"   • Churn rate: {churn_rate}")

# Evaluation report
if "model_evaluation_report" in export_result:
    evaluation = export_result["model_evaluation_report"]
    print("\n📊 Evaluation Report:")
    print(f"   • Report file: {evaluation['report_file']}")
    print(f"   • Model accuracy: {evaluation['model_metrics']['accuracy']:.3f}")
    print(f"   • Model F1 score: {evaluation['model_metrics']['f1_score']:.3f}")
    print(f"   • Ready for production: {evaluation['model_metrics']['accuracy'] > 0.8}")

# Create deployment summary.
# Missing outputs fall back to empty dicts / 0 so the summary can always be
# produced; the lookups are done once and reused below.
summary_metrics = export_result.get('model_evaluation_report', {}).get('model_metrics', {})
summary_predictions = export_result.get('model_predictions', {})
summary_accuracy = summary_metrics.get('accuracy', 0)
# One timestamp for the printed date, the CSV field and the file name, so the
# three always agree.
summary_time = datetime.now()

print("\n🚀 Deployment Summary")
print("=====================")
print(f"📅 Training date: {summary_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"📈 Model performance: {'Good' if summary_accuracy > 0.75 else 'Needs improvement'}")
print(f"✅ Ready for deployment: {summary_accuracy > 0.8}")

# Save summary to file
summary_data = {
    "training_timestamp": summary_time.isoformat(),
    "model_accuracy": summary_accuracy,
    "model_f1_score": summary_metrics.get('f1_score', 0),
    "total_predictions": summary_predictions.get('prediction_count', 0),
    "churn_predictions": summary_predictions.get('churn_predictions', 0)
}

summary_df = pd.DataFrame([summary_data])
summary_file = f"outputs/ml_training_summary_{summary_time.strftime('%Y%m%d_%H%M%S')}.csv"

# Best-effort save: a failure to write the file is reported, not raised.
try:
    os.makedirs("outputs", exist_ok=True)
    summary_df.to_csv(summary_file, index=False)
    print(f"\n💾 Training summary saved: {summary_file}")
except Exception as e:
    print(f"\n⚠️ Could not save summary: {e}")

print("\n🎉 ML Training Pipeline completed successfully!")
print("💡 Use the exported model for real-time churn prediction in production")

## 7. Additional Experiments

Try different aspects of the ML pipeline:

In [None]:
# Two further synchronous runs through the pipeline manager: one with a
# "simple_mode" flag, one with feature-engineering options. Each prints the
# feature and metric values it finds in the returned result.
print("🔧 Pipeline-Only Mode Experiment")
print("==================================")

# Direct, in-process execution via the pipeline manager
simple_inputs = {
    "training_date": datetime.now().isoformat(),
    "simple_mode": True,
    "test_size": 0.3,
}
simple_result = project.pipeline_manager.run(
    "customer_churn",
    inputs=simple_inputs,
    final_vars=["customer_features", "model_evaluation_report"],
)

print("✅ Pipeline-only execution completed")
print("💡 This mode doesn't require Redis - perfect for development!")

if "customer_features" in simple_result:
    features = simple_result["customer_features"]
    print(f"📊 Features processed: {features['feature_count']}")

if "model_evaluation_report" in simple_result:
    simple_report = simple_result["model_evaluation_report"]
    metrics = simple_report["model_metrics"]
    print(f"🎯 Quick model accuracy: {metrics['accuracy']:.3f}")

# Second run: pass feature-engineering options and a cross-validation flag
print("\n🛠️ Custom Feature Engineering")
print("==============================")

feature_engineering_options = {
    "polynomial_features": True,
    "interaction_features": True,
    "feature_selection_k": 15,
}
custom_inputs = {
    "training_date": datetime.now().isoformat(),
    "feature_engineering": feature_engineering_options,
    "cross_validation": True,
}
custom_feature_result = project.pipeline_manager.run(
    "customer_churn",
    inputs=custom_inputs,
    final_vars=["customer_features", "model_evaluation_report"],
)

print("✅ Custom feature engineering completed")

if "customer_features" in custom_feature_result:
    features = custom_feature_result["customer_features"]
    selected = features.get('selected_features', 'N/A')
    print(f"🔧 Enhanced features: {features['feature_count']}")
    print(f"⚡ Feature selection applied: {selected}")

if "model_evaluation_report" in custom_feature_result:
    custom_report = custom_feature_result["model_evaluation_report"]
    metrics = custom_report["model_metrics"]
    print(f"📈 Enhanced model accuracy: {metrics['accuracy']:.3f}")
    print(f"🎯 Enhanced F1 score: {metrics['f1_score']:.3f}")