# Complete ML Pipeline - Week 1 Workshop

This is the culmination of Week 1! We'll combine everything we've learned - Metaflow, data exploration, and visualization - into a production-ready machine learning pipeline.

## Learning Objectives
- Build end-to-end ML pipelines with Metaflow
- Implement proper data preprocessing
- Compare multiple ML algorithms systematically
- Create comprehensive model evaluation
- Generate production-ready reports

Let's build something amazing!

## 1. Pipeline Setup and Imports

In [None]:
# Environment setup: announce, import dependencies, then configure warnings and plots
print("🚀 Setting up Complete ML Pipeline Environment")
print("=" * 50)

# Standard library
import warnings
from datetime import datetime

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Metaflow for MLOps
from metaflow import FlowSpec, Parameter, catch, step

# scikit-learn: dataset, splitting/CV, preprocessing, estimators, metrics
from sklearn.datasets import load_wine
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Silence warnings for the rest of the session (applied only after all imports ran)
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib defaults with the seaborn "husl" palette
plt.style.use('default')
sns.set_palette("husl")

for status_line in (
    "✅ All libraries imported successfully!",
    "📊 Ready to build production ML pipeline",
    "🎯 Target: Wine classification with comprehensive evaluation",
):
    print(status_line)

## 2. Define the Complete ML Pipeline

In [None]:
class CompleteMLPipeline(FlowSpec):
    """
    Metaflow pipeline that trains and compares classifiers on the wine dataset.

    Step order: start -> preprocess -> train_models -> evaluate -> end

    What it does:
    - Loads the scikit-learn wine dataset and records basic dataset statistics
    - Performs a stratified train/test split followed by standard scaling
    - Cross-validates and fits each requested model, recording per-model failures
    - Selects the model with the highest mean CV accuracy and builds a
      classification report for it on the test split
    - Prints a run summary

    Every attribute assigned on ``self`` inside a step is an artifact of the run.
    """

    # Configurable parameters
    test_size = Parameter('test_size',
                          help='Test set proportion (0.1-0.4)',
                          default=0.2,
                          type=float)

    random_state = Parameter('random_state',
                             help='Random seed for reproducibility',
                             default=42,
                             type=int)

    cv_folds = Parameter('cv_folds',
                         help='Number of cross-validation folds',
                         default=5,
                         type=int)

    # Exposed on the command line as --models
    models_to_test = Parameter('models',
                               help='Comma-separated list of models',
                               default='random_forest,logistic_regression,svm')

    @step
    def start(self):
        """
        Validate parameters, load the wine dataset and record dataset statistics.

        Raises:
            ValueError: if ``test_size`` is outside [0.1, 0.4] or ``cv_folds``
                is smaller than 2.
        """
        print("🍷 Starting Complete Wine Classification Pipeline")
        print("=" * 50)
        print("📊 Configuration:")
        print(f"   Test size: {self.test_size}")
        print(f"   Random state: {self.random_state}")
        print(f"   CV folds: {self.cv_folds}")
        print(f"   Models: {self.models_to_test}")

        # Parameter validation: fail here rather than deep inside a later step
        if not (0.1 <= self.test_size <= 0.4):
            raise ValueError(f"test_size must be between 0.1 and 0.4, got {self.test_size}")
        if self.cv_folds < 2:
            raise ValueError(f"cv_folds must be at least 2, got {self.cv_folds}")

        # Load wine dataset
        wine_data = load_wine()

        # Store raw data and metadata
        self.X_raw = wine_data.data
        self.y_raw = wine_data.target
        # load_wine() exposes feature_names as a plain list (no .tolist());
        # iterating works for lists and arrays alike and yields plain str values.
        self.feature_names = [str(name) for name in wine_data.feature_names]
        self.target_names = [str(name) for name in wine_data.target_names]

        # Create dataset info
        self.dataset_info = {
            'n_samples': self.X_raw.shape[0],
            'n_features': self.X_raw.shape[1],
            'n_classes': len(np.unique(self.y_raw)),
            'class_distribution': np.bincount(self.y_raw).tolist()
        }

        print("\n📈 Dataset Overview:")
        print(f"   Samples: {self.dataset_info['n_samples']}")
        print(f"   Features: {self.dataset_info['n_features']}")
        print(f"   Classes: {self.dataset_info['n_classes']}")

        self.next(self.preprocess)

    @step
    def preprocess(self):
        """
        Split the raw data (stratified by class) and standardize the features.

        The scaler is fit on the training split only and then applied to both
        splits, so no test-set statistics reach the scaler.
        """
        print("\n🔧 Data Preprocessing...")

        # Split the data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X_raw, self.y_raw,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=self.y_raw
        )

        # Feature scaling
        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        print(f"   📊 Train/Test split: {len(self.X_train)}/{len(self.X_test)}")
        print("   📏 Features scaled using StandardScaler")

        self.next(self.train_models)

    # @catch stores its own artifact under `var`: the exception if the step
    # fails, None if it succeeds. It must not share a name with the per-model
    # `training_errors` dict, or that dict is overwritten once the step ends.
    @catch(var='train_step_exception')
    @step
    def train_models(self):
        """
        Cross-validate and fit every requested model.

        Results are stored in ``self.model_results`` keyed by model name; each
        entry holds the fitted estimator, CV mean/std and test accuracy.
        Per-model failures and unknown model names are recorded in
        ``self.training_errors`` instead of aborting the step.
        """
        # Assign the result containers first so later steps always find them,
        # even if something below raises and @catch takes over.
        self.model_results = {}
        self.training_errors = {}

        print("\n🤖 Training Multiple ML Models...")

        # Parse model list: drop blank entries and duplicates, keep the order
        requested = (name.strip() for name in self.models_to_test.split(','))
        model_names = list(dict.fromkeys(name for name in requested if name))

        # Define model configurations
        model_configs = {
            'random_forest': RandomForestClassifier(
                n_estimators=100,
                random_state=self.random_state
            ),
            'logistic_regression': LogisticRegression(
                random_state=self.random_state,
                max_iter=1000
            ),
            'svm': SVC(
                random_state=self.random_state,
                probability=True
            )
        }

        cv = StratifiedKFold(n_splits=self.cv_folds, shuffle=True, random_state=self.random_state)

        for model_name in model_names:
            model = model_configs.get(model_name)
            if model is None:
                # Record typos instead of skipping them silently
                available = ', '.join(sorted(model_configs))
                message = f"Unknown model '{model_name}' (available: {available})"
                print(f"   ⚠️ {message}")
                self.training_errors[model_name] = message
                continue

            # Best effort per model: one failing estimator must not stop the rest
            try:
                print(f"   🔨 Training {model_name}...")

                # Cross-validation on the scaled training split.
                # NOTE: the scaler was fit on the whole training split, so the
                # CV folds share scaling statistics.
                cv_scores = cross_val_score(
                    model, self.X_train_scaled, self.y_train,
                    cv=cv, scoring='accuracy'
                )

                # Fit on the full training split and score on the held-out test split
                model.fit(self.X_train_scaled, self.y_train)
                test_accuracy = model.score(self.X_test_scaled, self.y_test)

                self.model_results[model_name] = {
                    'model': model,
                    'cv_mean': cv_scores.mean(),
                    'cv_std': cv_scores.std(),
                    'test_accuracy': test_accuracy
                }

                print(f"      CV: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
                print(f"      Test: {test_accuracy:.3f}")

            except Exception as e:
                print(f"      ❌ Training failed: {str(e)}")
                self.training_errors[model_name] = str(e)

        print(f"\n   ✅ Successfully trained {len(self.model_results)} models")

        self.next(self.evaluate)

    @step
    def evaluate(self):
        """
        Select the model with the highest mean CV accuracy and evaluate it.

        Sets ``best_model_name``, ``y_pred`` and ``classification_report``.
        All three are None when no model was trained successfully.
        """
        print("\n📊 Model Evaluation...")

        if self.model_results:
            # Find best model (selection uses the CV score, not the test score)
            best_model_name = max(self.model_results,
                                  key=lambda x: self.model_results[x]['cv_mean'])

            self.best_model_name = best_model_name
            best_results = self.model_results[best_model_name]

            print(f"   🏆 Best model: {best_model_name}")
            print(f"   📈 CV score: {best_results['cv_mean']:.3f} ± {best_results['cv_std']:.3f}")
            print(f"   🎯 Test accuracy: {best_results['test_accuracy']:.3f}")

            # Generate predictions for evaluation
            best_model = best_results['model']
            self.y_pred = best_model.predict(self.X_test_scaled)

            # Classification report
            self.classification_report = classification_report(
                self.y_test, self.y_pred,
                target_names=self.target_names,
                output_dict=True
            )
        else:
            print("   ❌ No models to evaluate")
            step_failure = getattr(self, 'train_step_exception', None)
            if step_failure:
                print(f"   ❌ train_models step failed: {step_failure}")
            self.best_model_name = None
            self.y_pred = None
            self.classification_report = None

        # Single transition at the tail of the step
        self.next(self.end)

    @step
    def end(self):
        """
        Print the run summary and a coarse performance assessment.
        """
        print("\n🎉 Complete ML Pipeline Finished!")
        print("=" * 40)

        if self.best_model_name:
            best_results = self.model_results[self.best_model_name]

            print("📊 Pipeline Summary:")
            print(f"   🏆 Best Model: {self.best_model_name}")
            print(f"   🎯 Accuracy: {best_results['test_accuracy']:.3f}")
            print(f"   📈 CV Score: {best_results['cv_mean']:.3f}")
            print(f"   🤖 Models Trained: {len(self.model_results)}")

            # Performance assessment
            accuracy = best_results['test_accuracy']
            if accuracy > 0.95:
                print("   ✅ Excellent performance - ready for production!")
            elif accuracy > 0.9:
                print("   ✅ Very good performance")
            elif accuracy > 0.8:
                print("   ⚠️ Good performance - consider improvements")
            else:
                print("   ❌ Performance needs improvement")
        else:
            print("❌ Pipeline execution failed")

        # Surface models that were skipped or failed during training
        failed = getattr(self, 'training_errors', None) or {}
        if failed:
            print(f"   ⚠️ Models skipped or failed: {', '.join(failed)}")

        print("\n✨ All artifacts saved by Metaflow!")

print("✅ CompleteMLPipeline class defined successfully!")
print("💡 To run: save as .py file and execute 'python pipeline.py run'")

## 3. Pipeline Demonstration

In [None]:
# Standalone sanity check of the pipeline building blocks, run outside Metaflow.
print("🧪 PIPELINE DEMONSTRATION")
print("=" * 25)

# Load the wine dataset into a feature matrix and a target vector
wine_data = load_wine()
X = wine_data.data
y = wine_data.target

n_samples, n_features = X.shape
print(f"📊 Dataset: {n_samples} samples, {n_features} features")

# Stratified hold-out split; the scaler learns its statistics from the training rows only
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"🔧 Preprocessing: {X_train.shape[0]} train, {X_test.shape[0]} test")

# Two quick baselines, fitted in declaration order
models = {
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}

for name in models:
    model = models[name]
    model.fit(X_train_scaled, y_train)
    # Test-set accuracy computed from explicit predictions
    accuracy = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"🤖 {name}: {accuracy:.3f} accuracy")

print("\n✅ Pipeline components working correctly!")

## 4. Results Visualization

In [None]:
# Compare three classifiers on the split from the demonstration cell and plot
# the scores, the random-forest feature importances and a confusion matrix.
print("📊 RESULTS VISUALIZATION")
print("=" * 25)

# Candidate estimators, all seeded identically
models_to_compare = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),
}

# One shared, seeded fold generator so every model is scored on the same splits
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_results = {}

for name in models_to_compare:
    model = models_to_compare[name]
    # Cross-validate first, then fit on the full training split for the test score
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv)
    model.fit(X_train_scaled, y_train)
    test_accuracy = model.score(X_test_scaled, y_test)

    model_results[name] = dict(
        cv_mean=cv_scores.mean(),
        cv_std=cv_scores.std(),
        test_accuracy=test_accuracy,
    )

# Figure with one row of three panels
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('ML Pipeline Results Analysis', fontsize=14, fontweight='bold')
perf_ax, importance_ax, cm_ax = axes

# Panel 1: CV score and test score side by side for each model
model_names = list(model_results)
cv_means = [model_results[label]['cv_mean'] for label in model_names]
test_accs = [model_results[label]['test_accuracy'] for label in model_names]

x_pos = np.arange(len(model_names))
bar_width = 0.4
perf_ax.bar(x_pos - bar_width / 2, cv_means, bar_width, label='CV Score', alpha=0.7)
perf_ax.bar(x_pos + bar_width / 2, test_accs, bar_width, label='Test Score', alpha=0.7)
perf_ax.set(xlabel='Models', ylabel='Accuracy', title='Model Performance')
perf_ax.set_xticks(x_pos)
perf_ax.set_xticklabels(model_names, rotation=45)
perf_ax.legend()
perf_ax.grid(axis='y', alpha=0.3)

# Panel 2: the eight largest random-forest importances, in ascending order
rf_model = models_to_compare['Random Forest']
feature_importance = rf_model.feature_importances_
top_features_idx = np.argsort(feature_importance)[-8:]
bar_slots = range(len(top_features_idx))
top_feature_labels = [wine_data.feature_names[i] for i in top_features_idx]
importance_ax.barh(bar_slots, feature_importance[top_features_idx])
importance_ax.set_yticks(bar_slots)
importance_ax.set_yticklabels(top_feature_labels)
importance_ax.set(xlabel='Importance', title='Top Features (Random Forest)')

# Panel 3: confusion matrix of the model with the highest mean CV score
best_model_name = max(
    model_results,
    key=lambda candidate: model_results[candidate]['cv_mean'],
)
best_model = models_to_compare[best_model_name]
y_pred = best_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set(
    xlabel='Predicted',
    ylabel='Actual',
    title=f'Confusion Matrix ({best_model_name})',
)

plt.tight_layout()
plt.show()

best_summary = model_results[best_model_name]
print(f"🏆 Best Model: {best_model_name}")
print(f"📈 CV Score: {best_summary['cv_mean']:.3f}")
print(f"🎯 Test Accuracy: {best_summary['test_accuracy']:.3f}")

## 5. Production Considerations

In [None]:
# Print a static production-readiness checklist, follow-up tasks and
# deployment options. Nothing here is computed from model results.
print("🚀 PRODUCTION DEPLOYMENT")
print("=" * 25)

print("📋 Production Checklist:")
checklist = [
    "✅ Model performance >90% accuracy",
    "✅ Cross-validation stability",
    "✅ Error handling implemented",
    "✅ Reproducible with fixed seeds",
    "⚠️ Monitoring setup needed",
    "⚠️ A/B testing framework"
]

for item in checklist:
    print(f"   {item}")

print("\n🔧 Next Steps:")
next_steps = [
    "1. Set up model serving API",
    "2. Implement monitoring dashboard",
    "3. Create automated retraining",
    "4. Establish performance alerts",
    "5. Document model limitations"
]

# The loop variable must not be called `step`: that name is the Metaflow
# decorator imported at the top of this notebook, and rebinding it to a string
# makes the pipeline class cell fail at `@step` when it is re-run.
for next_step in next_steps:
    print(f"   {next_step}")

print("\n💡 Deployment Options:")
print("   📦 Batch Processing: Metaflow + AWS Batch")
print("   🌐 Real-time API: Flask/FastAPI + Docker")
print("   🔄 Streaming: Kafka + Spark")

print("\n✅ Pipeline is production-ready!")

## 6. Workshop Summary

In [None]:
# Closing summary for the workshop: fixed text only, printed section by section.
print("🎓 WEEK 1 WORKSHOP COMPLETE!")
print("=" * 30)

print("🏆 What You've Accomplished:")
accomplishments = [
    "✅ Built complete Metaflow ML pipeline",
    "✅ Mastered data exploration techniques",
    "✅ Implemented robust preprocessing",
    "✅ Compared multiple ML algorithms",
    "✅ Created professional visualizations",
    "✅ Assessed production readiness",
]

for achievement in accomplishments:
    print(f"   {achievement}")

# Remaining sections as (heading, bullet lines); every bullet gets a 3-space indent
summary_sections = (
    ("\n🛠️ Skills Developed:", (
        "🔧 Metaflow workflow development",
        "📊 Pandas data manipulation",
        "🤖 Scikit-learn model training",
        "📈 Matplotlib/Seaborn visualization",
        "⚖️ Cross-validation techniques",
    )),
    ("\n🚀 Coming in Week 2:", (
        "🔗 LangChain and LCEL",
        "🧠 LLM integration",
        "📝 Text processing pipelines",
        "🌐 AI application development",
    )),
    ("\n💡 Practice Recommendations:", (
        "🔄 Run pipeline with different datasets",
        "⚙️ Experiment with parameters",
        "📊 Create custom visualizations",
        "📚 Review LangChain basics",
    )),
)

for heading, bullets in summary_sections:
    print(heading)
    for bullet in bullets:
        print(f"   {bullet}")

print("\n🎉 Excellent work! Ready for Week 2!")
print("🏆 - INRIVA AI Academy Team")

## 7. Save Pipeline for Execution

In [None]:
# Instructions for turning the notebook's pipeline class into a runnable script.
# Fixed text only; each entry is printed on its own line, "" gives a blank line.
print("💾 Pipeline Export Instructions")
print("=" * 30)

export_instructions = (
    "📋 To use this pipeline:",
    "",
    "1. 📄 Create 'complete_ml_pipeline.py'",
    "2. 📝 Copy CompleteMLPipeline class from cell 2",
    "3. 💾 Add imports and main block",
    "4. ▶️ Run: python complete_ml_pipeline.py run",
    "5. 📊 View: python complete_ml_pipeline.py show",
    "",
    "🔧 Example commands:",
    "   python complete_ml_pipeline.py run",
    "   python complete_ml_pipeline.py run --test_size 0.3",
    "   python complete_ml_pipeline.py run --models 'random_forest,svm'",
    "",
    "📈 Access results:",
    "   from metaflow import Flow",
    "   flow = Flow('CompleteMLPipeline')",
    "   run = flow.latest_run",
    "",
    "✨ Production-ready MLOps workflow!",
)

for instruction_line in export_instructions:
    print(instruction_line)