# ML Pipeline Test

This notebook demonstrates a complete machine learning pipeline that will be converted to Kubeflow Pipelines using the Pipeline Builder extension.

We'll build a simple iris classification model with the following steps:
1. Data loading and exploration
2. Data preprocessing 
3. Model training
4. Model evaluation
5. Results summary

In [2]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.3.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.3.0-cp313-cp313-win_amd64.whl (11.0 MB)
Downloading numpy-2.3.0-cp313-cp313-win_amd64.whl (12.7 MB)
   ---------------------------------------- 0.0/12.7 MB ? eta -:--:--
   ---------------------- ----------------- 7.1/12.7 MB 52.9 MB/s eta 0:00:01
   ---------------------------------------- 12.7/12.7 MB 47.7 MB/s eta 0:00:00
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: tzdata, numpy, pandas

   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ---------------------------------------- 0/3 [tzdata]
   ----

In [4]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.0-cp313-cp313-win_amd64.whl (10.7 MB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.15.3-cp313-cp313-win_amd64.whl (41.0 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn

   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- ----------------------------- 1/4 [scipy]
   ---------- --

In [6]:
!pip install matplotlib

Collecting matplotlib
  Using cached matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.4-cp313-cp313-win_amd64.whl.metadata (108 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.2.1-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.3-cp313-cp313-win_amd64.whl (8.1 MB)
Using cached contourpy-1.3.2-cp313-cp313-win_amd64.whl (223 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Downloading fonttools-4

In [1]:
# Data Loading and Initial Exploration
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Fetch the iris dataset bundle from scikit-learn
print(" Loading iris dataset...")
iris = load_iris()

# Assemble a single frame: the measurement columns come first, followed by
# the integer class code and its species name. The feature listing below
# relies on those two label columns being the last two.
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target,
    target_name=lambda frame: frame['target'].map(species_by_code),
)

feature_columns = df.columns[:-2].tolist()
print(" Dataset loaded successfully!")
print(f" Dataset shape: {df.shape}")
print(f" Features: {feature_columns}")
print(f" Target classes: {df['target_name'].unique()}")

# Descriptive statistics for the numeric columns
print("\n Dataset Overview:")
print(df.describe())

# Key counts kept as notebook-level variables
total_samples, num_features, num_classes = (
    len(df),
    len(iris.feature_names),
    len(iris.target_names),
)

print(f"\n Summary: {total_samples} samples, {num_features} features, {num_classes} classes")

 Loading iris dataset...
 Dataset loaded successfully!
 Dataset shape: (150, 6)
 Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
 Target classes: ['setosa' 'versicolor' 'virginica']

 Dataset Overview:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            

In [2]:
# Data Preprocessing and Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

print(" Starting data preprocessing...")

# Separate the integer class label from the measurement columns
y = df['target']
X = df.drop(columns=['target', 'target_name'])

print(f" Features shape: {X.shape}")
print(f" Target shape: {y.shape}")

# Hold out a stratified test split; the fixed seed keeps it repeatable
test_size = 0.2
random_state = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

print(f" Training set: {len(X_train)} samples")
print(f" Test set: {len(X_test)} samples")

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(" Feature scaling completed")

# Count missing cells across the whole frame
missing_values = df.isna().sum().sum()
print(f" Missing values: {missing_values}")

# Per-feature mean and standard deviation of the scaled training matrix
train_mean = X_train_scaled.mean(axis=0)
train_std = X_train_scaled.std(axis=0)

print(" Preprocessing completed successfully!")
print(f" Scaled features - Mean: {train_mean.round(3)}")
print(f" Scaled features - Std: {train_std.round(3)}")

# Compact record of how the data was split
preprocessing_summary = dict(
    train_samples=len(X_train),
    test_samples=len(X_test),
    features=X_train.shape[1],
    test_size_ratio=test_size,
)

 Starting data preprocessing...
 Features shape: (150, 4)
 Target shape: (150,)
 Training set: 120 samples
 Test set: 30 samples
 Feature scaling completed
 Missing values: 0
 Preprocessing completed successfully!
 Scaled features - Mean: [-0. -0.  0.  0.]
 Scaled features - Std: [1. 1. 1. 1.]


In [3]:
# Model Training
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time

print(" Starting model training...")

# Hyperparameters for each candidate classifier
rf_params = {
    'n_estimators': 100,
    'random_state': 42,
    'max_depth': 5
}

lr_params = {
    'random_state': 42,
    'max_iter': 1000
}

svm_params = {
    'random_state': 42,
    'kernel': 'rbf'
}

# One row per candidate: (registry key, display label, unfitted estimator).
# Adding a model means adding a row here instead of copying a whole
# fit-and-time stanza.
candidates = [
    ('RandomForest', 'Random Forest', RandomForestClassifier(**rf_params)),
    ('LogisticRegression', 'Logistic Regression', LogisticRegression(**lr_params)),
    ('SVM', 'SVM', SVC(**svm_params)),
]

# Fitted estimators and their fit durations (seconds), keyed by registry key
models = {}
training_times = {}

for name, label, estimator in candidates:
    print(f" Training {label}...")
    # perf_counter() is the monotonic, high-resolution clock meant for timing
    # short intervals; time.time() follows the system clock, which can be
    # adjusted mid-measurement and may be coarser on some platforms.
    start_time = time.perf_counter()
    estimator.fit(X_train_scaled, y_train)
    training_times[name] = time.perf_counter() - start_time
    models[name] = estimator
    print(f"    Training time: {training_times[name]:.3f}s")

# Keep the individual model names bound for any code that uses them directly
rf_model = models['RandomForest']
lr_model = models['LogisticRegression']
svm_model = models['SVM']

print(" All models trained successfully!")
print(f" Trained {len(models)} models: {list(models.keys())}")

# Quick training accuracy check. These scores are computed on the same rows
# the models were fitted on, so they only show that fitting worked.
train_accuracies = {}
for name, model in models.items():
    train_pred = model.predict(X_train_scaled)
    train_acc = accuracy_score(y_train, train_pred)
    train_accuracies[name] = train_acc
    print(f" {name} training accuracy: {train_acc:.4f}")

# On ties, max() keeps the first key in insertion (training) order
best_train_model = max(train_accuracies, key=train_accuracies.get)
print(f" Best training accuracy: {best_train_model} ({train_accuracies[best_train_model]:.4f})")

 Starting model training...
 Training Random Forest...
    Training time: 0.159s
 Training Logistic Regression...
    Training time: 0.019s
 Training SVM...
    Training time: 0.003s
 All models trained successfully!
 Trained 3 models: ['RandomForest', 'LogisticRegression', 'SVM']
 RandomForest training accuracy: 1.0000
 LogisticRegression training accuracy: 0.9583
 SVM training accuracy: 0.9750
 Best training accuracy: RandomForest (1.0000)


In [4]:
# Model Evaluation and Performance Analysis
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

print(" Starting model evaluation...")

# Score every trained model on the held-out test split
evaluation_results = {}

for name, model in models.items():
    print(f"\n Evaluating {name}...")

    # Predict on the scaled test features
    y_pred = model.predict(X_test_scaled)

    # Multi-class metrics; 'weighted' averages the per-class scores by support
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Store results alongside the fit duration recorded at training time
    evaluation_results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'training_time': training_times[name]
    }

    print(f"    Accuracy: {accuracy:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall: {recall:.4f}")
    print(f"    F1-Score: {f1:.4f}")

# Pick the model with the highest test accuracy; on ties max() keeps the
# first one in insertion order.
# NOTE(review): the winner is chosen on the same test split that is then
# reported, so best_accuracy is an optimistic estimate.
best_model_name = max(evaluation_results, key=lambda x: evaluation_results[x]['accuracy'])
best_model = models[best_model_name]
best_accuracy = evaluation_results[best_model_name]['accuracy']

print(f"\n Best Model: {best_model_name}")
print(f" Best Accuracy: {best_accuracy:.4f}")

# Detailed evaluation of best model
print(f"\n Detailed Classification Report for {best_model_name}:")
y_pred_best = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred_best, target_names=iris.target_names))

# Confusion matrix of the best model's test predictions
conf_matrix = confusion_matrix(y_test, y_pred_best)
print(f"\n Confusion Matrix for {best_model_name}:")
print(conf_matrix)

# Model comparison summary. The name column is sized to the longest model
# name (never narrower than 15 characters) so the "|" separators line up;
# a fixed width of 15 is too short for 'LogisticRegression'.
print(f"\n Model Comparison Summary:")
name_width = max([15, *(len(model_name) for model_name in evaluation_results)])
for name, results in evaluation_results.items():
    print(f"{name:{name_width}} | Acc: {results['accuracy']:.4f} | Time: {results['training_time']:.3f}s")

# Final metrics for pipeline output
final_accuracy = best_accuracy
final_model_name = best_model_name
total_models_trained = len(models)

 Starting model evaluation...

 Evaluating RandomForest...
    Accuracy: 0.9333
    Precision: 0.9333
    Recall: 0.9333
    F1-Score: 0.9333

 Evaluating LogisticRegression...
    Accuracy: 0.9333
    Precision: 0.9333
    Recall: 0.9333
    F1-Score: 0.9333

 Evaluating SVM...
    Accuracy: 0.9667
    Precision: 0.9697
    Recall: 0.9667
    F1-Score: 0.9666

 Best Model: SVM
 Best Accuracy: 0.9667

 Detailed Classification Report for SVM:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30


 Confusion Matrix for SVM:
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]

 Model Comparison Summary:
RandomForest    | Acc: 0.9333 | Time: 0.159s
LogisticRegression | Acc: 0.9333 | 

In [5]:
# Pipeline configuration: plain named values describing the run
print(" Setting up pipeline parameters...")

# Dataset selection and train/test split settings
dataset_name = "iris"
target_column = "target"
test_size_param = 0.2
random_state_param = 42

# Hyperparameters, one group per model family
rf_n_estimators = 100
rf_max_depth = 5
lr_max_iter = 1000
svm_kernel = "rbf"

# Evaluation settings
scoring_metric = "accuracy"
cv_folds = 5

# Descriptive metadata
pipeline_version = "1.0.0"
pipeline_description = "Iris classification pipeline with multiple models"
author = "Pipeline Builder Extension"

# Echo a subset of the settings, one per line
print(
    " Pipeline parameters configured:",
    f"    Dataset: {dataset_name}",
    f"    Target: {target_column}",
    f"    Test size: {test_size_param}",
    f"    Random state: {random_state_param}",
    f"    Version: {pipeline_version}",
    sep="\n",
)

 Setting up pipeline parameters...
 Pipeline parameters configured:
    Dataset: iris
    Target: target
    Test size: 0.2
    Random state: 42
    Version: 1.0.0


In [6]:
# Pipeline results: roll the per-model scores up into run-level metrics
print(" Generating pipeline metrics summary...")

# Run-level figures
pipeline_success = True
total_execution_time = sum(training_times.values())

# Test accuracy of every evaluated model, gathered once
model_accuracies = [scores['accuracy'] for scores in evaluation_results.values()]
avg_accuracy = np.mean(model_accuracies)
std_accuracy = np.std(model_accuracies)

# Gap between the strongest and the weakest model
accuracy_range = max(model_accuracies) - min(model_accuracies)

# Final metrics record; numeric values are converted to built-in floats
pipeline_metrics = dict(
    pipeline_success=pipeline_success,
    best_model=best_model_name,
    best_accuracy=float(best_accuracy),
    average_accuracy=float(avg_accuracy),
    accuracy_std=float(std_accuracy),
    accuracy_range=float(accuracy_range),
    total_models=total_models_trained,
    total_training_time=float(total_execution_time),
    dataset_size=total_samples,
    test_samples=len(X_test),
    num_features=num_features,
    num_classes=num_classes,
)

print(
    " Pipeline metrics computed:",
    f"    Best model: {best_model_name} ({best_accuracy:.4f})",
    f"    Average accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}",
    f"    Total training time: {total_execution_time:.3f}s",
    f"    Models trained: {total_models_trained}",
    f"    Accuracy range: {accuracy_range:.4f}",
    sep="\n",
)

# Quality gate: the best test accuracy must reach the threshold
success_threshold = 0.90
pipeline_passed = best_accuracy >= success_threshold
verdict = ' YES' if pipeline_passed else ' NO'

print("\n Pipeline Quality Assessment:")
print(f"   Success threshold: {success_threshold}")
print(f"   Pipeline passed: {verdict}")

closing_message = (
    " Pipeline completed successfully with high accuracy!"
    if pipeline_passed
    else " Pipeline completed but accuracy below threshold"
)
print(closing_message)

 Generating pipeline metrics summary...
 Pipeline metrics computed:
    Best model: SVM (0.9667)
    Average accuracy: 0.9444 ± 0.0157
    Total training time: 0.181s
    Models trained: 3
    Accuracy range: 0.0333

 Pipeline Quality Assessment:
   Success threshold: 0.9
   Pipeline passed:  YES
 Pipeline completed successfully with high accuracy!
