# Project Details

```
Type: Image Classification
Dataset: cifar10
Model architecture: simple cnn, ResNet50, InceptionV3
```

### Install required Libraries

In [2]:
!pip install kfp==2.8.0




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Check the README for instructions on installing the additional components this Kubeflow pipeline needs: KServe, the Prometheus Pushgateway, MinIO (S3-compatible storage), and the cluster roles

# Project Walkthrough as follows

```
python 3.8.9
kfp version 2.8.0
kubernetes version 1.29.1
kubeflow pipeline version 2.2.0
```

**Pipeline components:**
- Data Preparation component     (Data downloading and preprocessing step)
- Training Model component     (Model training step using simple CNN, ResNet50, and InceptionV3)
  - Training Model using simple CNN architecture Component
  - Training Model using ResNet50 architecture Component
  - Training Model using InceptionV3 architecture Component
- Evaluating Trained Model component      (Evaluation step)
- Performance Check for best model to deploy Component    (Step that selects the best of the trained models)
- Deploy Model Component    (Deploys the best model)
- Monitor Model Component    (Monitors the deployed model)

**Arguments to define for the Kubeflow image-classification experiment pipeline:**
```
arguments={
        "epochs": 1, 
        "service_account_name" : "sa-minio-kserve", 
        "namespace" : "kubeflow", 
        "service_name" : "cifar10-service",
        "kserve_version" :"v1beta1"
        }
```

# Importing Required Libraries

In [3]:
import kfp
from kfp import dsl
# `kfp.v2` is a deprecated alias in the KFP 2.x SDK; import the same names from `kfp.dsl`.
from kfp.dsl import component, Input, Output, Dataset, Model, OutputPath, Metrics, ClassificationMetrics

  import kfp
  from kfp.v2.dsl import component, Input, Output, Dataset, Model, OutputPath, Metrics, ClassificationMetrics


# Pipeline Components

### Data Preparation Component

In [4]:
@component(packages_to_install=['tensorflow', 'numpy', 'matplotlib'])
def prepare_data(prepare_dataset: Output[Dataset]):
    """Download CIFAR-10, store its splits as .npy files and save a preview image.

    Args:
        prepare_dataset: Output dataset artifact. Its path is used as a directory
            that receives 'x_train.npy', 'y_train.npy', 'x_test.npy',
            'y_test.npy' and a 5x5 preview grid named 'dataset.png'.
    """
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import tensorflow as tf

    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.cifar10.load_data()

    out_dir = prepare_dataset.path
    os.makedirs(out_dir, exist_ok=True)

    # Persist every split under the file name the downstream components load.
    arrays_by_file = {
        'x_train.npy': train_images,
        'y_train.npy': train_labels,
        'x_test.npy': test_images,
        'y_test.npy': test_labels,
    }
    for file_name, array in arrays_by_file.items():
        np.save(os.path.join(out_dir, file_name), array)

    label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

    # Preview the first 25 training images with their class names.
    plt.figure(figsize=(10, 10))
    for position, (image, label) in enumerate(zip(train_images[:25], train_labels[:25]), start=1):
        plt.subplot(5, 5, position)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(image)
        # Each label is a one-element array, hence the extra index.
        plt.xlabel(label_names[label[0]])
    plt.savefig(out_dir + '/dataset.png')

    print("Data preparation is complete.")

  return component_factory.create_component_from_func(


### Training Model Component

#### Training Model using simple cnn architecture Component

In [5]:
@component(packages_to_install=['tensorflow', 'numpy', 'matplotlib', 'scikit-learn'])
def train_model_simple_cnn(input_data: Input[Dataset], trained_model_simple_cnn: Output[Model], train_simple_cnn_metrics: Output[Metrics], epochs: int = 10):
    """Train a small convolutional network on the prepared training split.

    Args:
        input_data: Dataset artifact directory holding 'x_train.npy' and 'y_train.npy'.
        trained_model_simple_cnn: Output model artifact. The model is saved under
            its '1' sub-directory and the accuracy plot is written next to it.
        train_simple_cnn_metrics: Output metrics artifact that receives the best
            training and validation accuracy seen during training.
        epochs: Maximum number of epochs; early stopping may end training sooner.
    """
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import tensorflow as tf
    from sklearn.model_selection import train_test_split

    keras = tf.keras
    layers = keras.layers

    images = np.load(os.path.join(input_data.path, 'x_train.npy'))
    labels = np.load(os.path.join(input_data.path, 'y_train.npy'))

    # Scale pixel intensities into the [0, 1] range.
    images = images.astype('float32') / 255.0

    # Hold out 20% of the training data for validation (fixed seed).
    train_images, val_images, train_labels, val_labels = train_test_split(
        images, labels, test_size=0.2, random_state=42
    )

    augmenter = keras.preprocessing.image.ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        zoom_range=0.1
    )
    augmenter.fit(train_images)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():
        cnn_layers = [
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            # No activation on the last layer: the loss below consumes raw logits.
            layers.Dense(10),
        ]
        model = keras.models.Sequential(cnn_layers)

        logits_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

        stop_on_plateau = keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        )

        history = model.fit(
            augmenter.flow(train_images, train_labels, batch_size=64),
            validation_data=(val_images, val_labels),
            epochs=epochs,
            callbacks=[stop_on_plateau],
        )

    # '1' is the model version directory.
    model.save(os.path.join(trained_model_simple_cnn.path, '1'))

    # Accuracy curves, stored alongside the saved model.
    for series in ('accuracy', 'val_accuracy'):
        plt.plot(history.history[series], label=series)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.legend(loc='lower right')
    plt.savefig(trained_model_simple_cnn.path + '/training_history.png')

    best_train_accuracy = max(history.history['accuracy'])
    best_val_accuracy = max(history.history['val_accuracy'])
    train_simple_cnn_metrics.log_metric("accuracy", best_train_accuracy)
    train_simple_cnn_metrics.log_metric("val_accuracy", best_val_accuracy)

    print("Model training is complete.")


#### Training Model using ResNet50 architecture Component

In [6]:
@component(packages_to_install=['tensorflow', 'numpy', 'matplotlib', 'scikit-learn'])
def train_model_resnet50(input_data: Input[Dataset], trained_model_resnet50: Output[Model], train_resnet50_metrics: Output[Metrics], epochs: int = 10):
    """Train a ResNet50-based classifier (no pretrained weights) on the training split.

    Args:
        input_data: Dataset artifact directory holding 'x_train.npy' and 'y_train.npy'.
        trained_model_resnet50: Output model artifact. The model is saved under
            its '1' sub-directory and the accuracy plot is written next to it.
        train_resnet50_metrics: Output metrics artifact that receives the best
            training and validation accuracy seen during training.
        epochs: Maximum number of epochs; early stopping may end training sooner.
    """
    import tensorflow as tf
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split

    x_train = np.load(os.path.join(input_data.path, 'x_train.npy'))
    y_train = np.load(os.path.join(input_data.path, 'y_train.npy'))

    # Normalize pixel values to be between 0 and 1
    x_train = x_train.astype('float32') / 255.0

    # Hold out 20% of the training data for validation (fixed seed).
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        zoom_range=0.1
    )

    datagen.fit(x_train)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    with strategy.scope():

        # ResNet50 backbone without its classification top, trained from scratch.
        base_model = tf.keras.applications.ResNet50(weights=None, include_top=False, input_shape=(32, 32, 3))

        x = base_model.output
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        x = tf.keras.layers.Dense(64, activation='relu')(x)
        predictions = tf.keras.layers.Dense(10, activation='softmax')(x)

        model = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)

        # The head already applies softmax, so the loss must consume probabilities.
        # from_logits=True would apply softmax a second time and distort the loss.
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        history = model.fit(datagen.flow(x_train, y_train, batch_size=64), validation_data=(x_val, y_val), epochs=epochs, callbacks=[early_stopping])

    # Save the model; '1' is the model version directory.
    model.save(os.path.join(trained_model_resnet50.path, '1'))

    # Accuracy curves, stored alongside the saved model.
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.legend(loc='lower right')
    plt.savefig(trained_model_resnet50.path + '/training_history.png')

    train_resnet50_metrics.log_metric("accuracy", max(history.history['accuracy']))
    train_resnet50_metrics.log_metric("val_accuracy", max(history.history['val_accuracy']))

    print("Model training is complete.")

#### Training Model using InceptionV3 Component

In [7]:
@component(packages_to_install=['tensorflow', 'numpy', 'matplotlib', 'scikit-learn'])
def train_model_inceptionv3(input_data: Input[Dataset], trained_model_inceptionv3: Output[Model], train_inceptionv3_metrics: Output[Metrics], epochs: int = 10):
    """Train an InceptionV3-based classifier (no pretrained weights) on the training split.

    Args:
        input_data: Dataset artifact directory holding 'x_train.npy' and 'y_train.npy'.
        trained_model_inceptionv3: Output model artifact. The model is saved under
            its '1' sub-directory and the accuracy plot is written next to it.
        train_inceptionv3_metrics: Output metrics artifact that receives the best
            training and validation accuracy seen during training.
        epochs: Maximum number of epochs; early stopping may end training sooner.
    """
    import tensorflow as tf
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split

    x_train = np.load(os.path.join(input_data.path, 'x_train.npy'))
    y_train = np.load(os.path.join(input_data.path, 'y_train.npy'))

    # Normalize pixel values to be between 0 and 1
    x_train = x_train.astype('float32') / 255.0

    # Hold out 20% of the training data for validation (fixed seed).
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        zoom_range=0.1
    )

    datagen.fit(x_train)

    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    print(f'Number of devices:{strategy.num_replicas_in_sync}')

    with strategy.scope():

        # The 32x32 inputs are resized to 75x75 inside the model before the backbone.
        inputs = tf.keras.layers.Input(shape=(32, 32, 3))
        resized_input = tf.keras.layers.Resizing(75, 75)(inputs)

        base_model = tf.keras.applications.InceptionV3(weights=None, include_top=False, input_tensor=resized_input)

        x = base_model.output
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        x = tf.keras.layers.Dense(64, activation='relu')(x)
        predictions = tf.keras.layers.Dense(10, activation='softmax')(x)

        model = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)

        # The head already applies softmax, so the loss must consume probabilities.
        # from_logits=True would apply softmax a second time and distort the loss.
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                      metrics=['accuracy'])

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        history = model.fit(datagen.flow(x_train, y_train, batch_size=64), validation_data=(x_val, y_val), epochs=epochs, callbacks=[early_stopping])

    # Save the model; '1' is the model version directory.
    model.save(os.path.join(trained_model_inceptionv3.path, '1'))

    # Accuracy curves, stored alongside the saved model.
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])
    plt.legend(loc='lower right')
    plt.savefig(trained_model_inceptionv3.path + '/training_history.png')

    train_inceptionv3_metrics.log_metric("accuracy", max(history.history['accuracy']))
    train_inceptionv3_metrics.log_metric("val_accuracy", max(history.history['val_accuracy']))

    print("Model training is complete.")


### Evaluating Trained Model Component

In [8]:
@component(packages_to_install=['tensorflow', 'numpy', 'scikit-learn', 'seaborn', 'matplotlib'])
def evaluate_model(input_data: Input[Dataset], input_model: Input[Model], classification_report_metrics: Output[ClassificationMetrics], evalution_metrics: Output[Metrics]):
    """Evaluate a trained model on the test split and log the results.

    Args:
        input_data: Dataset artifact directory holding 'x_test.npy' and 'y_test.npy'.
        input_model: Model artifact whose '1' sub-directory holds the saved model.
        classification_report_metrics: Output artifact that receives the confusion matrix.
        evalution_metrics: Output artifact that receives the weighted-average
            precision, recall and f1-score.
    """
    import tensorflow as tf
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    from sklearn.metrics import classification_report, confusion_matrix
    import seaborn as sns

    x_test = np.load(os.path.join(input_data.path, 'x_test.npy'))
    y_test = np.load(os.path.join(input_data.path, 'y_test.npy'))

    # Normalize pixel values to be between 0 and 1 (same scaling as in training)
    x_test = x_test.astype('float32') / 255.0

    model = tf.keras.models.load_model(os.path.join(input_model.path, '1'))  ## here '1' is version of model

    # argmax picks the top-scoring class for each test image.
    y_pred = model.predict(x_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = y_test.flatten()

    class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

    report = classification_report(y_true_classes, y_pred_classes, output_dict=True)
    cm = confusion_matrix(y_true_classes, y_pred_classes)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    # NOTE(review): this writes into the *input* model artifact directory; whether the
    # file is persisted back to artifact storage depends on the backend - confirm.
    plt.savefig(input_model.path + '/evaluation_metrics_and_confusion_matrix_graph.png')

    # Log the confusion matrix with both matrix and categories arguments
    classification_report_metrics.log_confusion_matrix(matrix=cm.tolist(), categories=class_names)

    # Log the weighted averages; the 'f1-score' key is read by the model-selection step.
    evalution_metrics.log_metric("precision", report['weighted avg']['precision'])
    evalution_metrics.log_metric("recall", report['weighted avg']['recall'])
    evalution_metrics.log_metric("f1-score", report['weighted avg']['f1-score'])

    print("\n---- Model Evaluation Complete ----")


### Performance Check for best model to deploy Component

In [9]:
@component()
def preformance_check(evaluation_simple_cnn_metrics: Input[Metrics],
                      evaluation_resnet50_metrics: Input[Metrics],
                      evaluation_inceptionv3_metrics: Input[Metrics],
                      deploy_model_metrics: Output[Metrics],
                      trained_model_simple_cnn: Input[Model],
                      trained_model_resnet50: Input[Model],
                      trained_model_inceptionv3: Input[Model],
                      deploy_model: Output[Model]):
    """Select the trained model with the highest weighted F1 score for deployment.

    The winning model's files are copied into the `deploy_model` output artifact
    and its evaluation metrics are copied into `deploy_model_metrics`. On a tie
    the earlier candidate wins (simple CNN, then ResNet50, then InceptionV3).

    Args:
        evaluation_simple_cnn_metrics: Evaluation metrics of the simple CNN model.
        evaluation_resnet50_metrics: Evaluation metrics of the ResNet50 model.
        evaluation_inceptionv3_metrics: Evaluation metrics of the InceptionV3 model.
        deploy_model_metrics: Output metrics artifact for the selected model.
        trained_model_simple_cnn: Trained simple CNN model artifact.
        trained_model_resnet50: Trained ResNet50 model artifact.
        trained_model_inceptionv3: Trained InceptionV3 model artifact.
        deploy_model: Output model artifact that receives the selected model's files.

    Raises:
        ValueError: If any metrics artifact has no 'f1-score' entry.
    """
    import os
    import shutil

    # (label, evaluation metrics, trained model) for every candidate, in priority order.
    candidates = [
        ('simple_cnn', evaluation_simple_cnn_metrics, trained_model_simple_cnn),
        ('resnet50', evaluation_resnet50_metrics, trained_model_resnet50),
        ('inceptionv3', evaluation_inceptionv3_metrics, trained_model_inceptionv3),
    ]

    # Fail with a clear message instead of a TypeError from comparing None values.
    for label, metrics_artifact, _ in candidates:
        if metrics_artifact.metadata.get('f1-score') is None:
            raise ValueError(f"Evaluation metrics for '{label}' have no 'f1-score' entry.")

    # Find model with maximum F1 score. The winner is bound to a fresh local name:
    # rebinding the `deploy_model` parameter would leave the real output artifact untouched.
    best_label, best_metrics, best_model = max(candidates, key=lambda entry: entry[1].metadata['f1-score'])

    deploy_model_metrics.metadata.update(best_metrics.metadata)

    # Copy the selected model's files into the output artifact directory so the
    # downstream deployment step receives an artifact that actually contains a model.
    if os.path.exists(deploy_model.path):
        shutil.rmtree(deploy_model.path)
    shutil.copytree(best_model.path, deploy_model.path)

    # Output the results
    print("Selected Model:", best_label)
    print("Deploy Model Metrics:", deploy_model_metrics.metadata)
    print("Deploy Model:", deploy_model.path)

    print("\n---- Models Preformance Check Complete ----")
    

### Deploy model using kserve Component

In [10]:
@component(packages_to_install=['kserve', 'kubernetes'])
def deploy_model(input_model: Input[Model], service_account_name:str="sa-minio-kserve", namespace:str= "kubeflow", service_name:str="weather-model", kserve_version:str="v1beta1"):
    """
    Create a KServe InferenceService that serves the given model through the
    TensorFlow predictor.

    Args:
        input_model: Model artifact to serve; its 'minio://' URI is rewritten to 's3://'.
        service_account_name: Service account set on the predictor spec.
        namespace: Namespace in which the InferenceService is created.
        service_name: Name of the InferenceService.
            NOTE(review): the default 'weather-model' looks carried over from another
            project; it is kept so existing callers are unaffected - confirm.
        kserve_version: KServe API version appended to the KServe API group.
    """
    from kubernetes import client, config
    from kserve import KServeClient
    from kserve import constants
    from kserve import V1beta1InferenceService
    from kserve import V1beta1InferenceServiceSpec
    from kserve import V1beta1PredictorSpec
    from kserve import V1beta1TFServingSpec

    # Rewrite the artifact URI from the 'minio://' scheme to the 's3://' scheme.
    uri = input_model.uri.replace('minio://', '')
    input_model_path = f"s3://{uri}"

    # Load Kubernetes credentials from inside the cluster.
    config.load_incluster_config()

    api_version = constants.KSERVE_GROUP + '/' + kserve_version

    isvc = V1beta1InferenceService(api_version=api_version,
                                   kind=constants.KSERVE_KIND,
                                   metadata=client.V1ObjectMeta(
                                       name=service_name, namespace=namespace, annotations={'sidecar.istio.io/inject':'false'}),
                                   spec=V1beta1InferenceServiceSpec(
                                   predictor=V1beta1PredictorSpec(
                                       service_account_name=service_account_name,
                                       tensorflow=(V1beta1TFServingSpec(
                                           storage_uri=input_model_path))))
    )

    # NOTE(review): presumably create() fails if an InferenceService with this name
    # already exists in the namespace (e.g. on a pipeline re-run) - confirm.
    kserve_client = KServeClient()
    kserve_client.create(isvc)

    print(f'Model deployed as an InferenceService: {service_name}')



### Monitor Deployed Model using Prometheus Component

In [11]:
@component(packages_to_install=['prometheus-client', 'matplotlib', 'psutil'])
def monitor_model():
    """
    Simulate model monitoring: on each of ten rounds, push a synthetic accuracy
    value together with the current CPU and memory utilisation to a Prometheus
    Pushgateway, then wait 30 seconds.
    """
    import time

    import psutil  # Source of the CPU / memory utilisation figures
    from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

    # One dedicated registry with a gauge per reported value.
    registry = CollectorRegistry()
    accuracy_metric = Gauge('model_performance', 'Track model performance', registry=registry)
    cpu_metric = Gauge('cpu_usage', 'Track CPU usage percentage', registry=registry)
    memory_metric = Gauge('memory_usage', 'Track memory usage percentage', registry=registry)

    rounds = 10
    pause_seconds = 30

    for step in range(rounds):
        # Synthetic accuracy that grows by one point per round.
        accuracy_metric.set(0.85 + 0.01 * step)

        # Current resource utilisation of the machine running this component.
        cpu_metric.set(psutil.cpu_percent())
        memory_metric.set(psutil.virtual_memory().percent)

        # Ship the whole registry to the Pushgateway, then wait for the next round.
        push_to_gateway('pushgateway:9091', job='model_monitoring', registry=registry)
        time.sleep(pause_seconds)

    print("Model performance and resource utilization metrics sent to Prometheus.")


# Combine all the above components into a Kubeflow pipeline

In [12]:
@dsl.pipeline(
    name='Image Classification Pipeline with Serving and Monitoring',
    description='An example pipeline to train, deploy, and monitor an image classifier on the CIFAR-10 dataset.'
)
def image_classification_pipeline(epochs: int = 10, service_account_name:str="sa-minio-kserve", namespace:str= "kubeflow", service_name:str="weather-model", kserve_version:str="v1beta1"):
    """Chain data preparation, three trainings, evaluation, selection, deployment and monitoring."""
    # Data preparation; every later stage consumes the same dataset artifact.
    data_task = prepare_data()
    dataset = data_task.outputs['prepare_dataset']

    # Train the three candidate architectures on the same data.
    cnn_train_task = train_model_simple_cnn(input_data=dataset, epochs=epochs)
    resnet_train_task = train_model_resnet50(input_data=dataset, epochs=epochs)
    inception_train_task = train_model_inceptionv3(input_data=dataset, epochs=epochs)

    cnn_model = cnn_train_task.outputs['trained_model_simple_cnn']
    resnet_model = resnet_train_task.outputs['trained_model_resnet50']
    inception_model = inception_train_task.outputs['trained_model_inceptionv3']

    # Evaluate each trained model with the shared evaluation component.
    cnn_eval_task = evaluate_model(input_data=dataset, input_model=cnn_model)
    resnet_eval_task = evaluate_model(input_data=dataset, input_model=resnet_model)
    inception_eval_task = evaluate_model(input_data=dataset, input_model=inception_model)

    # Pick the best model from the evaluation metrics.
    selection_task = preformance_check(
        evaluation_simple_cnn_metrics=cnn_eval_task.outputs['evalution_metrics'],
        evaluation_resnet50_metrics=resnet_eval_task.outputs['evalution_metrics'],
        evaluation_inceptionv3_metrics=inception_eval_task.outputs['evalution_metrics'],
        trained_model_simple_cnn=cnn_model,
        trained_model_resnet50=resnet_model,
        trained_model_inceptionv3=inception_model,
    )

    # Deploy the selected model, then start monitoring once deployment has run.
    serving_task = deploy_model(
        input_model=selection_task.outputs['deploy_model'],
        service_account_name=service_account_name,
        namespace=namespace,
        service_name=service_name,
        kserve_version=kserve_version,
    )
    monitoring_task = monitor_model().after(serving_task)

    # Caching is enabled for every step except deployment and monitoring.
    cached_tasks = (
        data_task,
        cnn_train_task,
        resnet_train_task,
        inception_train_task,
        cnn_eval_task,
        resnet_eval_task,
        inception_eval_task,
        selection_task,
    )
    for task in cached_tasks:
        task.set_caching_options(True)
    for task in (serving_task, monitoring_task):
        task.set_caching_options(False)

# Compile the Kubeflow pipeline and run it on the Kubeflow server

In [14]:

# Compile the pipeline definition into a YAML package
pipeline_compiler = kfp.compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=image_classification_pipeline,
    package_path='image_classification.yaml',
)

# Client used to submit the run
client = kfp.Client()

# Run-time arguments passed to the pipeline
run_arguments = {
    "epochs": 1,
    "service_account_name": "sa-minio-kserve",
    "namespace": "kubeflow",
    "service_name": "cifar10-service",
    "kserve_version": "v1beta1",
}

# Submit the run; the returned result is displayed as the cell output
client.create_run_from_pipeline_func(
    image_classification_pipeline,
    arguments=run_arguments,
    experiment_name='image_classification_experiment',
)


RunPipelineResult(run_id=db2f2799-ca05-4c3b-b67a-98bf82539f22)