# This notebook walks through a wine quality use case using different flavours of the kfp SDK.

In [1]:
from kfp import components
from kfp import dsl
from typing import List
import kfp
from typing import NamedTuple
from kfp.components import func_to_container_op, create_component_from_func

## Read and save data
- This method loads the data with `load_wine` from `sklearn.datasets` and saves it
    to `/kfs_private/data_dir` as `train.csv` and `test.csv`. The `@create_component_from_func` decorator is used to convert this function into a component.

In [2]:
@create_component_from_func
def get_data():
    """Load the sklearn wine dataset and save an 80/20 train/test split as CSV.

    The features returned by ``load_wine`` are wrapped in a DataFrame, the
    target is appended as a ``y`` column, and the two partitions are written
    to ``/kfs_private/data_dir/train.csv`` and
    ``/kfs_private/data_dir/test.csv``.
    """
    import os
    # Install the third-party packages at run time, before importing them.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split

    data_dir = '/kfs_private/data_dir'
    # Make sure the target directory exists so the first run on an empty
    # volume does not fail when writing the CSV files.
    os.makedirs(data_dir, exist_ok=True)

    x, y = load_wine(return_X_y=True)
    x = pd.DataFrame(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # assign() returns new frames, so the split results are not mutated.
    train = x_train.assign(y=y_train)
    test = x_test.assign(y=y_test)

    # Both files use the absolute path on the mounted volume; the original
    # wrote test.csv to a relative path, where the trainers could not find it.
    train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
    test.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

## Decision tree
- This method reads `train.csv` and `test.csv` from `/kfs_private/data_dir`, fits a `DecisionTreeClassifier` on the training data and prints its accuracy score on the test data.

In [3]:
@create_component_from_func  # decorator form: turns the function below into a component
def decision_tree():
    """Fit a DecisionTreeClassifier on train.csv and print its score on test.csv.

    Both files are read from ``/kfs_private/data_dir``; the ``y`` column is
    the label and every other column is used as a feature.
    """
    import os
    # Install the third-party packages at run time, before importing them.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    train_features = train_df.drop(columns=['y'], axis=1)
    train_labels = train_df['y']
    test_features = test_df.drop(columns=['y'], axis=1)
    test_labels = test_df['y']

    model = DecisionTreeClassifier()
    model.fit(train_features, train_labels)
    test_score = model.score(test_features, test_labels)
    print(test_score)

## Logistic regression
- This method reads `train.csv` and `test.csv` from `/kfs_private/data_dir`, fits a `LogisticRegression` model on the training data and prints its accuracy score on the test data.

In [4]:
@create_component_from_func
def logistic_regression():
    """Fit a LogisticRegression model on train.csv and print its score on test.csv.

    Both files are read from ``/kfs_private/data_dir``; the ``y`` column is
    the label and every other column is used as a feature.
    """
    import os
    # Install the third-party packages at run time, before importing them.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    train_features = train_df.drop(columns=['y'], axis=1)
    train_labels = train_df['y']
    test_features = test_df.drop(columns=['y'], axis=1)
    test_labels = test_df['y']

    model = LogisticRegression()
    model.fit(train_features, train_labels)
    test_score = model.score(test_features, test_labels)
    print(test_score)

## Random forest
- This method reads `train.csv` and `test.csv` from `/kfs_private/data_dir`, fits a `RandomForestClassifier` on the training data and prints its accuracy score on the test data.

In [5]:
@create_component_from_func
def random_forest():
    """Fit a RandomForestClassifier on train.csv and print its score on test.csv.

    Both files are read from ``/kfs_private/data_dir``; the ``y`` column is
    the label and every other column is used as a feature.
    """
    import os
    # Install the third-party packages at run time, before importing them.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    train_features = train_df.drop(columns=['y'], axis=1)
    train_labels = train_df['y']
    test_features = test_df.drop(columns=['y'], axis=1)
    test_labels = test_df['y']

    model = RandomForestClassifier()
    model.fit(train_features, train_labels)
    test_score = model.score(test_features, test_labels)
    print(test_score)

## Exit handler
- This method is used as the exit handler for the pipeline.

    That means it is executed at the end of the pipeline regardless of whether the pipeline ran successfully or not.

In [6]:
@create_component_from_func
def echo_msg(msg: str):
    """Print a message passed in as a parameter.

    Args:
        msg: Text to print to standard output.
    """
    print(msg)

## Pipeline creation
- This method is where the pipeline design is happening. All of the components are kept under dsl.ExitHandler, so that they get executed first then at the end exit_task gets executed. This exit handler can be used to do something that is necessary even if the pipeline fails.

- `.after()` is used on a component to create a dependency on another component, so that they do not execute in parallel but are executed sequentially, even when no data is passed between them.

- `.set_retry()` is used to re-run a component if it fails, because the failure can be due to some transient internal issue that a re-run resolves. Instead of re-running the entire pipeline, calling `.set_retry()` with the number of times to retry can save a lot of time and resources.

In [8]:
@dsl.pipeline(
    name='Wine quality pipeline',
    description='A pipeline that trains on wine quality dataset'
)
def wine_pipeline():
    """Wire the data-loading and training components into one pipeline.

    ``get_data`` runs first; the three training components are each ordered
    after it. Everything runs inside a ``dsl.ExitHandler`` whose exit task
    is ``echo_msg``.
    """

    def with_storage(task):
        # Mount the PVC at /kfs_private and call set_retry(2) so a failed
        # pod is re-executed instead of failing the pipeline right away.
        volume = dsl.PipelineVolume(pvc="private-storage-08a7de14")
        return task.add_pvolumes({'/kfs_private': volume}).set_retry(2)

    exit_task = echo_msg('Pipeline finished running.Exiting.....')
    # dsl.ExitHandler controls what happens at the end of the pipeline.
    with dsl.ExitHandler(exit_task):
        get_data_task = with_storage(get_data())
        # The trainers have no data dependency on get_data_task, so the
        # ordering is declared explicitly with .after().
        for trainer in (logistic_regression, random_forest, decision_tree):
            with_storage(trainer()).after(get_data_task)

In [9]:
import uuid
from datetime import datetime

EXPERIMENT_NAME = "wine_pipeline"
pipeline_func = wine_pipeline
arguments = {}

# Compile the pipeline function into a uniquely named package file.
pipeline_filename = f'{pipeline_func.__name__}{uuid.uuid1()}.pipeline.yaml'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

# Create the experiment, upload the compiled package and start a run whose
# name carries the current timestamp.
client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
timestamp = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
run_name = pipeline_func.__name__ + timestamp
client.upload_pipeline(pipeline_filename)
run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_filename,
    arguments,
)

ERROR:root:Failed to read a token from file '/var/run/secrets/kubeflow/pipelines/token' ([Errno 2] No such file or directory: '/var/run/secrets/kubeflow/pipelines/token').


In [20]:
import os
import uuid

# The version name has to be a string, so the UUID is converted explicitly.
# A version must also be attached to an existing pipeline, identified by
# either pipeline_id or pipeline_name.
# NOTE(review): assumes the pipeline uploaded earlier without an explicit
# name was registered under its package file name — confirm, or pass the
# pipeline_id returned by upload_pipeline instead.
client.upload_pipeline_version(
    pipeline_filename,
    str(uuid.uuid1()),
    pipeline_name=os.path.basename(pipeline_filename),
)