# This notebook focuses on how to use the kaml package (`katonic.ml`) with the kfp SDK

In [2]:
!pip install kfp==1.8.22

Collecting kfp==1.8.22
  Downloading kfp-1.8.22.tar.gz (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.9/304.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py<2,>=0.9
  Downloading absl_py-1.4.0-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.5/126.5 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyYAML<7,>=5.3
  Downloading PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (738 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m738.9/738.9 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Downloading google_api_core-2.11.1-py3-none-any.whl (120 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.5/120.5 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google

In [29]:
from kfp import components
from kfp import dsl
from typing import List
import kfp
from typing import NamedTuple
from kfp.components import func_to_container_op, create_component_from_func

## Split data

- This method loads the diabetes dataset from `input_data` (for example `https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv`), splits it into `x_train, x_test, y_train, y_test`, and saves the train and test splits (features plus an `outcome` label column) as `train.csv` and `test.csv` under `data_path`
    parameters
    --------------
    input_data: str
        path or URL of the input CSV file
    data_path: str
        path where data need to be saved
        
    Returns
    -------------
    none

In [87]:
def split_data(input_data:str, data_path: str):
    """Split the input CSV into train/test sets and write them under ``data_path``.

    Reads ``input_data`` with ``pandas.read_csv``, separates the ``Outcome``
    column from the remaining columns, performs an 80/20 ``train_test_split``
    and writes ``train.csv`` and ``test.csv`` into ``data_path``. In both
    files the label is stored in a lower-case ``outcome`` column.

    Parameters
    ----------
    input_data : str
        Location of the source CSV; it must contain an ``Outcome`` column.
    data_path : str
        Directory that receives ``train.csv`` and ``test.csv``. It is created
        if it does not exist yet.

    Returns
    -------
    None
    """
    # Imports and installs live inside the function because it is turned into
    # a pipeline component with create_component_from_func.
    import os
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(input_data)
    x = df.drop(columns=['Outcome'], axis=1)
    y = df['Outcome']
    # No random_state is passed, so the split differs from run to run.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # assign() builds new frames instead of writing a column into the frames
    # returned by train_test_split, which may raise SettingWithCopyWarning.
    train = x_train.assign(outcome=y_train)
    test = x_test.assign(outcome=y_test)

    # to_csv does not create missing directories; make sure the target exists.
    if data_path:
        os.makedirs(data_path, exist_ok=True)

    train.to_csv(data_path+'/train.csv', index=False)
    test.to_csv(data_path+'/test.csv', index=False)

    

## Create classifier

- This method installs requirements for kaml package and the package itself, reads the train.csv and test.csv files, then creates a Classifier object by passing `x_train,x_test,y_train,y_test, exp_name` to the class, calls `LogisticRegression()` on it, and finally saves the clf object as a pkl file because objects can't be passed between components
    parameters
    ------------
    exp_name: str
        The experiment name to be created in kaml
        
    Returns
    -------------
    none

In [88]:
def create_classifier(exp_name: str):
    """Create the katonic Classifier for ``exp_name`` and pickle it.

    Loads the train/test CSV files from ``/kfs_private/data_dir``, separates
    the ``outcome`` label column from the features, selects the experiment
    with ``set_exp``, builds a ``Classifier``, calls its
    ``LogisticRegression()`` method and dumps the classifier object to
    ``/kfs_private/katflow_clf.pkl`` with joblib.

    Parameters
    ----------
    exp_name : str
        Experiment name passed to ``set_exp`` and to the ``Classifier``.

    Returns
    -------
    None
    """
    import os
    # Dependencies are installed at run time, before the imports that need them.
    os.system("pip install katonic numpy optuna==3.3.0 boto3==1.28.45 protobuf==3.20.1")
    os.system('pip install ipython joblib pandas')
    import pandas as pd
    from katonic.ml.classification import Classifier
    from katonic.ml.client import set_exp
    import joblib

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    # Features are every column except the label; the label is 'outcome'.
    features_train = train_df.drop(columns=['outcome'], axis=1)
    labels_train = train_df['outcome']
    features_test = test_df.drop(columns=['outcome'], axis=1)
    labels_test = test_df['outcome']

    set_exp(exp_name=exp_name)
    classifier = Classifier(features_train, features_test, labels_train, labels_test, exp_name)
    classifier.LogisticRegression()

    # Persist the object so later components can load it from the shared volume.
    joblib.dump(classifier, '/kfs_private/katflow_clf.pkl')


## decision tree
- This method installs requirements for kaml package and the package itself, then rebuilds the classifier from the saved `train.csv` and `test.csv` files and trains the `DecisionTreeClassifier`

    `train.csv` and `test.csv` must already have been written by the split step to use this method

In [89]:
def decision_tree(exp_name: str):
    """Run the Classifier's ``DecisionTreeClassifier`` for ``exp_name``.

    Loads the train/test CSV files from ``/kfs_private/data_dir``, separates
    the ``outcome`` label column from the features, selects the experiment
    with ``set_exp``, builds a ``Classifier`` and calls
    ``DecisionTreeClassifier`` with ``max_depth=8``, ``criterion='gini'`` and
    ``min_samples_split=3``.

    Parameters
    ----------
    exp_name : str
        Experiment name passed to ``set_exp`` and to the ``Classifier``.

    Returns
    -------
    None
    """
    import os
    # Dependencies are installed at run time, before the imports that need them.
    os.system("pip install katonic numpy optuna==3.3.0 boto3==1.28.45 protobuf==3.20.1")
    os.system('pip install ipython joblib pandas')
    import pandas as pd
    from katonic.ml.classification import Classifier
    from katonic.ml.client import set_exp
    import joblib  # imported as in the original; not used by this component

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    # Features are every column except the label; the label is 'outcome'.
    features_train = train_df.drop(columns=['outcome'], axis=1)
    labels_train = train_df['outcome']
    features_test = test_df.drop(columns=['outcome'], axis=1)
    labels_test = test_df['outcome']

    set_exp(exp_name=exp_name)
    classifier = Classifier(features_train, features_test, labels_train, labels_test, exp_name)

    tree_settings = dict(max_depth=8, criterion='gini', min_samples_split=3)
    classifier.DecisionTreeClassifier(**tree_settings)


## random forest
- This method installs requirements for kaml package and the package itself, then rebuilds the classifier from the saved `train.csv` and `test.csv` files and tunes `RandomForestClassifier` using a set of hyperparameters

    `train.csv` and `test.csv` must already have been written by the split step to use this method

In [90]:
def random_forest(exp_name: str):
    """Run the Classifier's tuned ``RandomForestClassifier`` for ``exp_name``.

    Loads the train/test CSV files from ``/kfs_private/data_dir``, separates
    the ``outcome`` label column from the features, selects the experiment
    with ``set_exp``, builds a ``Classifier`` and calls
    ``RandomForestClassifier`` with ``is_tune=True``, ``n_trials=5`` and a
    search space over ``n_estimators``, ``criterion``, ``min_samples_split``
    and ``min_samples_leaf``.

    Parameters
    ----------
    exp_name : str
        Experiment name passed to ``set_exp`` and to the ``Classifier``.

    Returns
    -------
    None
    """
    import os
    # Dependencies are installed at run time, before the imports that need them.
    os.system("pip install katonic numpy optuna==3.3.0 boto3==1.28.45 protobuf==3.20.1")
    os.system('pip install ipython joblib pandas')
    import pandas as pd
    from katonic.ml.classification import Classifier
    from katonic.ml.client import set_exp
    import joblib  # imported as in the original; not used by this component

    train_df = pd.read_csv('/kfs_private/data_dir/train.csv')
    test_df = pd.read_csv('/kfs_private/data_dir/test.csv')

    # Features are every column except the label; the label is 'outcome'.
    features_train = train_df.drop(columns=['outcome'], axis=1)
    labels_train = train_df['outcome']
    features_test = test_df.drop(columns=['outcome'], axis=1)
    labels_test = test_df['outcome']

    set_exp(exp_name=exp_name)
    classifier = Classifier(features_train, features_test, labels_train, labels_test, exp_name)

    # Search space handed to the tuner: one entry per hyperparameter.
    search_space = {
        'n_estimators': dict(low=80, high=120, step=10, type='int'),
        'criterion': dict(values=['gini', 'entropy'], type='categorical'),
        'min_samples_split': dict(low=2, high=5, type='int'),
        'min_samples_leaf': dict(low=1, high=5, type='int'),
    }
    classifier.RandomForestClassifier(is_tune=True, n_trials=5, params=search_space)

## show runs
- This method installs requirements for kaml package and the package itself, then clf object is read and different information about the experiment is printed along with some data about the runs that we performed earlier

    classifier object needs to be read to use this method

In [91]:
def show_runs(exp_name: str):
    """Print experiment metadata and a summary of the runs recorded for it.

    Loads the classifier object pickled at ``/kfs_private/katflow_clf.pkl``
    and prints its ``name``, ``location``, ``id`` and ``stage`` attributes,
    the number of rows returned by ``search_runs`` and the ``run_name``,
    ``start_time`` and ``status`` columns of that result.

    Parameters
    ----------
    exp_name : str
        Experiment name passed to ``set_exp`` and to the ``Classifier``.

    Returns
    -------
    None
    """
    import os
    os.system("pip install katonic numpy optuna==3.3.0 boto3==1.28.45 protobuf==3.20.1")
    os.system('pip install ipython joblib pandas')
    import pandas as pd
    from katonic.ml.classification import Classifier
    from katonic.ml.client import set_exp
    import joblib

    train = pd.read_csv('/kfs_private/data_dir/train.csv')
    test = pd.read_csv('/kfs_private/data_dir/test.csv')
    x_train, y_train = train.drop(columns=['outcome'], axis=1), train['outcome']
    x_test, y_test = test.drop(columns=['outcome'], axis=1), test['outcome']

    set_exp(exp_name=exp_name)
    # NOTE(review): this instance is replaced by the pickled one right below.
    # It is kept because any side effects of the constructor are not visible
    # here - confirm whether the construction can be dropped.
    clf = Classifier(
      x_train,
      x_test,
      y_train,
      y_test,
      exp_name)

    # The pickle is the one written by create_classifier in this pipeline;
    # joblib.load must not be pointed at untrusted files. The context manager
    # closes the file handle, which the bare open() call used to leak.
    with open('/kfs_private/katflow_clf.pkl', 'rb') as clf_file:
        clf = joblib.load(clf_file)

    exp_id = clf.id
    print("experiment name : ",clf.name)
    print("experiment location : ",clf.location)
    print("experiment id : ",clf.id)
    print("experiment status : ",clf.stage)
    df_runs = clf.search_runs(exp_id)
    print("Number of runs done : ", len(df_runs))
    print(df_runs[['run_name', 'start_time', 'status']])




In [92]:
def register_best_model(exp_name: str):
    """Register the run with the highest accuracy score as a model.

    Loads the classifier object pickled at ``/kfs_private/katflow_clf.pkl``,
    fetches the experiment's runs with ``search_runs``, ranks them by the
    ``metrics.accuracy_score`` column (highest first) and passes the top
    run's ``run_name`` and ``run_id`` to ``register_model``.

    Parameters
    ----------
    exp_name : str
        Experiment name passed to ``set_exp`` and to the ``Classifier``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``search_runs`` returns no runs, so there is nothing to register.
    """
    import os
    os.system("pip install katonic numpy optuna==3.3.0 boto3==1.28.45 protobuf==3.20.1")
    os.system('pip install ipython joblib pandas')
    import pandas as pd
    from katonic.ml.classification import Classifier
    from katonic.ml.client import set_exp
    import joblib

    train = pd.read_csv('/kfs_private/data_dir/train.csv')
    test = pd.read_csv('/kfs_private/data_dir/test.csv')
    x_train, y_train = train.drop(columns=['outcome'], axis=1), train['outcome']
    x_test, y_test = test.drop(columns=['outcome'], axis=1), test['outcome']

    set_exp(exp_name=exp_name)
    # NOTE(review): this instance is replaced by the pickled one right below.
    # It is kept because any side effects of the constructor are not visible
    # here - confirm whether the construction can be dropped.
    clf = Classifier(
      x_train,
      x_test,
      y_train,
      y_test,
      exp_name)

    # The pickle is the one written by create_classifier in this pipeline;
    # joblib.load must not be pointed at untrusted files. The context manager
    # closes the file handle, which the bare open() call used to leak.
    with open('/kfs_private/katflow_clf.pkl', 'rb') as clf_file:
        clf = joblib.load(clf_file)

    exp_id = clf.id
    print("experiment name : ",clf.name)
    print("experiment location : ",clf.location)
    print("experiment id : ",clf.id)
    print("experiment status : ",clf.stage)
    df_runs = clf.search_runs(exp_id)
    if len(df_runs) == 0:
        raise ValueError("No runs found for experiment id %s; nothing to register" % exp_id)

    # Highest accuracy first (rows without a score sort last). The previous
    # code sorted ascending and then used the label-based lookup [0], which
    # returns the row labelled 0 rather than the first row of the sorted frame.
    ranked_runs = df_runs.sort_values("metrics.accuracy_score", ascending=False)
    best_run = ranked_runs.iloc[0]  # positional: first row after sorting

    run_id = best_run["run_id"]
    run_name = best_run["run_name"]

    clf.register_model(
        model_name=run_name,
        run_id=run_id,
    )
    print("Best model run_id : ", run_id)
    print(ranked_runs[['run_name', 'start_time', 'status']])


## Pipeline creation
- In this method pipeline structure is created and volume is assigned to different components. `create_component_from_func` is used here to convert functions to components. add_pvolumes is used to attach private bucket to the component.

    `.after` is used to create dependencies between components so that they execute sequentially even though no data is passed directly from one component to the next

In [93]:
@dsl.pipeline()
def katflow_kfp(
        input_data: str
    ):
    """Split the data, train three models, list the runs and register the best one.

    Task order: split_data -> create_classifier -> (random_forest,
    decision_tree) -> show_runs -> register_best_model. Every task mounts the
    same PVC at /kfs_private; ordering is expressed with ``.after`` because
    the tasks exchange data through that volume, not through outputs.

    Args:
        input_data: Location of the input CSV handed to split_data.
    """
    # The splits must be written to the mounted private volume: every
    # downstream component reads /kfs_private/data_dir/{train,test}.csv.
    # The previous value ("/kfs_public/data_dir") pointed at a path that no
    # task in this pipeline mounts.
    data_path = "/kfs_private/data_dir"
    exp_name = 'diabetes_prediction3'
    private_pvc = "private-storage-08a7de14"

    def mount_private(task):
        """Attach the private PVC at /kfs_private and return the task."""
        return task.add_pvolumes({ '/kfs_private':dsl.PipelineVolume(pvc=private_pvc) })

    split_data_op = create_component_from_func(split_data)
    create_classifier_op = create_component_from_func(create_classifier)
    decision_tree_op = create_component_from_func(decision_tree)
    random_forest_op = create_component_from_func(random_forest)
    show_runs_op = create_component_from_func(show_runs)
    register_best_model_op = create_component_from_func(register_best_model)

    split_data_task = mount_private(split_data_op(input_data, data_path))

    create_classifier_task = mount_private(create_classifier_op(exp_name))
    create_classifier_task.after(split_data_task)

    # Both models only need the classifier step to have finished, so they
    # have no dependency on each other.
    random_forest_task = mount_private(random_forest_op(exp_name))
    decision_tree_task = mount_private(decision_tree_op(exp_name))
    random_forest_task.after(create_classifier_task)
    decision_tree_task.after(create_classifier_task)

    # show_runs waits for both models so that all runs are listed.
    show_runs_task = mount_private(show_runs_op(exp_name))
    show_runs_task.after(random_forest_task, decision_tree_task)

    register_best_model_task = mount_private(register_best_model_op(exp_name))
    register_best_model_task.after(show_runs_task)

## Compile and run pipeline
- Here the pipeline is compiled and starts running

In [94]:
from datetime import datetime
import uuid

EXPERIMENT_NAME = "katflow_kfp1"
pipeline_func = katflow_kfp
# input_data is read by pandas.read_csv inside the split_data component, so
# the location must be reachable from that component's container.
# Fixed typo: the directory was spelled "/kfs_pulic".
arguments={
    "input_data": "/kfs_public/diabetes.csv",
    }

# A uuid in the file name keeps repeated compilations from overwriting each other.
pipeline_filename = pipeline_func.__name__ + f'{uuid.uuid1()}.pipeline.yaml'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

client = kfp.Client()
experiment = client.create_experiment(EXPERIMENT_NAME)
# Timestamped run name; strftime already returns a str.
run_name = pipeline_func.__name__ + datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
client.upload_pipeline(pipeline_filename)
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)

ERROR:root:Failed to read a token from file '/var/run/secrets/kubeflow/pipelines/token' ([Errno 2] No such file or directory: '/var/run/secrets/kubeflow/pipelines/token').
