# This notebook focuses on how to use `kfp.dsl.Condition` in the kfp SDK

In [4]:
import kfp
from kfp import dsl
import kfp.components as comp
from kfp.components import func_to_container_op, create_component_from_func
from typing import NamedTuple

## load data

- This method loads the data from `https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv`, saves it
    to `storage_path+'/diabetes.csv'` and returns storage_path
    parameters
    -------------
    storage_path: str
        Path where the data needs to be saved
    
    Return
    --------------
    storage_path: str
        Path where the data was saved

In [5]:
import pandas
def load_data(storage_path:str) -> NamedTuple('Outputs', [('data_path', str)]):
    """Download the diabetes dataset and save it as ``diabetes.csv``.

    Args:
        storage_path: Directory in which ``diabetes.csv`` is written. The
            directory is created if it does not exist yet.

    Returns:
        A named tuple whose single field ``data_path`` is ``storage_path``.
    """
    # All imports are local: this function is turned into a pipeline component
    # with create_component_from_func, so it must be self-contained.
    import os
    from collections import namedtuple
    # Best-effort install; a failure here surfaces as an ImportError below.
    os.system("pip install pandas")
    import pandas as pd

    df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv")
    path = storage_path+'/diabetes.csv'
    # to_csv does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False)

    # Return the declared output as a named tuple instead of a bare list.
    outputs = namedtuple('Outputs', ['data_path'])
    return outputs(storage_path)

## split_data

- This method first reads the data from `data_path`, splits it into `x_train, x_test, y_train, y_test`, saves the result as
    `train.csv` and `test.csv` and returns data_path
    parameters
    -------------
    data_path: str
        Path where data is stored
    
    Return
    --------------
    data_path: str
        Path where data is stored

In [6]:

def split_data(data_path:str, random_state:int=42) -> NamedTuple('Outputs', [('data_path', str)]):
    """Split ``diabetes.csv`` into train and test sets and save both as CSV.

    Reads ``data_path + "/diabetes.csv"``, separates the ``Outcome`` column
    from the features, performs an 80/20 train/test split and writes
    ``train.csv`` and ``test.csv`` into ``data_path``. In both files the
    target is stored in a column named ``outcome``.

    Args:
        data_path: Directory that contains ``diabetes.csv`` and receives
            ``train.csv`` and ``test.csv``.
        random_state: Seed forwarded to ``train_test_split`` so that the
            split is reproducible.

    Returns:
        A named tuple whose single field ``data_path`` is ``data_path``.
    """
    # All imports are local: this function is turned into a pipeline component
    # with create_component_from_func, so it must be self-contained.
    import os
    from collections import namedtuple
    # Best-effort install; a failure here surfaces as an ImportError below.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(data_path+"/diabetes.csv")
    x = df.drop(columns=['Outcome'])
    y = df['Outcome']
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    # assign() builds new frames instead of writing a column into the frames
    # returned by the split, which avoids pandas' SettingWithCopyWarning.
    train = x_train.assign(outcome=y_train)
    test = x_test.assign(outcome=y_test)

    train.to_csv(data_path+'/train.csv', index=False)
    test.to_csv(data_path+'/test.csv', index=False)

    # Return the declared output as a named tuple instead of a bare list.
    outputs = namedtuple('Outputs', ['data_path'])
    return outputs(data_path)

## Train model

- This method reads `train.csv` and `test.csv` and converts them back into `x_train, x_test, y_train, y_test`,
    then adds 2 to depth, creates a DecisionTreeClassifier object, trains the model and returns the accuracy score and
    the updated depth value
    
    parameters
    -------------
    data_path: str
        Path where data is stored
    depth: int
        The max depth parameter of the DecisionTree
    
    Return
    --------------
    score: float
        accuracy score of test data
    depth: int
        The max depth parameter of the DecisionTree

In [7]:
def train_model(data_path:str, depth:int) -> NamedTuple('Outputs', [('score', float), ('depth', int)]):
    """Train a decision tree on ``train.csv`` and score it on ``test.csv``.

    The incoming ``depth`` is increased by 2 *before* training, so the tree is
    fitted with ``max_depth = depth + 2`` and that increased value is what is
    returned.

    Args:
        data_path: Directory that contains ``train.csv`` and ``test.csv``,
            each with the target in a column named ``outcome``.
        depth: Depth value from the previous step; the tree is trained with
            ``depth + 2``.

    Returns:
        A named tuple ``(score, depth)``: ``score`` is the value returned by
        the classifier's ``score`` method on the test data, and ``depth`` is
        the ``max_depth`` that was actually used.
    """
    # All imports are local: this function is turned into a pipeline component
    # with create_component_from_func, so it must be self-contained.
    import os
    from collections import namedtuple
    # Best-effort install; a failure here surfaces as an ImportError below.
    os.system("pip install pandas scikit-learn")
    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    train = pd.read_csv(data_path+'/train.csv')
    test = pd.read_csv(data_path+'/test.csv')
    x_train, y_train = train.drop(columns=['outcome']), train['outcome']
    x_test, y_test = test.drop(columns=['outcome']), test['outcome']

    next_depth = depth + 2
    tree = DecisionTreeClassifier(max_depth=next_depth)
    tree.fit(x_train, y_train)
    # Cast to a builtin float so the output matches the declared type even if
    # the estimator hands back a numpy scalar.
    score = float(tree.score(x_test, y_test))

    # Return the declared outputs as a named tuple instead of a bare list.
    outputs = namedtuple('Outputs', ['score', 'depth'])
    return outputs(score, next_depth)

## Create kfp component from functions
- This section creates a component from each function using create_component_from_func and stores it in a variable

In [8]:
# Turn each plain Python function into a pipeline component factory, in the
# same order as the pipeline steps: load -> split -> train.
load_data_op, split_data_op, train_model_op = map(
    create_component_from_func,
    (load_data, split_data, train_model),
)

## Defining the condition for recursion and configuring the component
- Here we use kfp.dsl.Condition to check whether the trained DecisionTreeClassifier reaches an accuracy of at least `70%`. If the accuracy is less than `70%`, train_model_recursion is called again with data_path and the updated depth value, and this keeps going until the condition is no longer satisfied, essentially ending the training cycle.

- Due to its nature, it ends up becoming a recursive function

In [9]:
@kfp.dsl.graph_component
def train_model_recursion(data_path:str, depth:int):
    # One training step with the private PVC mounted at /kfs_private.
    train_model_task = train_model_op(data_path, depth)
    private_volume = dsl.PipelineVolume(pvc="private-storage-f517")
    train_model_task.add_pvolumes({'/kfs_private': private_volume})

    score = train_model_task.outputs['score']
    updated_depth = train_model_task.outputs['depth']

    # Recurse with the updated depth while the score is below 0.70.
    # NOTE(review): the only stop condition is the score; there is no upper
    # bound on depth, so the loop runs until a model reaches 0.70.
    with dsl.Condition(score < 0.70):
        train_model_recursion(data_path, updated_depth)

## Configuring the kfp components
- Here the pipeline is designed, i.e. how it is going to behave. The `'/kfs_private'` volume is attached to each step, and train_model_recursion is called with data_path and depth. These two values need to be pipeline parameters here due to the nature of the function, otherwise there will be an error

In [10]:
@dsl.pipeline()
def demo_recursion(data_path:str = '/kfs_private/data_dir', depth:int=2):
    # Pipeline layout: load_data -> split_data -> train_model_recursion.
    # The load and split steps each get their own PipelineVolume for the same
    # PVC, mounted at the same path.
    pvc_name = "private-storage-f517"
    mount_point = '/kfs_private'

    load_data_task = load_data_op(data_path)
    load_data_task.add_pvolumes({mount_point: dsl.PipelineVolume(pvc=pvc_name)})

    split_input = load_data_task.outputs['data_path']
    split_data_task = split_data_op(split_input)
    split_data_task.add_pvolumes({mount_point: dsl.PipelineVolume(pvc=pvc_name)})

    # Hand the split step's output and the pipeline's depth parameter to the
    # recursive training sub-graph.
    train_model_recursion(split_data_task.outputs['data_path'], depth)
    

In [12]:
from datetime import datetime
import uuid
# Compile the pipeline into a uniquely named package, upload it and start a
# run of that package in the chosen experiment.
EXPERIMENT_NAME = "parallel"
kfp_endpoint = "http://ml-pipeline.kubeflow.svc.cluster.local:8888"
pipeline_func = demo_recursion
arguments = {}

# The uuid suffix gives every compiled package a distinct file name.
pipeline_filename = f'{pipeline_func.__name__}{uuid.uuid1()}.pipeline.zip'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

client = kfp.Client(host=kfp_endpoint)
experiment = client.create_experiment(name=EXPERIMENT_NAME)

# Run name = function name followed by a day-month-year-hour-minute-second stamp.
run_name = f'{pipeline_func.__name__}{datetime.now():%d-%m-%Y-%H-%M-%S}'
client.upload_pipeline(pipeline_filename)
run_result = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_filename,
    params=arguments,
)