In [1]:
import kfp
from kfp.v2.dsl import (
    component,
    Output,
    OutputPath,
    Model,
    Input,
    InputPath,
    ParallelFor,
    pipeline,
    Condition
    )
from kfp.v2 import compiler



from google_cloud_pipeline_components import aiplatform as gcc_aip
from google.cloud import aiplatform as aip

In [2]:

import kfp.components as components

In [3]:
# Cloud Storage bucket and GCP project used by this notebook.
BUCKET_URI = 'gs://laah-playaip-20220822213353'
PROJECT_ID = 'laah-play'

# Location under the bucket handed to the pipeline job as its pipeline root.
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/bikes_weather"


In [4]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) captured once, when this cell runs.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

In [5]:
@component(
    packages_to_install=[
        "pandas",
        "google-cloud-aiplatform",
        "google-cloud-bigquery-storage",
        "google-cloud-bigquery",
        "pyarrow",
    ]
)
def preprocess(in_bigquery_projectid: str,
               in_bigquery_dataset: str,
               output_csv_path: OutputPath('CSV_DATASET')):
    """Export the churn table from BigQuery into a CSV pipeline artifact.

    Args:
        in_bigquery_projectid: Project the BigQuery client is created for.
        in_bigquery_dataset: Currently unused; the query below names its table
            explicitly.
        output_csv_path: Path the resulting CSV file is written to.
    """
    from google.cloud import bigquery
    import google.auth

    # Only the credentials are needed; the project comes from the component input.
    creds, _ = google.auth.default()
    client = bigquery.Client(project=in_bigquery_projectid, credentials=creds)

    # NOTE(review): the table is hard-coded and ignores in_bigquery_projectid /
    # in_bigquery_dataset. It is left as-is because the dataset named here
    # ("telco") differs from the pipeline's default dataset argument - confirm
    # which one is intended before parameterizing the query.
    query =     """
            SELECT * FROM `laah-play.telco.churn`
    """
    print(query)

    dataframe = client.query(query).to_dataframe()
    print(dataframe.head())

    # index=False: the positional index is not data and would otherwise be
    # written (and read back downstream) as an extra unnamed column.
    dataframe.to_csv(output_csv_path, index=False)
    print("done")

In [6]:
@component(
    packages_to_install=["pandas", "scikit-learn==1.0", "google-cloud-aiplatform"],
    base_image="python:3.7",
)
def train(in_experiment_name: str,
          in_experiment_training_set: str,
          in_vertexai_region: str,
          in_vertexai_projectid: str,
          in_csv_path: InputPath('CSV_DATASET'),
          model_type: str,
          saved_model: Output[Model]
          ):
    """Train one challenger classifier and record it as a Vertex AI experiment run.

    The CSV is label-encoded, split 70/30, min-max scaled, and used to fit the
    estimator selected by ``model_type``. The pickled model is uploaded next to
    the ``saved_model`` artifact location, and the run's parameters and
    evaluation metrics are logged to the experiment.

    Args:
        in_experiment_name: Vertex AI experiment the run is logged under.
        in_experiment_training_set: Logged as the ``training_set`` run parameter.
        in_vertexai_region: Region passed to ``aiplatform.init``.
        in_vertexai_projectid: Project passed to ``aiplatform.init``.
        in_csv_path: Path of the input CSV dataset.
        model_type: One of ``'svm'``, ``'random_forrest'`` or ``'decision_tree'``.
        saved_model: Output artifact whose path determines the upload location.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    import os
    import pickle
    import random
    from datetime import datetime

    import pandas as pd
    import sklearn.metrics as metrics
    from google.cloud import aiplatform
    from google.cloud import storage
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler

    # Resolve the estimator first so an unsupported model_type fails fast with a
    # clear message, before any data is loaded or an experiment run is created.
    if model_type == 'svm':
        from sklearn import svm
        model = svm.LinearSVC()
    elif model_type == 'random_forrest':  # spelling matches the value the pipeline passes
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
    elif model_type == 'decision_tree':
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier()
    else:
        raise ValueError(
            "Unsupported model_type {!r}; expected 'svm', 'random_forrest' or "
            "'decision_tree'".format(model_type)
        )

    # Random component of the run id; the id also carries a timestamp.
    idn = random.randint(0, 1000)

    df = pd.read_csv(in_csv_path)

    # Label-encode every string-typed column into integer codes.
    for c in df.columns:
        if df[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(df[c].values))
            df[c] = lbl.transform(df[c].values)
    print(df.head())  # To check if properly encoded

    # Only these columns are used as model inputs.
    feature_columns = [
        'Contract', 'tenure', 'TechSupport', 'OnlineSecurity', 'TotalCharges',
        'PaperlessBilling', 'DeviceProtection', 'Dependents', 'OnlineBackup',
        'SeniorCitizen', 'MonthlyCharges', 'PaymentMethod', 'Partner',
        'PhoneService',
    ]
    X = df[feature_columns]
    Y = df['Churn']

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=101
    )

    # Fit the scaler on the training split only, then apply it to both splits,
    # so that no statistics of the held-out rows leak into training.
    # NOTE(review): the scaler is not persisted with the model; anything calling
    # the pickled model must supply already-scaled features.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=feature_columns, index=X_test.index
    )

    aiplatform.init(
        project=in_vertexai_projectid,
        location=in_vertexai_region,
        experiment=in_experiment_name
    )

    run_id = f"run-{idn}-{datetime.now().strftime('%Y%m%d%H%M%S')}"

    # Using the run as a context manager guarantees it is ended, including when
    # training or logging raises.
    with aiplatform.start_run(run_id):
        model.fit(X_train, Y_train)

        artifact_filename = 'model.pkl'
        local_path = artifact_filename
        with open(local_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        # Turn the artifact's local /gcs/... path into a gs:// URI and place
        # model.pkl in the directory that contains the "saved_model" artifact.
        saved_model_path = os.path.join(
            saved_model.path.replace('/gcs', 'gs:/').replace("saved_model", ""),
            artifact_filename,
        )
        blob = storage.blob.Blob.from_string(saved_model_path, client=storage.Client())
        blob.upload_from_filename(local_path)

        predicted = model.predict(X_test)

        # Compute each metric once; the values are both printed and logged.
        accuracy = metrics.accuracy_score(Y_test, predicted)
        precision = metrics.precision_score(Y_test, predicted, average='macro')
        recall = metrics.recall_score(Y_test, predicted, average='macro')
        logloss = metrics.log_loss(Y_test, predicted)
        auc_roc = metrics.roc_auc_score(Y_test, predicted)

        print("accuracy: {}".format(accuracy))
        print("f1 score macro: {}".format(metrics.f1_score(Y_test, predicted, average='macro')))
        print("f1 score micro: {}".format(metrics.f1_score(Y_test, predicted, average='micro')))
        print("precision score: {}".format(precision))
        print("recall score: {}".format(recall))
        print("hamming_loss: {}".format(metrics.hamming_loss(Y_test, predicted)))
        print("log_loss: {}".format(logloss))
        print("zero_one_loss: {}".format(metrics.zero_one_loss(Y_test, predicted)))
        print("AUC&ROC: {}".format(auc_roc))
        print("matthews_corrcoef: {}".format(metrics.matthews_corrcoef(Y_test, predicted)))

        training_params = {
            'training_set': in_experiment_training_set,
            'model_type': model_type,
            'dataset_path': in_csv_path,
            'model_path': saved_model_path
        }

        training_metrics = {
            'model_accuracy': accuracy,
            'model_precision': precision,
            'model_recall': recall,
            'model_logloss': logloss,
            'model_auc_roc': auc_roc
        }

        aiplatform.log_params(training_params)
        aiplatform.log_metrics(training_metrics)

In [7]:
from typing import NamedTuple

@component(
   packages_to_install=["pandas", "google-cloud-aiplatform"]
)
def gate(in_experiment_name: str,
         in_experiment_training_set: str,
         in_vertexai_region: str,
         in_vertexai_projectid: str
        )-> NamedTuple(
           'winner_output',
            [
                ('experiment_info', str),
                ('is_current_champion', bool)
            ]
        ):
    """Choose the winning experiment run: best challenger versus current champion.

    The champion is looked up among registered models labelled with this
    experiment's name; challengers are the experiment runs whose
    ``training_set`` parameter equals ``in_experiment_training_set``. Runs are
    compared on ``metric.model_auc_roc``; the champion is kept unless the best
    challenger scores strictly higher.

    Args:
        in_experiment_name: Experiment to read runs from; also the label value
            used to find the champion model.
        in_experiment_training_set: Training-set tag identifying challenger runs.
        in_vertexai_region: Region passed to ``aiplatform.init``.
        in_vertexai_projectid: Project passed to ``aiplatform.init``.

    Returns:
        A ``winner_output`` named tuple with ``experiment_info`` (JSON string
        holding ``experiment_name`` and ``experiment_run_id`` of the winner) and
        ``is_current_champion`` (True when the existing champion was kept).

    Raises:
        RuntimeError: If no challenger run with the decision metric is found.
    """
    from google.cloud import aiplatform
    import json
    from collections import namedtuple

    aiplatform.init(
       project=      in_vertexai_projectid,
       location=     in_vertexai_region,
       experiment =  in_experiment_name
    )

    ## get vertex AI model object corresponding to <champion model> from ModelRegistry - use labels: experiment_name
    model_filter_str='labels.experiment_name="'+in_experiment_name+'"'
    print("Model filter string: "+model_filter_str)

    # Newest first, so models[0] is deterministically the latest registered model.
    models = aiplatform.Model.list(
        filter=model_filter_str,
        order_by="create_time desc"
    )

    champion_run_id = None
    if models:
        champion_model = models[0]
        print(champion_model.display_name)
        # A model without the run label cannot be matched to a run; it is then
        # treated as "no champion" instead of raising KeyError.
        champion_run_id = dict(champion_model.labels).get('experiment_run_id')

    ## fetch experiment run details for current <training set>:
    experiment_df = aiplatform.get_experiment_df()
    experiment_df = experiment_df[experiment_df.experiment_name == in_experiment_name]

    challengers_experiment_run_info = experiment_df[
        experiment_df["param.training_set"] == in_experiment_training_set
    ]

    print("Challengers:")
    print(challengers_experiment_run_info.to_string())

    decision_metric_name = "metric.model_auc_roc"

    ### fetch best experiment_run_id from challengers
    challenger_scores = challengers_experiment_run_info[decision_metric_name]
    best_challenger_experiment_run_info = challengers_experiment_run_info[
        challenger_scores == challenger_scores.max()
    ]

    # Empty when there are no challenger rows or none of them has the metric.
    if best_challenger_experiment_run_info.empty:
        raise RuntimeError(
            "No challenger run with metric '{}' found for training set '{}' in "
            "experiment '{}'".format(
                decision_metric_name, in_experiment_training_set, in_experiment_name
            )
        )

    print("Best challenger")
    print(best_challenger_experiment_run_info.to_string())

    # Resolve the champion's run; if it is not present in the experiment frame
    # there is nothing to compare against, so the best challenger wins.
    current_champion_experiment_run_info = None
    if champion_run_id is not None:
        champion_rows = experiment_df[experiment_df["run_name"] == champion_run_id]
        if not champion_rows.empty:
            current_champion_experiment_run_info = champion_rows

    winner_experiment_run_info = best_challenger_experiment_run_info
    winner_is_current_champion = False

    if current_champion_experiment_run_info is not None:
        ## Final: best_challenger vs champion
        challenger_score = best_challenger_experiment_run_info[decision_metric_name].values[0]
        champion_score = current_champion_experiment_run_info[decision_metric_name].values[0]
        # The champion stays unless the challenger is strictly better.
        if not challenger_score > champion_score:
            winner_experiment_run_info = current_champion_experiment_run_info
            winner_is_current_champion = True

    winner_experiment_info = {
           "experiment_name": winner_experiment_run_info["experiment_name"].values[0],
           "experiment_run_id": winner_experiment_run_info["run_name"].values[0]
    }

    print("winner:")
    print(winner_experiment_info)

    ##https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#pass-by-file
    winner_namedtuple = namedtuple('winner_output', ['experiment_info', 'is_current_champion'])

    return winner_namedtuple(json.dumps(winner_experiment_info), winner_is_current_champion)

In [8]:
# "typing" and "datetime" are standard-library modules and must not be installed
# from PyPI (the PyPI "typing" backport can shadow the built-in module).
# google-cloud-pipeline-components was only imported, never used.
@component(
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.7",
)
def deploy(in_experiment_name: str,
           in_experiment_training_set: str,
           in_vertexai_region: str,
           in_vertexai_projectid: str,
           eval_info: str, #evaluation_gate_task.outputs['experiment_info']
           in_vertex_serving_machine_type: str,
           in_vertex_serving_min_replicas: int,
           in_vertex_serving_max_replica: int
          ):
    """Register the winning run's model and deploy it to a new endpoint.

    Args:
        in_experiment_name: Experiment passed to ``aiplatform.init``.
        in_experiment_training_set: Unused by this component.
        in_vertexai_region: Region for the experiment run lookup and the endpoint.
        in_vertexai_projectid: Project for the run lookup, model and endpoint.
        eval_info: JSON string with ``experiment_name`` and ``experiment_run_id``
            of the winning run.
        in_vertex_serving_machine_type: Machine type of the deployed model.
        in_vertex_serving_min_replicas: Minimum replica count of the deployment.
        in_vertex_serving_max_replica: Maximum replica count of the deployment.
    """
    import json
    from datetime import datetime

    from google.cloud import aiplatform

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    aiplatform.init(
        project=in_vertexai_projectid,
        location=in_vertexai_region,
        experiment=in_experiment_name
    )

    # Parse the gate output once.
    winner_info = json.loads(eval_info)
    experiment = winner_info['experiment_name']
    run_name = winner_info['experiment_run_id']

    # Look the run up in the project/region this component was given rather than
    # in hard-coded ones.
    experiment_run = aiplatform.ExperimentRun(
        run_name=run_name,
        experiment=experiment,
        project=in_vertexai_projectid,
        location=in_vertexai_region,
    )
    results_dict = experiment_run.get_params()

    # The logged model_path points at .../model.pkl; the model upload needs the
    # directory that contains it.
    artifact_uri = results_dict['model_path'].replace("saved_model","").replace('model.pkl','')

    model = aiplatform.Model.upload(
        project=in_vertexai_projectid,
        display_name="model"+timestamp,
        artifact_uri=artifact_uri, # GCS location of model
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
        # The gate component finds the current champion through these labels;
        # without them no uploaded model is ever recognised as champion.
        labels={
            "experiment_name": experiment,
            "experiment_run_id": run_name,
        },
    )

    endpoint = aiplatform.Endpoint.create(
        display_name="pipelines"+timestamp,
        project=in_vertexai_projectid,
        location=in_vertexai_region)

    # Honour the serving configuration passed in instead of fixed values.
    model.deploy(
        endpoint=endpoint,
        deployed_model_display_name="model"+timestamp,
        machine_type=in_vertex_serving_machine_type,
        min_replica_count=in_vertex_serving_min_replicas,
        max_replica_count=in_vertex_serving_max_replica,
    )

In [9]:
# The decorator is referenced by its qualified name because the function below
# is itself called `pipeline`: once this cell has run, the bare name no longer
# refers to the imported decorator, and re-running the cell would fail.
@kfp.v2.dsl.pipeline(name="wf-churn")
def pipeline(
    in_bigquery_projectid: str = 'laah-play',
    in_bigquery_dataset: str = 'telcosandbox',
    in_corr_threshold: float = 0.05,
    in_experiment_name: str = "telcochurn10",
    in_experiment_training_set: str = "telcochurn",
    in_vertexai_projectid: str = "laah-play",
    in_vertexai_region: str = "us-central1",
    in_vertex_serving_machine_type: str = "n1-standard-4",
    in_vertex_serving_min_replicas: int = 1,
    in_vertex_serving_max_replicas: int = 2,
    values: list = ['svm', 'random_forrest', 'decision_tree']

):
    """Churn workflow: preprocess, train challengers, gate, conditionally deploy.

    Args:
        in_bigquery_projectid: Project passed to the preprocess step.
        in_bigquery_dataset: Dataset name passed to the preprocess step.
        in_corr_threshold: Not used by any step in this pipeline.
        in_experiment_name: Vertex AI experiment shared by train, gate and deploy.
        in_experiment_training_set: Training-set tag shared by train, gate and deploy.
        in_vertexai_projectid: Vertex AI project passed to train, gate and deploy.
        in_vertexai_region: Vertex AI region passed to train, gate and deploy.
        in_vertex_serving_machine_type: Machine type passed to the deploy step.
        in_vertex_serving_min_replicas: Minimum replicas passed to the deploy step.
        in_vertex_serving_max_replicas: Maximum replicas passed to the deploy step.
        values: Model types to train, one parallel train task per entry.
    """
    #### STEP1: PREPROCESSING
    staging_task = preprocess(in_bigquery_projectid,
                              in_bigquery_dataset
                             )

    ### STEP2: TRAIN CHALLENGERS
    with ParallelFor(values) as item:
        train_task = train(in_experiment_name,
                           in_experiment_training_set,
                           in_vertexai_region,
                           in_vertexai_projectid,
                           staging_task.output,
                           item
                          )

    #### STEP3: GATE - Identify best challenger and compare with current champion
    evaluation_gate_task = gate(in_experiment_name,
                                in_experiment_training_set,
                                in_vertexai_region,
                                in_vertexai_projectid).after(train_task)

    # NOTE(review): 'is_current_champion' is declared as a bool output but is
    # compared with the string "false" here - confirm this matches how the
    # installed KFP version serializes bool outputs.
    with Condition(
        evaluation_gate_task.outputs['is_current_champion'] == "false",
        name="deploy_new_champion"
    ):
        ### STEP 5&6 Register new Champion and deploy it to endpoint
        deploy_task = deploy(in_experiment_name,
                             in_experiment_training_set,
                             in_vertexai_region,
                             in_vertexai_projectid,
                             evaluation_gate_task.outputs['experiment_info'],
                             in_vertex_serving_machine_type,
                             in_vertex_serving_min_replicas,
                             in_vertex_serving_max_replicas
                            )


In [10]:
# Compile the pipeline function into the JSON spec file that the PipelineJob is
# later created from.
compiler.Compiler().compile(
    package_path="custom_model_training_spec.json",
    pipeline_func=pipeline,
)



In [11]:
# NOTE(review): the "cifar10_" prefix does not match the churn pipeline -
# presumably carried over from another sample; confirm before renaming.
DISPLAY_NAME = f"cifar10_{TIMESTAMP}"

# Pipeline job built from the compiled spec; artifacts go under PIPELINE_ROOT.
job = aip.PipelineJob(
    pipeline_root=PIPELINE_ROOT,
    template_path="custom_model_training_spec.json",
    display_name=DISPLAY_NAME,
)

In [12]:
# Run the pipeline job. In the captured output below this call ended with a
# RuntimeError reporting that the "for-loop-1" task failed.
job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/959641146622/locations/us-central1/pipelineJobs/wf-churn-20220921150726
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/959641146622/locations/us-central1/pipelineJobs/wf-churn-20220921150726')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/wf-churn-20220921150726?project=959641146622
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/959641146622/locations/us-central1/pipelineJobs/wf-churn-20220921150726 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/959641146622/locations/us-central1/pipelineJobs/wf-churn-20220921150726 current s

RuntimeError: Job failed with:
code: 9
message: "The DAG failed because some tasks failed. The failed tasks are: [for-loop-1].; Job (project_id = laah-play, job_id = 6180364755333545984) is failed due to the above error.; Failed to handle the job: {project_number = 959641146622, job_id = 6180364755333545984}"
