### Import Libraries

In [None]:
# import required libraries
import kfp
from typing import Any
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import sys
import os
import re

from pathlib import Path
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op

# Global paths: the notebook's working directory is the project root, and it
# is placed first on sys.path so that modules stored below it can be imported.
pth_project = Path.cwd()
sys.path.insert(0, pth_project.as_posix())

### YAML Parameters

In [None]:
# tag cell with parameters
# Empty placeholders for every notebook parameter. One plain assignment per
# line is kept on purpose so that parameter tooling can parse the cell.

# project / data location
PROJECT_ID = ''
DATASET_ID = ''
TABLE_ID = ''
RESOURCE_BUCKET = ''
FILE_BUCKET = ''
REGION = ''

# model / service identity
MODEL_ID = ''
STACK_NAME = ''
MODEL_NAME = ''
SERVICE_TYPE = ''
SERVICE_TYPE_NAME = ''

# pipeline layout
PIPELINE_TYPE = ''
PIPELINE_PATH = ''
UTILS_PATH = ''
MODEL_TYPE = ''

# file names
LOAD_SQL = ''
PREPROCESS_OUTPUT_CSV = ''
SAVE_FILE_NAME = ''
STATS_FILE_NAME = ''

In [None]:
# tag cell with parameters
# Concrete values for the training run. Lines marked "change" are the ones to
# edit when this notebook is adapted to another model.

# project / data location
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4'
DATASET_ID = 'shs_churn'
TABLE_ID = 'master_features_set_prospect_train_vw'
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4_shs_churn'  # change
REGION = 'northamerica-northeast1'

# model / service identity
MODEL_ID = '5023'  # change
STACK_NAME = 'shs_churn'
MODEL_NAME = 'shs_churn'  # change
SERVICE_TYPE = 'shs_churn'  # change
SERVICE_TYPE_NAME = 'shs-churn'  # change

# pipeline layout
PIPELINE_TYPE = 'training_pipeline'
PIPELINE_PATH = 'models/training_pipeline'  # change
UTILS_PATH = 'utils/resources'
MODEL_TYPE = 'churn'  # change

# file names
LOAD_SQL = 'load_train_data.sql'
PREPROCESS_OUTPUT_CSV = 'df_train.csv'
SAVE_FILE_NAME = 'df_test_exp.csv'
STATS_FILE_NAME = 'df_stats.csv'

### Pipeline Parameters

In [None]:
# Display names of the two pipelines; they must match the names in pulumi.yaml.
TRAINING_PIPELINE_NAME = f'{SERVICE_TYPE_NAME}-training-pipeline' # Same name as pulumi.yaml
SERVING_PIPELINE_NAME = f'{SERVICE_TYPE_NAME}-serving-pipeline' # Same name as pulumi.yaml
TRAINING_PIPELINE_DESCRIPTION = f'{SERVICE_TYPE_NAME}-training-pipeline'
SERVING_PIPELINE_DESCRIPTION = f'{SERVICE_TYPE_NAME}-serving-pipeline'

# Pipeline artifacts are written to the root of the file bucket.
PIPELINE_ROOT = f"gs://{FILE_BUCKET}"

# REGION is a notebook parameter. Re-assigning it unconditionally here would
# silently discard the supplied value, so the literal is only a fallback for
# the case where the parameter was left empty.
REGION = REGION or "northamerica-northeast1"

### Import kfp Components

In [None]:
def extract_dir_from_bucket(
    bucket: Any, local_path: Path, prefix: str, split_prefix: str = 'training_pipeline' 
):        
    """
    Download every file stored under a prefix of a bucket into a local directory.

    The destination of each blob is ``local_path`` joined with the part of the
    blob name that follows the last ``'<split_prefix>/'``. When the blob name
    does not contain that marker, the full blob name is used. Missing parent
    directories are created; existing files are overwritten by the download.

    Parameters:
      - bucket: The bucket object from which to download files. It must provide
        ``list_blobs(prefix=...)``; each returned blob must provide ``name`` and
        ``download_to_filename(path)``.
      - local_path: The local directory below which the files are written.
      - prefix: Only blobs whose name starts with this prefix are downloaded.
      - split_prefix: Path component stripped (together with everything before
        it) from the blob name to build the local relative path.
        Default is 'training_pipeline'.
    """
    marker = f'{split_prefix}/'
    for blob in bucket.list_blobs(prefix=prefix):
        # Names ending in '/' are skipped (presumably folder placeholder
        # objects with no content to download).
        if blob.name.endswith("/"):
            continue
        target = local_path / blob.name.split(marker)[-1]
        # Path.parent also works when the target has no directory component
        # (e.g. local_path=Path('.') and a top-level file), where slicing the
        # string at the last '/' would raise ValueError.
        target.parent.mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(target.as_posix())

In [None]:
# download pipeline kfp components locally
# Every blob under '<STACK_NAME>/<PIPELINE_PATH>/kfp_components' in the resource
# bucket is copied below pth_project. extract_dir_from_bucket is called with its
# default split_prefix ('training_pipeline'), so with a PIPELINE_PATH ending in
# 'training_pipeline' the files land in '<pth_project>/kfp_components/...'.
storage_client = storage.Client(PROJECT_ID)
bucket = storage_client.bucket(RESOURCE_BUCKET)
prefix_core = f'{STACK_NAME}/{PIPELINE_PATH}/kfp_components'
extract_dir_from_bucket(bucket, pth_project, prefix_core)

# import kfp components
# Keep these imports after the download above: the kfp_components package is
# presumably only present locally once it has been fetched, and it is found
# because pth_project was inserted at the front of sys.path.
# NOTE(review): upload_model is imported but not referenced in the visible part
# of this file - confirm whether it is still needed.
from kfp_components.run_sp import run_sp
from kfp_components.preprocess import preprocess
from kfp_components.train_and_save_model import train_and_save_model
from kfp_components.upload_model import upload_model

### Date Parameters

In [None]:
# The training pipeline is meant to run on the 3rd of every month.
# Intended window (per the original note): rolling 12 months ending on the last
# day of the month two months before the run, e.g. 2023-04-01 to 2024-03-31 for
# a run in early May 2024.
# NOTE(review): the code below does NOT produce that window. For a training
# date of 2024-05-01 it yields FROM_DATE='2023-04-01' and TO_DATE='2023-10-31'
# (a 7-month window). Confirm whether the comment or the code is the intended
# behaviour before changing either.

# set training date (2 days ago from today)
trainingDate = (date.today() - relativedelta(days=2))

# training date as compact (YYYYMMDD) and dashed (YYYY-MM-DD) strings
TRAIN_DATE = trainingDate.strftime('%Y%m%d')  # date.today().strftime('%Y%m%d')
TRAIN_DATE_DASH = trainingDate.strftime('%Y-%m-%d')

# original window definition:
#   FROM_DATE = first day of the training date's month, shifted back 13 months
#   TO_DATE   = first day of the training date's month, shifted back 6 months
#               and then 1 day (relativedelta applies months before days),
#               i.e. the last day of the month 7 months earlier
FROM_DATE = trainingDate.replace(day=1) + relativedelta(months=-13)
TO_DATE = trainingDate.replace(day=1) + relativedelta(months=-6, days=-1)

# both names are rebound from date objects to 'YYYY-MM-DD' strings
FROM_DATE = FROM_DATE.strftime('%Y-%m-%d')
TO_DATE = TO_DATE.strftime('%Y-%m-%d')


In [None]:
# Show the training window boundaries, one per line, for a quick visual check.
print(FROM_DATE, TO_DATE, sep='\n')

### Pipeline

In [None]:
# library imports
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
@dsl.pipeline(
    name=TRAINING_PIPELINE_NAME, 
    description=TRAINING_PIPELINE_DESCRIPTION
    )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET, 
        file_bucket: str = FILE_BUCKET
    ):
    # Training pipeline definition. Three component tasks run strictly in
    # sequence: run_sp -> preprocess -> train_and_save_model.
    #
    # NOTE(review): project_id, region, resource_bucket and file_bucket are
    # accepted as pipeline parameters, but the body reads the module-level
    # constants (PROJECT_ID, RESOURCE_BUCKET, ...) instead. Values supplied for
    # these parameters at submission time therefore have no effect on the tasks.
    import subprocess

    #### this code block is only for a personal workbench 

    # Access token of the active gcloud account. subprocess is used instead of
    # the IPython "!" shell magic so this function is plain Python and a failing
    # gcloud call raises CalledProcessError instead of yielding a bogus token.
    # NOTE(review): the token is handed to the components as an ordinary
    # argument, so it presumably ends up in the compiled pipeline definition -
    # confirm this is acceptable outside a personal workbench.
    token_str = subprocess.run(
        ['gcloud', 'auth', 'print-access-token'],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    if not token_str:
        raise RuntimeError('gcloud auth print-access-token returned no token')

    #### the end

    # ----- create training set --------
    run_sp_op = run_sp(from_date=FROM_DATE, 
                                to_date=TO_DATE, 
                                project_id=PROJECT_ID, 
                                dataset_id=DATASET_ID, 
                                token=token_str
                                )

    run_sp_op.set_memory_limit('32G')
    run_sp_op.set_cpu_limit('4')
    
    # ----- preprocessing train data --------
    preprocess_op = preprocess(project_id=PROJECT_ID,
                                dataset_id=DATASET_ID, 
                                table_id=TABLE_ID, 
                                file_bucket=FILE_BUCKET, 
                                resource_bucket=RESOURCE_BUCKET, 
                                stack_name=STACK_NAME, 
                                pipeline_path=PIPELINE_PATH,
                                utils_path=UTILS_PATH, 
                                model_type=MODEL_TYPE,
                                pipeline_type=PIPELINE_TYPE, 
                                load_sql=LOAD_SQL, 
                                preprocess_output_csv=PREPROCESS_OUTPUT_CSV, 
                                token=token_str
                                )

    preprocess_op.set_memory_limit('32G')
    preprocess_op.set_cpu_limit('8')
    
    # ----- train and save the model --------
    train_and_save_model_op = train_and_save_model(file_bucket=FILE_BUCKET, 
                                resource_bucket=RESOURCE_BUCKET,
                                stack_name=STACK_NAME,
                                service_type=SERVICE_TYPE, 
                                project_id=PROJECT_ID, 
                                dataset_id=DATASET_ID, 
                                model_type=MODEL_TYPE, 
                                preprocess_output_csv=PREPROCESS_OUTPUT_CSV , 
                                save_file_name=SAVE_FILE_NAME,
                                stats_file_name=STATS_FILE_NAME, 
                                pipeline_path=PIPELINE_PATH,
                                utils_path=UTILS_PATH, 
                                pipeline_type=PIPELINE_TYPE,
                                # token=token_str
                               )

    train_and_save_model_op.set_memory_limit('32G')
    train_and_save_model_op.set_cpu_limit('8')
    
    # The tasks exchange no outputs, so the execution order has to be declared
    # explicitly.
    preprocess_op.after(run_sp_op)
    train_and_save_model_op.after(preprocess_op)

### Run the Pipeline Job

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs
# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                    display_name=TRAINING_PIPELINE_NAME,
#                                    template_path="pipeline.json",
#                                    location=REGION,
#                                    enable_caching=False,
#                                    pipeline_root = PIPELINE_ROOT
#                                 )
# job.run(service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com")


In [None]:
import google.oauth2.credentials
import json
import subprocess

# Build credentials from the access token of the active gcloud account.
# subprocess replaces the IPython "!" shell magic so the cell is plain Python
# and a failing gcloud call raises CalledProcessError instead of passing its
# error output on as a token. `token` stays a list of output lines so that
# `token[0]` keeps working.
token = subprocess.run(
    ['gcloud', 'auth', 'print-access-token'],
    capture_output=True, text=True, check=True,
).stdout.splitlines()
if not token:
    raise RuntimeError('gcloud auth print-access-token returned no token')
CREDENTIALS = google.oauth2.credentials.Credentials(token[0])

# Compile the pipeline function into a job spec in the working directory.
compiler.Compiler().compile(
   pipeline_func=pipeline, package_path="pipeline.json"
)

# Create the pipeline job from the compiled spec, using the token credentials.
job = pipeline_jobs.PipelineJob(
   display_name=TRAINING_PIPELINE_NAME,
   template_path="pipeline.json",
   credentials = CREDENTIALS,
   pipeline_root = PIPELINE_ROOT,
   location=REGION,
   enable_caching=False # I encourage you to enable caching when testing as it will reduce resource use
)

# Submit the job; no service account is passed here.
job.run()