### Import Libraries

In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp


### YAML Parameters

In [None]:
# tag cell with parameters
# Placeholder values only; a later cell in this notebook assigns concrete
# project, dataset and bucket values.
PROJECT_ID = ""
DATASET_ID = ""
RESOURCE_BUCKET = ""
FILE_BUCKET = ""
REGION = ""

# Fully-qualified score table handed to the postprocess step.
UCAR_SCORE_TABLE = "bi-srv-mobilityds-pr-80a48d.ucar_ingestion.bq_product_instance_model_score"

MODEL_ID = "6535"
MODEL_NAME = "telus_postpaid_churn"

# Container image reference (split across two literals only for line length).
PREDICTION_IMAGE = (
    "northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/"
    "bi-platform/bi-aaaie/images/kfp-pycaret-slim:latest"
)


In [None]:
# tag cell with parameters
# Concrete project settings; these assignments replace any earlier values
# bound to the same names.
PROJECT_ID = "divg-groovyhoon-pr-d2eab4"
DATASET_ID = "telus_postpaid_churn"

# Both buckets point at the project's default bucket.
RESOURCE_BUCKET = "divg-groovyhoon-pr-d2eab4-default"
FILE_BUCKET = "divg-groovyhoon-pr-d2eab4-default"

MODEL_ID = "6535"
MODEL_NAME = "telus_postpaid_churn"

### Service Parameters

In [None]:
# Service identifiers: underscore form and hyphenated form of the same name.
SERVICE_TYPE = "telus_postpaid_churn"
SERVICE_TYPE_NAME = "telus-postpaid-churn"

REGION = "northamerica-northeast1"

# BigQuery table names used by this service.
TABLE_ID = "bq_telus_postpaid_churn_targets"
PROC_SERVING_DATASET_TABLE_NAME = "bq_tpc_serving_dataset_preprocessed"

### Pipeline Parameters

In [None]:
# Location of this pipeline's resources inside the resource bucket:
# <STACK_NAME>/<SERVING_PIPELINE_NAME_PATH>/...
STACK_NAME = "telus_postpaid_churn"
SERVING_PIPELINE_NAME_PATH = "telus_postpaid_churn_model/serving_pipeline"

# Name and description given to the KFP pipeline definition.
SERVING_PIPELINE_NAME = "telus-postpaid-churn-serving-pipeline"  # Same name as pulumi.yaml
SERVING_PIPELINE_DESCRIPTION = "telus-postpaid-churn-serving-pipeline"

# Pipeline artifacts are written to the root of the file bucket.
PIPELINE_ROOT = "gs://{}".format(FILE_BUCKET)
REGION = "northamerica-northeast1"

In [None]:
# BigQuery object names used by the serving pipeline.
SERVING_DATASET_TABLE_NAME = "bq_tpc_serving_dataset"
SERVING_DATASET_SP_NAME = "bq_sp_tpc_serving_dataset"
SCORE_TABLE_NAME = "bq_telus_postpaid_churn_scores"
TEMP_TABLE = "temp_telus_postpaid_churn_scores"

### Save Data Path Parameters

In [None]:
# CSV locations in the file bucket: gs://<FILE_BUCKET>/<SERVICE_TYPE>/<SERVICE_TYPE>_<suffix>.csv
TRAINING_SAVE_DATA_PATH = f'gs://{FILE_BUCKET}/{SERVICE_TYPE}/{SERVICE_TYPE}_train.csv'
VALIDATION_SAVE_DATA_PATH = f'gs://{FILE_BUCKET}/{SERVICE_TYPE}/{SERVICE_TYPE}_validation.csv'
SERVING_SAVE_DATA_PATH = f'gs://{FILE_BUCKET}/{SERVICE_TYPE}/{SERVICE_TYPE}_score.csv'

### Import Pipeline Components

In [None]:
# Download the serving pipeline's component modules from the resource bucket
# into the local "components/" directory so they can be imported below.
import importlib

prefix = f'{STACK_NAME}/{SERVING_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        continue  # name ends with "/": nothing to write as a file
    # Strip only the leading prefix. Splitting on `prefix` and taking the last
    # piece would drop directories if the prefix text recurs inside the name.
    file_path = f"{dl_dir}{blob.name[len(prefix):]}"
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# The module files above were created while the interpreter was already
# running; refresh the import system's caches so the imports below see them.
importlib.invalidate_caches()

# import main pipeline components
from components.bq_create_dataset import bq_create_dataset
from components.preprocess import preprocess
from components.batch_prediction import batch_prediction
from components.postprocess import postprocess
from components.load_ml_model import load_ml_model


### Import Pipeline Utils

In [None]:
# Download the serving pipeline's utility modules from the resource bucket
# into the local "utils/" directory so they can be imported below.
import importlib

prefix = f'{STACK_NAME}/{SERVING_PIPELINE_NAME_PATH}/utils/'
dl_dir = 'utils/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        continue  # name ends with "/": nothing to write as a file
    # Strip only the leading prefix. Splitting on `prefix` and taking the last
    # piece would drop directories if the prefix text recurs inside the name.
    file_path = f"{dl_dir}{blob.name[len(prefix):]}"
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# The module files above were created while the interpreter was already
# running; refresh the import system's caches so the imports below see them.
importlib.invalidate_caches()

from utils.monitoring import generate_data_stats
from utils.monitoring import validate_stats
from utils.monitoring import visualize_stats


### Date Parameters

In [None]:
# To score a fixed date instead, use e.g.: scoringDate = date(2023, 11, 1)
scoringDate = date.today() - timedelta(days=3)  # three days before today

# Anchors reused by the month-based windows below.
_month_start = scoringDate.replace(day=1)          # first day of the scoring month
_prev_month_end = _month_start - timedelta(days=1)  # last day of the previous month

# scoring dates (compact form and YYYY-MM-DD form)
SCORE_DATE = f"{scoringDate:%Y%m%d}"
SCORE_DATE_DASH = scoringDate.isoformat()

# First day of the month six months before the scoring month.
SCORE_DATE_MINUS_6_MOS_DASH = (_month_start - relativedelta(months=6)).isoformat()

# First and last day of the month preceding the scoring month.
SCORE_DATE_LAST_MONTH_START_DASH = _prev_month_end.replace(day=1).isoformat()
SCORE_DATE_LAST_MONTH_END_DASH = _prev_month_end.isoformat()

# Promo expiry window: first day of the month 3 and 4 months after the scoring month.
# NOTE(review): the original note here said "revert these changes after 2023-05-30";
# confirm whether this window is still the intended one.
PROMO_EXPIRY_START = (_month_start + relativedelta(months=3)).isoformat()
PROMO_EXPIRY_END = (_month_start + relativedelta(months=4)).isoformat()

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


In [None]:
# Echo the derived dates, one per line, as a quick sanity check.
print(scoringDate, SCORE_DATE, PROMO_EXPIRY_START, PROMO_EXPIRY_END, sep='\n')

### Model Monitoring Parameters

In [None]:
# Model-monitoring resource location names.
MODEL_MONITORING_STACK_NAME = "util"
MODEL_MONITORING_PATH = "pipeline_utils"

# Sub-paths of the training and serving pipelines.
TRAINING_PIPELINE_NAME_PATH = "telus_postpaid_churn_model/training_pipeline"
SERVING_PIPELINE_NAME_PATH = "telus_postpaid_churn_model/serving_pipeline"

In [None]:
today = date.today()

# GCS path of the serving-data CSV: gs://<FILE_BUCKET>/<SERVICE_TYPE>/<SERVICE_TYPE>_score.csv
INPUT_SERVING_DATA_CSV_PATH = f'gs://{FILE_BUCKET}/{SERVICE_TYPE}/{SERVICE_TYPE}_score.csv'

# BQ dataset where monitoring stats are stored
MODEL_MONITORING_DATASET = "telus_postpaid_churn"

# Object-name prefixes (relative to the bucket) for statistics and schema files
TRAINING_STATS_PREFIX = f"{STACK_NAME}/statistics/training_statistics"
SERVING_STATS_PREFIX = f"{STACK_NAME}/statistics/serving_statistics"
SCHEMA_PREFIX = f"{MODEL_NAME}/schemas/training_stats_schema"

# Today's statistics / anomalies artifacts in GCS (date-suffixed)
SERVING_STATISTICS_OUTPUT_PATH = f"gs://{FILE_BUCKET}/{SERVING_STATS_PREFIX}_{today}"
ANOMALIES_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/anomalies/anomalies_{today}"

# Paths to schemas in GCS (currently disabled)
# SCHEMA_PATH = f'gs://{FILE_BUCKET}/{MODEL_NAME}/schemas/training_stats_schema_{today}'
# STATISTICS_PATH = f'gs://{FILE_BUCKET}/{MODEL_NAME}/schemas/training_statistics_{today}'

# Thresholds for anomalies
# (same path structure as utils reading from bucket)
ANOMALY_THRESHOLDS_PATH = f"{STACK_NAME}/training_statistics/anomaly_thresholds.json"

# Filters for predictions monitoring
DATE_COL = 'partition_date'
DATE_FILTER = today.isoformat()
TABLE_BLOCK_SAMPLE = 1  # no sampling
ROW_SAMPLE = 1  # no sampling


### Check for existing stats files used to validate serving data and predictions

In [None]:
def get_latest_file_by_date(storage_client,
                        bucket_name,
                        prefix):
    """Find the most recently updated object under a prefix in a bucket.

    Args:
        storage_client: Client object exposing
            ``list_blobs(bucket_or_name=..., prefix=...)``; each returned blob
            must provide ``name`` and ``updated`` attributes.
        bucket_name: Name of the bucket to search. It is also used to build
            the returned ``gs://`` URI.
        prefix: Object-name prefix that candidate blobs must start with.

    Returns:
        A 3-tuple ``(latest_file_path, num_files, updated_str)``:
        the ``gs://<bucket_name>/<blob name>`` URI of the blob with the
        greatest ``updated`` value, the number of blobs that matched the
        prefix, and that blob's ``updated`` value formatted as
        ``'%Y-%m-%d %H:%M:%S'``. When nothing matches, ``('', 0, '')``.
    """
    # Materialize the listing once so it can be both counted and searched.
    matching_blobs = list(storage_client.list_blobs(bucket_or_name=bucket_name,
                                                    prefix=prefix))

    if not matching_blobs:
        return '', 0, ''

    # max() keeps the first blob among ties.
    latest_blob = max(matching_blobs, key=lambda blob: blob.updated)

    # Build the URI from the bucket that was actually searched, not from a
    # module-level constant.
    latest_file_path = f"gs://{bucket_name}/{latest_blob.name}"

    return (latest_file_path,
            len(matching_blobs),
            latest_blob.updated.strftime('%Y-%m-%d %H:%M:%S'))


In [None]:
import numpy as np
import logging 

# Locate the newest statistics / schema files in the file bucket so serving
# data and predictions can be compared against them.
storage_client = storage.Client()

# Stays False in this cell; only the prediction-stats flag is derived below.
previous_serving_stats_ind = False

# Latest file under TRAINING_STATS_PREFIX (f"{STACK_NAME}/statistics/training_statistics")
training_stats_path, num_training_stats_files, training_max_date = get_latest_file_by_date(
    storage_client=storage_client,
    bucket_name=FILE_BUCKET,
    prefix=TRAINING_STATS_PREFIX,
)

# Latest file under SERVING_STATS_PREFIX (f"{STACK_NAME}/statistics/serving_statistics")
previous_pred_stats_path, num_pred_stats_files, pred_max_date = get_latest_file_by_date(
    storage_client=storage_client,
    bucket_name=FILE_BUCKET,
    prefix=SERVING_STATS_PREFIX,
)

# Latest schema file under SCHEMA_PREFIX (generated by the training pipeline)
SCHEMA_PATH, num_schema_files, schema_max_date = get_latest_file_by_date(
    storage_client=storage_client,
    bucket_name=FILE_BUCKET,
    prefix=SCHEMA_PREFIX,
)

# True when at least one file exists under SERVING_STATS_PREFIX
previous_pred_stats_ind = num_pred_stats_files > 0

### Pipeline

In [None]:
# library imports
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
@dsl.pipeline(
    name=SERVING_PIPELINE_NAME, 
    description=SERVING_PIPELINE_DESCRIPTION
    )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET,
        file_bucket: str = FILE_BUCKET
    ):

    #### this code block is only for a personal workbench 
    
    import google.oauth2.credentials
    token = !gcloud auth print-access-token
    token_str = token[0]
    
    #### the end
    
    from datetime import datetime
    update_ts = datetime.now()
    update_ts_str = update_ts.strftime('%Y-%m-%d %H:%M:%S')
    
    # ----- create training set --------
    bq_create_scoring_dataset_op = bq_create_dataset(score_date=SCORE_DATE_DASH
                                  , score_date_delta=SCORE_DATE_DELTA
                                  , project_id=PROJECT_ID
                                  , dataset_id=DATASET_ID
                                  , region=REGION
                                  , environment='serving'
                                  , token=token_str
                                  )
    
    # ----- preprocessing train data --------
    preprocess_scoring_op = preprocess(pipeline_dataset=SERVING_DATASET_TABLE_NAME
                                    , save_data_path=SERVING_SAVE_DATA_PATH
                                    , project_id=PROJECT_ID
                                    , dataset_id=DATASET_ID
                                    , score_date_dash=SCORE_DATE_DASH
                                    )

    preprocess_scoring_op.set_memory_limit('32G')
    preprocess_scoring_op.set_cpu_limit('4')
    
    load_ml_model_op = load_ml_model(project_id = PROJECT_ID
                                    , region = REGION
                                    , model_name = MODEL_NAME
                                    )

    load_ml_model_op.set_memory_limit('32G')
    load_ml_model_op.set_cpu_limit('4')

    batch_prediction_op = batch_prediction(project_id=PROJECT_ID
                                        , dataset_id=DATASET_ID
                                        , table_id=PROC_SERVING_DATASET_TABLE_NAME
                                        , file_bucket=FILE_BUCKET
                                        , save_data_path=SERVING_SAVE_DATA_PATH
                                        , service_type=SERVICE_TYPE
                                        , score_date_dash=SCORE_DATE_DASH
                                        , score_table=SCORE_TABLE_NAME
                                        , temp_table=TEMP_TABLE
                                        , model_uri=load_ml_model_op.outputs['model_uri']
                                        )
    
    batch_prediction_op.set_memory_limit('32G')
    batch_prediction_op.set_cpu_limit('4')
    
    postprocessing_op = postprocess(project_id=PROJECT_ID
                                    , file_bucket=FILE_BUCKET
                                    , dataset_id=DATASET_ID
                                    , service_type=SERVICE_TYPE
                                    , score_date_dash=SCORE_DATE_DASH
                                    , temp_table=TEMP_TABLE
                                    , ucar_score_table=UCAR_SCORE_TABLE
                                    , token=token_str
                                    ) 
    
    postprocessing_op.set_memory_limit('32G')
    postprocessing_op.set_cpu_limit('4')

#     # generate statistics
#     generate_serving_data_stats_op = generate_data_stats(project_id=PROJECT_ID
#                                                         , bucket_nm=FILE_BUCKET
#                                                         , data_type = 'csv'
#                                                         , op_type = 'serving'
#                                                         , model_nm = MODEL_NAME
#                                                         , update_ts = update_ts_str
#                                                         , token = token_str
#                                                         , dest_stats_gcs_path = SERVING_STATISTICS_OUTPUT_PATH
#                                                         , src_csv_path = INPUT_SERVING_DATA_CSV_PATH
#                                                         , table_block_sample = TABLE_BLOCK_SAMPLE
#                                                         , row_sample = ROW_SAMPLE
#                                                         , in_bq_ind = True
#                                                         , dest_stats_bq_dataset = MODEL_MONITORING_DATASET
#                                                         , pass_through_features = ['ban']
#                                                         ).set_display_name("generate-serving-data-statistics")

#     generate_serving_data_stats_op.set_memory_limit('32G')
#     generate_serving_data_stats_op.set_cpu_limit('4')
    
#     # compare to training data
#     # visualize serving statistics
#     visualize_serving_stats_op = visualize_stats(statistics = generate_serving_data_stats_op.outputs["statistics"]
#                                                 , stats_nm=f"Serving Statistics {update_ts_str}"
#                                                 , base_stats_path = training_stats_path # This should be previous_serving_stats_path
#                                                 , base_stats_nm=f"Training Statistics {training_max_date}"
#                                                 , op_type = 'serving'
#                                                 ).set_display_name("visualize-serving-data-statistics")
    
#     # validate stats + find anomalies by comparing with training stats
#     validate_serving_stats_op = validate_stats(project_id = PROJECT_ID
#                                             , bucket_nm = FILE_BUCKET # anomaly_thresholds.json is stored in resources bucket via github
#                                             , model_nm = MODEL_NAME
#                                             , update_ts = update_ts_str
#                                             , op_type = 'serving'
#                                             , validation_type = 'skew'
#                                             , dest_anomalies_bq_dataset = MODEL_MONITORING_DATASET #Change this to dataset of model monitoring
#                                             , statistics = generate_serving_data_stats_op.outputs["statistics"]
#                                             , base_stats_path = training_stats_path
#                                             , src_schema_path = SCHEMA_PATH # ok
#                                             , dest_anomalies_gcs_path = ANOMALIES_PATH # ok 
#                                             , src_anomaly_thresholds_path = ANOMALY_THRESHOLDS_PATH
#                                             , in_bq_ind = True
#                                             )
    
    preprocess_scoring_op.after(bq_create_scoring_dataset_op)
    
    load_ml_model_op.after(preprocess_scoring_op) 
    batch_prediction_op.after(load_ml_model_op)
    postprocessing_op.after(batch_prediction_op) 
    
#     generate_serving_data_stats_op.after(preprocess_scoring_op)
#     visualize_serving_stats_op.after(generate_serving_data_stats_op)
#     validate_serving_stats_op.after(visualize_serving_stats_op)
    

### Run the Pipeline Job

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs
# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                    display_name=SERVING_PIPELINE_NAME,
#                                    template_path="pipeline.json",
#                                    location=REGION,
#                                    enable_caching=False,
#                                    pipeline_root = PIPELINE_ROOT
#                                 )
# job.run(service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com")


In [None]:
import google.oauth2.credentials
import json

token = !gcloud auth print-access-token
CREDENTIALS = google.oauth2.credentials.Credentials(token[0])

compiler.Compiler().compile(
   pipeline_func=pipeline, package_path="pipeline.json"
)

job = pipeline_jobs.PipelineJob(
   display_name=SERVING_PIPELINE_NAME,
   template_path="pipeline.json",
   credentials = CREDENTIALS,
   pipeline_root = PIPELINE_ROOT,
   location=REGION,
   enable_caching=False # I encourage you to enable caching when testing as it will reduce resource use
)

job.run()