### Import Libraries

In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op

### YAML Parameters

In [None]:
# Tag this cell with "parameters".
# Placeholder defaults for every notebook-level parameter; all values are plain strings.

# --- project / storage ---
PROJECT_ID = ''
DATASET_ID = ''
RESOURCE_BUCKET = ''  # bucket the component and util sources are downloaded from
FILE_BUCKET = ''      # bucket used to build PIPELINE_ROOT
REGION = ''

# --- model identity ---
MODEL_ID = ''
MODEL_NAME = ''
MODEL_TYPE = 'acquisition'

# --- source locations forwarded to the pipeline components ---
PIPELINE_PATH = 'vertex_pipelines/hs_nba_prospects/training_pipeline'
HS_NBA_UTILS_PATH = 'vertex_pipelines/hs_nba_utils/notebook'

# --- file names forwarded to the pipeline components ---
LOAD_SQL = ''
TRAIN_CSV = ''
SAVE_FILE_NAME = ''
STATS_FILE_NAME = ''

In [None]:
# Tag this cell with "parameters".
# Concrete values for this run; every name here overrides the placeholder of the same name.

# --- project / storage ---
PROJECT_ID = 'divg-groovyhoon-pr-d2eab4'
DATASET_ID = 'nba_product_reco_prospects'
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'
REGION = ''  # left empty here; assigned in the service / pipeline parameter cells

# --- model identity ---
MODEL_ID = ''
MODEL_NAME = 'nba_product_reco_prospects'
MODEL_TYPE = 'acquisition'

# --- source locations forwarded to the pipeline components ---
PIPELINE_PATH = 'vertex_pipelines/hs_nba_prospects/training_pipeline'
HS_NBA_UTILS_PATH = 'vertex_pipelines/hs_nba_utils/notebook'

# --- file names forwarded to the pipeline components ---
LOAD_SQL = 'load_train_data.sql'
TRAIN_CSV = 'training_dataset.csv'
SAVE_FILE_NAME = 'df_val_exp.csv'
STATS_FILE_NAME = 'df_stats.csv'

### Service Parameters

In [None]:
# Service identifiers: the underscore form is passed to the training component,
# the hyphenated form is used to build the pipeline names.
SERVICE_TYPE = 'nba_product_reco_prospects'
SERVICE_TYPE_NAME = 'nba-product-reco-prospects'

# Source table name and the region used when submitting the pipeline job.
TABLE_ID = 'master_features_set_prospect'
REGION = "northamerica-northeast1"

### Pipeline Parameters

In [None]:
# Names and storage locations derived from the stack / service identifiers.
STACK_NAME = 'nba_product_reco_prospects'
REGION = "northamerica-northeast1"
PIPELINE_ROOT = f"gs://{FILE_BUCKET}"

# Training pipeline
TRAINING_PIPELINE_NAME_PATH = f'{STACK_NAME}_model/training_pipeline'
TRAINING_PIPELINE_NAME = f'{SERVICE_TYPE_NAME}-train-pipeline' # Same name as pulumi.yaml
TRAINING_PIPELINE_DESCRIPTION = TRAINING_PIPELINE_NAME  # description is the same text as the name

# Serving pipeline
SERVING_PIPELINE_NAME_PATH = f'{STACK_NAME}_model/serving_pipeline'
SERVING_PIPELINE_NAME = f'{SERVICE_TYPE_NAME}-serving-pipeline' # Same name as pulumi.yaml
SERVING_PIPELINE_DESCRIPTION = SERVING_PIPELINE_NAME  # description is the same text as the name

### Import Pipeline Components

In [None]:
# Download the pipeline component sources from the resource bucket into ./components/
# so that they can be imported as the local `components` package below.
prefix = f'{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        continue  # names ending in "/" are folder placeholders, not files
    # The listing is filtered by `prefix`, so every name starts with it: slice it off
    # the front instead of splitting on every occurrence of the prefix text.
    relative_name = blob.name[len(prefix):]
    file_path = f"{dl_dir}{relative_name}"
    # Create the target folder (including nested sub-folders) before writing the file.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# import main pipeline components (only importable once the download above has run)
from components.bq_create_dataset import bq_create_dataset
from components.preprocess import preprocess
from components.train_and_save_model import train_and_save_model
from components.upload_model import upload_model

### Import Pipeline Utils

In [None]:
# Download the pipeline utility sources from the resource bucket into ./utils/
# so that they can be imported as the local `utils` package below.
prefix = f'{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/utils/'
dl_dir = 'utils/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        continue  # names ending in "/" are folder placeholders, not files
    # The listing is filtered by `prefix`, so every name starts with it: slice it off
    # the front instead of splitting on every occurrence of the prefix text.
    relative_name = blob.name[len(prefix):]
    file_path = f"{dl_dir}{relative_name}"
    # Create the target folder (including nested sub-folders) before writing the file.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# monitoring helpers (only importable once the download above has run)
from utils.monitoring import generate_data_stats
from utils.monitoring import validate_stats
from utils.monitoring import visualize_stats

### Date Parameters

In [None]:
# The training pipeline is meant to run on the 3rd of every month.
# The reference date is "today minus 2 days"; the training window is the 12 calendar
# months that end on the last day of the month two months before the reference month.
# Example: run on 2024-05-03 -> reference date 2024-05-01 -> window 2023-04-01 to 2024-03-31.

# reference date for this training run
trainingDate = date.today() - timedelta(days=2)

# the reference date in compact (YYYYMMDD) and dashed (YYYY-MM-DD) form
TRAIN_DATE = f"{trainingDate:%Y%m%d}"
TRAIN_DATE_DASH = trainingDate.isoformat()

# window bounds as YYYY-MM-DD strings, both measured from the first day of the reference month:
# start = 13 months earlier; end = 1 month earlier, then back one more day
FROM_DATE = (trainingDate.replace(day=1) - relativedelta(months=13)).isoformat()
TO_DATE = (trainingDate.replace(day=1) - relativedelta(months=1, days=1)).isoformat()

### Model Monitoring Parameters

In [None]:
# MODEL_MONITORING_STACK_NAME = 'util'
# MODEL_MONITORING_PATH = 'pipeline_utils'

In [None]:
# today = date.today()

# # BQ table where training data is stored
# INPUT_TRAINING_DATA_TABLE_PATH = f"{PROJECT_ID}.{DATASET_ID}.{TRAINING_DATASET_TABLE_NAME}"
# INPUT_TRAINING_DATA_CSV_PATH = 'gs://{}/{}/{}_train.csv'.format(FILE_BUCKET, SERVICE_TYPE, SERVICE_TYPE)                               

# # BQ dataset where monitoring stats are stored
# MODEL_MONITORING_DATASET = "telus_postpaid_churn_model"

# # Paths to statistics artifacts in GCS
# TRAINING_STATISTICS_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/training_statistics/training_statistics_{today}"
# TRAINING_STATS_PREFIX = f"{STACK_NAME}/statistics/training_statistics"
# TRAINING_STATISTICS_OUTPUT_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/statistics/training_statistics_{today}" 

# ANOMALIES_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/anomalies/anomalies_{today}"
# PREDICTION_ANOMALIES_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/anomalies/prediction_anomalies_{today}"
# PREDICTION_STATS_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/statistics/prediction_statistics_{today}"
# PREDICTION_STATS_PREFIX = f"{FILE_BUCKET}/statistics/prediction_statistics"

# # Paths to schemas in GCS
# SCHEMA_PATH = f'gs://{FILE_BUCKET}/{STACK_NAME}/schemas/training_stats_schema_{today}'
# # SATISTICS_PATH = f'gs://{FILE_BUCKET}/{STACK_NAME}/schemas/training_statistics_{today}'
# # Thresholds for anomalies
# ANOMALY_THRESHOLDS_PATH = f"{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/training_statistics/anomaly_thresholds.json" #same path structure as utils reading from bucket

# # Filters for predictions monitoring
# DATE_COL = 'partition_date'
# DATE_FILTER = str(today)
# TABLE_BLOCK_SAMPLE = 1 # no sampling
# ROW_SAMPLE = 1 # no sampling

### Pipeline

In [None]:
# library imports
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
@dsl.pipeline(
    name=TRAINING_PIPELINE_NAME, 
    description=TRAINING_PIPELINE_DESCRIPTION
    )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET, 
        file_bucket: str = FILE_BUCKET
    ):
    """Define the training pipeline; only the train-and-save step is currently active.

    The dataset-creation, preprocessing and model-upload steps are kept below as
    commented-out code so they can be re-enabled.

    NOTE(review): the active step is wired to the notebook-level constants
    (PROJECT_ID, FILE_BUCKET, ...) and not to the parameters declared here, so
    overriding a pipeline parameter at submission time has no effect on it.
    Confirm whether that is intended.

    Args:
        project_id: Declared pipeline parameter (currently not forwarded).
        region: Declared pipeline parameter (currently not forwarded).
        resource_bucket: Declared pipeline parameter (currently not forwarded).
        file_bucket: Declared pipeline parameter (currently not forwarded).

    Raises:
        subprocess.CalledProcessError: If the gcloud command exits with an error.
        RuntimeError: If gcloud succeeds but prints no token.
    """
    #### this code block is only for a personal workbench

    # Fetch the access token with subprocess rather than the IPython `!` magic, so the
    # function is valid Python outside IPython and a gcloud failure stops here.
    import subprocess

    token_str = subprocess.run(
        ["gcloud", "auth", "print-access-token"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout.strip()
    if not token_str:
        raise RuntimeError("`gcloud auth print-access-token` returned no token")
    # NOTE(review): the token is handed to the component as an ordinary argument, so it
    # presumably ends up in the compiled pipeline definition -- confirm this is acceptable.

    #### the end

#     # ----- create training set --------
#     bq_create_training_dataset_op = bq_create_dataset(from_date=FROM_DATE, 
#                                 to_date=TO_DATE,
#                                 project_id=PROJECT_ID, 
#                                 dataset_id=DATASET_ID,
#                                 table_id=TABLE_ID,
#                                 region=REGION,
#                                 token=token_str
#                                 )

#     bq_create_training_dataset_op.set_memory_limit('32G')
#     bq_create_training_dataset_op.set_cpu_limit('4')
    
#     # ----- preprocessing train data --------
#     preprocess_train_op = preprocess(project_id=PROJECT_ID,
#                                     dataset_id=DATASET_ID, 
#                                     table_id=TABLE_ID, 
#                                     file_bucket=FILE_BUCKET, 
#                                     resource_bucket=RESOURCE_BUCKET, 
#                                     stack_name=STACK_NAME, 
#                                     pipeline_path=PIPELINE_PATH,
#                                     hs_nba_utils_path=HS_NBA_UTILS_PATH, 
#                                     model_type=MODEL_TYPE,
#                                     load_sql=LOAD_SQL, 
#                                     train_csv=TRAIN_CSV,
#                                     token=token_str
#                                     )

#     preprocess_train_op.set_memory_limit('32G')
#     preprocess_train_op.set_cpu_limit('4')
    
    # ----- train and save the model (the only active step) --------
    train_and_save_model_op = train_and_save_model(file_bucket=FILE_BUCKET, 
                                                    resource_bucket=RESOURCE_BUCKET,
                                                    service_type=SERVICE_TYPE, 
                                                    project_id=PROJECT_ID, 
                                                    dataset_id=DATASET_ID, 
                                                    model_type=MODEL_TYPE, 
                                                    train_csv=TRAIN_CSV, 
                                                    save_file_name=SAVE_FILE_NAME,
                                                    stats_file_name=STATS_FILE_NAME, 
                                                    pipeline_path=PIPELINE_PATH,
                                                    hs_nba_utils_path=HS_NBA_UTILS_PATH, 
                                                    token=token_str
                                                   )

    # resource limits for the training step
    train_and_save_model_op.set_memory_limit('32G')
    train_and_save_model_op.set_cpu_limit('4')
    
#     # NOTE(review): PREDICTION_IMAGE is not defined in the visible part of this notebook;
#     # define it before re-enabling the upload step.
#     # col_input_op = col_list = bq_create_dataset_op.outputs["col_list"]
#     upload_model_op = upload_model(project_id = PROJECT_ID
#                                 , region = REGION
#                                 , model = train_and_save_model_op.outputs["model"]
#                                 , model_name = MODEL_NAME
#                                 , prediction_image = PREDICTION_IMAGE
#                                 , col_list = train_and_save_model_op.outputs["col_list"]
#                                 , model_uri = train_and_save_model_op.outputs["model_uri"]
#                                 )
    
#     upload_model_op.set_memory_limit('32G')
#     upload_model_op.set_cpu_limit('4')
        
    # step ordering, to be restored together with the steps above
    # preprocess_train_op.after(bq_create_training_dataset_op)
    # train_and_save_model_op.after(preprocess_train_op)
#     upload_model_op.after(train_and_save_model_op)

### Run the Pipeline Job

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs
# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                    display_name=TRAINING_PIPELINE_NAME,
#                                    template_path="pipeline.json",
#                                    location=REGION,
#                                    enable_caching=False,
#                                    pipeline_root = PIPELINE_ROOT
#                                 )
# job.run(service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com")



In [None]:
import google.oauth2.credentials
import json
import subprocess

# Ask the gcloud CLI for an access token for the current user.
# subprocess.run is used rather than the IPython `!` magic so that this cell is valid
# Python outside IPython and a failing gcloud call raises here (check=True) instead of
# an unchecked first output line being handed on as the token.
# `token` stays a list of output lines, as it was with the `!` magic.
token = subprocess.run(
    ["gcloud", "auth", "print-access-token"],
    check=True,
    capture_output=True,
    text=True,
).stdout.splitlines()
if not token or not token[0].strip():
    raise RuntimeError("`gcloud auth print-access-token` returned no token")
CREDENTIALS = google.oauth2.credentials.Credentials(token[0].strip())

# Compile the pipeline function into a local job specification file.
compiler.Compiler().compile(
   pipeline_func=pipeline, package_path="pipeline.json"
)

# Build the job from the compiled specification and submit it with the user credentials.
job = pipeline_jobs.PipelineJob(
   display_name=TRAINING_PIPELINE_NAME,
   template_path="pipeline.json",
   credentials = CREDENTIALS,
   pipeline_root = PIPELINE_ROOT,
   location=REGION,
   enable_caching=False # I encourage you to enable caching when testing as it will reduce resource use
)

job.run()

In [None]:
# NOTE(review): `error` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError. Presumably a deliberate stop so that "Run All" does not continue
# into the scratch cells below -- confirm.
error

In [None]:
# # import global modules
# from google.cloud import storage
# from google.cloud import bigquery
# from pathlib import Path
# from yaml import safe_load
# import sys
# import os

# #tag cell with parameters
# PROJECT_ID =  'divg-groovyhoon-pr-d2eab4'
# DATASET_ID = 'nba_product_reco_prospects'
# RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'
# FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4'
# MODEL_ID = ''
# MODEL_NAME = 'nba_product_reco_prospects'

# project_id = PROJECT_ID
# dataset_id = DATASET_ID
# resource_bucket = RESOURCE_BUCKET
# file_bucket = FILE_BUCKET
# table_id = 'master_features_set_prospect'

# # set global vars
# pth_project = Path(os.getcwd())
# pth_model_config = pth_project / 'model_config.yaml'
# pth_queries = pth_project / 'queries'
# sys.path.insert(0, pth_project.as_posix())

# # import local modules
# from hs_nba_utils.etl.extract import extract_bq_data
# from hs_nba_utils.modeling.hs_features_preprocessing import process_hs_features

# # load model config
# d_model_config = safe_load(pth_model_config.open())

# # select columns to query
# target_column = d_model_config['target_column']
# str_feature_names = ','.join([f"cast({f['name']} as {f['type']}) as {f['name']}" for f in d_model_config['features']])
# str_customer_ids = ','.join([f"cast({f['name']} as {f['type']}) as {f['name']}" for f in d_model_config['customer_ids']])

# # extract training data
# sql = (pth_queries / 'load_train_data.sql').read_text().format(
#     project_id=project_id
#     , dataset_id=dataset_id
#     , table_id=table_id
#     , target_column=target_column
#     , customer_ids=str_customer_ids
#     , feature_names=str_feature_names
#     )

# print(sql)

# df = extract_bq_data(client, sql)
# print(f"Training dataset df.shape {df.shape}")


In [None]:
# import global modules
from google.cloud import storage
from google.cloud import bigquery
from pathlib import Path
from yaml import safe_load
import sys
import os

# Locate model_config.yaml in the current working directory.
pth = Path(os.getcwd()) 

pth_model_config = pth / 'model_config.yaml'

# read_text() opens, reads and closes the file in one call, so no file handle is left
# open (the previous `.open()` passed straight to safe_load was never closed).
d_model_config = safe_load(pth_model_config.read_text())

In [None]:
# Scratch: list the 'name' of every entry under the 'features' key of the loaded config.
[f['name'] for f in d_model_config['features']]

In [None]:
# Scratch: rebuild the config path (requires `pth` to be defined already).
pth_model_config = pth / 'model_config.yaml'

In [None]:
# Scratch: re-read the config. read_text() closes the file itself, so no handle is left open.
d_model_config = safe_load(pth_model_config.read_text())

In [None]:
# Scratch: show the 'tier' entry nested under 'target_variables'.
d_model_config['target_variables']['tier']

In [None]:
# Scratch: comma-join the 'name' of every 'acquisition' target variable.
','.join([f['name'] for f in d_model_config['target_variables']['acquisition']])

In [None]:
# Scratch: same join as above, with each name wrapped in double quotes.
','.join([f"\"{f['name']}\"" for f in d_model_config['target_variables']['acquisition']])

In [None]:
# Scratch: build a comma-separated "cast(<name> as <type>) as <name>" fragment
# from the 'name' and 'type' of every 'customer_ids' entry.
','.join([f"cast({f['name']} as {f['type']}) as {f['name']}" for f in d_model_config['customer_ids']])

In [None]:
# Scratch: comma-join the double-quoted names of the 'acquisition' target variables
# (the same expression also appears in an earlier scratch cell).
','.join([f"\"{f['name']}\"" for f in d_model_config['target_variables']['acquisition']])

In [None]:
# Scratch: look up the 'customer_id_fields' key.
# NOTE(review): neighbouring cells read 'customer_ids' instead -- confirm which key the
# config actually defines; a missing key raises KeyError here.
d_model_config['customer_id_fields']

In [None]:
# Scratch: list the 'name' of every 'customer_ids' entry.
[f['name'] for f in d_model_config['customer_ids']]

In [None]:
# Scratch: number of 'acquisition' target variables.
len(d_model_config['target_variables']['acquisition'])

In [None]:
# Scratch: print the integers 0 through 9, one per line (leaves `i` bound to 9).
i = -1
while i < 9:
    i += 1
    print(i)