### Import Libraries

In [None]:
# import required libraries
import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


### YAML Parameters

In [None]:
#tag cell with parameters
# Placeholder values for the notebook-level parameters. Every name except
# REGION is reassigned with a concrete value in the next parameters cell, and
# REGION is reassigned in the "Service Parameters" cell.
# NOTE(review): the "tag cell with parameters" marker suggests these are meant
# to be injected by a notebook runner -- confirm.
PROJECT_ID =  ''
BUCKET_NAME=''
DATASET_ID = ''
RESOURCE_BUCKET = ''
FILE_BUCKET = ''
REGION = ''
MODEL_ID = '9999'
MODEL_NAME = 'campaign_data_delivery'


In [None]:
#tag cell with parameters
# Concrete parameter values; these overwrite the placeholders assigned in the
# previous parameters cell.
# PROJECT_ID is used for the BigQuery client and the pipeline service account;
# RESOURCE_BUCKET is where component/util source files are downloaded from;
# FILE_BUCKET and MODEL_NAME are used to build the monitoring GCS paths.
# NOTE(review): MODEL_ID / MODEL_NAME / DATASET_ID refer to 'call_to_retention'
# while the rest of the notebook is named 'campaign_data_delivery' -- confirm
# these are the intended values.
PROJECT_ID =  'divg-josh-pr-d1cc3a'
BUCKET_NAME='divg-josh-pr-d1cc3a-default'
DATASET_ID = 'call_to_retention_dataset'
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'
MODEL_ID = '5090'
MODEL_NAME = 'call_to_retention'

### Service Parameters

In [None]:
# Service-level identifiers.
# SERVICE_TYPE is used to name the training-monitoring CSV path further down.
# SERVICE_TYPE_NAME and TABLE_ID are not referenced elsewhere in this notebook.
# REGION is the default `region` of the pipeline and the `location` of the
# submitted pipeline job.
SERVICE_TYPE = 'campaign_data_delivery'
SERVICE_TYPE_NAME = 'campaign-data-delivery'
TABLE_ID = 'bq_campaign_records_hcr'
REGION = "northamerica-northeast1"

### Pipeline Parameters

In [None]:
# Naming and storage settings for the serving pipeline.
# STACK_NAME is the leading path segment of the GCS prefixes built in this
# notebook; SERVING_PIPELINE_NAME / SERVING_PIPELINE_DESCRIPTION are passed to
# the @dsl.pipeline decorator; PIPELINE_ROOT is passed to the pipeline job as
# its `pipeline_root`.
STACK_NAME = 'campaign_data_delivery'
SERVING_PIPELINE_NAME_PATH = 'campaign_data_delivery/serving_pipeline'
SERVING_PIPELINE_NAME = 'campaign-data-delivery-serving-pipeline' # Same name as pulumi.yaml
SERVING_PIPELINE_DESCRIPTION = 'campaign-data-delivery-serving-pipeline'
PIPELINE_ROOT = f"gs://{BUCKET_NAME}"
# Same value as the REGION assigned in the "Service Parameters" cell.
REGION = "northamerica-northeast1"

In [None]:
# BigQuery object names.
# TRAINING_DATASET_TABLE_NAME is combined with PROJECT_ID and DATASET_ID to
# form INPUT_TRAINING_DATA_TABLE_PATH further down.
# NOTE(review): TRAINING_DATASET_SP_NAME is presumably a stored-procedure name;
# it is not referenced elsewhere in this notebook.
TRAINING_DATASET_TABLE_NAME = 'bq_campaign_data_element'
TRAINING_DATASET_SP_NAME = 'bq_sp_campaign_data_delivery_hcr_em'

### Utils Parameters

In [None]:
# Path segments used to build the GCS prefix
# '{UTILS_STACK_NAME}/{UTILS_NAME_PATH}/gcp_utils/' from which the shared
# gcp_utils helpers are downloaded.
UTILS_STACK_NAME = 'utils' 
UTILS_NAME_PATH = 'common_functions'


### Import Pipeline Components

In [None]:
# Download this pipeline's component source files from the resource bucket
# into a local `components/` directory so they can be imported below.
#
# The prefix is built from SERVING_PIPELINE_NAME_PATH: the previously used
# TRAIN_PIPELINE_NAME_PATH is never defined in this notebook, so the cell
# failed with NameError before downloading anything.
# NOTE(review): confirm the component files are published under the
# serving-pipeline prefix in RESOURCE_BUCKET.
prefix = f'{STACK_NAME}/{SERVING_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        # Object names ending in "/" are folder placeholders, not files.
        continue
    # Re-root the object path under `dl_dir`, keeping any sub-directories.
    file_split = blob.name.split(prefix)
    file_path = f"{dl_dir}{file_split[-1]}"
    # Create the target directory tree before writing the file.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

# import main pipeline components
from components.bq_create_dataset import bq_create_dataset
from components.preprocess import preprocess
from components.train_and_save_model import train_and_save_model
from components.upload_model import upload_model


### Import Pipeline Utils

In [None]:
# Fetch the shared GCP helper modules from the resource bucket and mirror
# them into a local `gcp_utils/` directory so they can be imported below.
dl_dir = 'gcp_utils/'
prefix = f'{UTILS_STACK_NAME}/{UTILS_NAME_PATH}/gcp_utils/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # all objects stored under `prefix`
for blob in blobs:
    # Names ending in "/" are folder placeholders; only real files are fetched.
    if not blob.name.endswith("/"):
        file_split = blob.name.split(prefix)
        file_path = dl_dir + file_split[-1]
        directory = "/".join(file_path.split("/")[:-1])
        Path(directory).mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(file_path)

from gcp_utils.download_data_from_gcs import download_data_from_gcs
from gcp_utils.export_dataframe_to_bq import export_dataframe_to_bq


### Import Pipeline Monitoring Utils

In [None]:
# Download this pipeline's utility modules (the monitoring helpers imported
# below) from the resource bucket into a local `utils/` directory.
#
# The prefix is built from SERVING_PIPELINE_NAME_PATH: the previously used
# TRAIN_PIPELINE_NAME_PATH is never defined in this notebook, so the cell
# failed with NameError before downloading anything.
# NOTE(review): confirm the utils files are published under the
# serving-pipeline prefix in RESOURCE_BUCKET.
prefix = f'{STACK_NAME}/{SERVING_PIPELINE_NAME_PATH}/utils/'
dl_dir = 'utils/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        # Object names ending in "/" are folder placeholders, not files.
        continue
    # Re-root the object path under `dl_dir`, keeping any sub-directories.
    file_split = blob.name.split(prefix)
    file_path = f"{dl_dir}{file_split[-1]}"
    # Create the target directory tree before writing the file.
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(file_path)

from utils.monitoring import generate_data_stats
from utils.monitoring import validate_stats 
from utils.monitoring import visualize_stats


In [None]:

import pandas as pd 
import numpy as np 
from google.cloud import storage 

def download_data_from_gcs(bucket_name, gcs_path, local_path):
    """
    Copy a single object from a GCS bucket to a local file.

    Args:
        bucket_name: Name of the GCS bucket that holds the object.
        gcs_path: Path of the object inside the bucket, including the filename.
        local_path: Local filename the object is written to.

    Returns:
        None

    Example:
        download_data_from_gcs('divg-josh-pr-d1cc3a-default', 'downloads/telus_rwrd_redemption_analysis_2018.csv', 'telus_rwrd_redemption_analysis_2018.csv')
    """
    # A new storage client is created on every call.
    gcs_client = storage.Client()
    source_blob = gcs_client.bucket(bucket_name).blob(gcs_path)
    source_blob.download_to_filename(local_path)
    

In [None]:

from google.cloud import bigquery
import pandas as pd 
import numpy as np 

def export_dataframe_to_bq(df, client, table_id='', schema_list=None, generate_schema=True, write='overwrite'):
    """
    Load the input dataframe to a table in bigquery.

    Args:
        df (DataFrame): A DataFrame that you want to export to BQ.
        client: A BigQuery client instance. e.g. client = bigquery.Client(project=project_id).
        table_id (string): A string with dataset and table name. i.e 'ttv_churn_dataset.bq_tv_churn_score'.
        schema_list (list, optional): SchemaFields describing the table. When omitted, None or
                                   empty, a schema is derived from the DataFrame's column dtypes.
        generate_schema (bool, optional): Kept for backward compatibility only. Its value is ignored:
                                   a schema is generated exactly when `schema_list` is empty.
        write (string, optional):  if 'overwrite', the function will overwrite the existing table
                                   (WRITE_TRUNCATE); any other value appends (WRITE_APPEND).

    Returns:
        None

    Raises:
        KeyError: if a schema has to be generated and a column's dtype has no entry in the
                  dtype-to-BigQuery mapping below.

    Example:
        export_dataframe_to_bq(final_df, bq_client, 'call_to_retention_dataset.bq_call_to_retention_scores_temp')

    """

    # pandas/numpy dtype -> BigQuery column type used for generated schemas.
    dtype_bq_mapping = {
        np.dtype('int64'): 'INTEGER',
        np.dtype('float64'): 'FLOAT',
        np.dtype('float32'): 'FLOAT',
        np.dtype('object'): 'STRING',
        np.dtype('bool'): 'BOOLEAN',
        np.dtype('datetime64[ns]'): 'DATE',
        pd.Int64Dtype(): 'INTEGER'
    }

    write_type = 'WRITE_TRUNCATE' if write == 'overwrite' else 'WRITE_APPEND'

    try:
        # `schema_list` defaults to None rather than a shared mutable list;
        # None and an empty list both mean "derive the schema from the dtypes".
        if not schema_list:
            schema_list = [
                bigquery.SchemaField(column, dtype_bq_mapping[df.dtypes[column]], mode='NULLABLE')
                for column in df.columns
            ]

        # Sending to bigquery
        job_config = bigquery.LoadJobConfig(schema=schema_list, write_disposition=write_type)
        job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
        job.result()  # block until the load job has finished
        table = client.get_table(table_id)  # Make an API request
        print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id))

    except NameError as e:
        # Preserved from the original implementation: a NameError is reported
        # on stdout instead of being raised to the caller.
        print(f"Error : {e}")

### Date Parameters

In [None]:
# Scoring runs against yesterday's date.
scoringDate = date.today() - timedelta(days=1)

# Scoring date in compact and dashed form.
SCORE_DATE = f'{scoringDate:%Y%m%d}'
SCORE_DATE_DASH = f'{scoringDate:%Y-%m-%d}'

# First day of the month six months before the scoring date.
SCORE_DATE_MINUS_6_MOS_DASH = (scoringDate + relativedelta(months=-6, day=1)).strftime('%Y-%m-%d')

# Stepping back `scoringDate.day` days lands on the last day of the previous
# month; resetting that date to day 1 gives the previous month's first day.
SCORE_DATE_LAST_MONTH_END_DASH = (scoringDate - timedelta(days=scoringDate.day)).strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_START_DASH = (scoringDate - timedelta(days=scoringDate.day)).replace(day=1).strftime('%Y-%m-%d')

# First day of the month three / four months after the scoring month.
# NOTE(review): the original carried the reminder "revert these changes after
# 2023-05-30" on these two values -- confirm whether they are still intended.
PROMO_EXPIRY_START = (scoringDate + relativedelta(months=3, day=1)).strftime('%Y-%m-%d')
PROMO_EXPIRY_END = (scoringDate + relativedelta(months=4, day=1)).strftime('%Y-%m-%d')

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


### Model Monitoring Parameters

In [None]:
# Path segments for the model-monitoring artifacts.
# TRAINING_PIPELINE_NAME_PATH is used below to build TRAINING_STATISTICS_PATH
# and ANOMALY_THRESHOLDS_PATH.
# NOTE(review): SERVING_PIPELINE_NAME_PATH was already set to
# 'campaign_data_delivery/serving_pipeline' in the "Pipeline Parameters" cell;
# this assignment replaces it with a 'call_to_retention_model' path -- confirm
# which value is intended.
MODEL_MONITORING_STACK_NAME = 'util'
MODEL_MONITORING_PATH = 'pipeline_utils'
TRAINING_PIPELINE_NAME_PATH = 'call_to_retention_model/training_pipeline'
SERVING_PIPELINE_NAME_PATH = 'call_to_retention_model/serving_pipeline'

In [None]:
# Date stamp appended to the per-run monitoring artifact paths below.
today = date.today()

# BQ table where training data is stored
INPUT_TRAINING_DATA_TABLE_PATH = f"{PROJECT_ID}.{DATASET_ID}.{TRAINING_DATASET_TABLE_NAME}"
INPUT_TRAINING_DATA_CSV_PATH = f'gs://{FILE_BUCKET}/{SERVICE_TYPE}_train_monitoring.csv'

# BQ dataset where monitoring stats are stored
MODEL_MONITORING_DATASET = "call_to_retention_dataset"

# Paths to statistics artifacts in GCS
TRAINING_STATISTICS_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/training_statistics/training_statistics_{today}"
TRAINING_STATS_PREFIX = f"{STACK_NAME}/statistics/training_statistics"
TRAINING_STATISTICS_OUTPUT_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/statistics/training_statistics_{today}"

ANOMALIES_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/anomalies/anomalies_{today}"
PREDICTION_ANOMALIES_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/anomalies/prediction_anomalies_{today}"
PREDICTION_STATS_PATH = f"gs://{FILE_BUCKET}/{STACK_NAME}/statistics/prediction_statistics_{today}"
# In-bucket prefix matching PREDICTION_STATS_PATH. It is rooted at STACK_NAME,
# like TRAINING_STATS_PREFIX; it was previously rooted at the bucket name,
# which is not part of any object path written above.
PREDICTION_STATS_PREFIX = f"{STACK_NAME}/statistics/prediction_statistics"

# Paths to schemas in GCS
SCHEMA_PATH = f'gs://{FILE_BUCKET}/{MODEL_NAME}/schemas/training_stats_schema_{today}'

# Thresholds for anomalies (in-bucket path, same structure the utils use when reading from the bucket)
ANOMALY_THRESHOLDS_PATH = f"{STACK_NAME}/{TRAINING_PIPELINE_NAME_PATH}/training_statistics/anomaly_thresholds.json"

# Filters for predictions monitoring
DATE_COL = 'partition_date'
DATE_FILTER = str(today)
TABLE_BLOCK_SAMPLE = 1 # no sampling
ROW_SAMPLE = 1 # no sampling

In [None]:
 
import google
from google.cloud import bigquery
from datetime import datetime
import logging 
import os 
import re 
from google.oauth2 import credentials
import google.oauth2.credentials

token = !gcloud auth print-access-token
token_str = token[0]

CREDENTIALS = google.oauth2.credentials.Credentials(token_str) # get credentials from token
    
client = bigquery.Client(project=PROJECT_ID, credentials=CREDENTIALS)

download_data_from_gcs(BUCKET_NAME, 'pyspark/pyspark_part-00000-3dcf5006-05e0-4b8b-b4d6-46019400f59d-c000.csv', 'campaign_records_hcr.csv')

df = pd.read_csv('campaign_records_hcr.csv')
df = df[df['camp_inhome'].apply(lambda x: len(str(x)) >= 14)] 
df['camp_inhome'] = pd.to_datetime(df['camp_inhome']).dt.strftime('%Y-%m-%d')
df['camp_inhome'] = pd.to_datetime(df['camp_inhome']).dt.date

schema_list = [bigquery.SchemaField('camp_inhome', 'DATE', 'NULLABLE', None, None, (), None), bigquery.SchemaField('camp_id', 'STRING', 'NULLABLE', None, None, (), None), bigquery.SchemaField('bacct_num', 'INTEGER', 'NULLABLE', None, None, (), None), bigquery.SchemaField('release_code_desc', 'STRING', 'NULLABLE', None, None, (), None), bigquery.SchemaField('camp_email', 'STRING', 'NULLABLE', None, None, (), None), bigquery.SchemaField('call_result', 'STRING', 'NULLABLE', None, None, (), None), bigquery.SchemaField('record_exhausted_ind', 'INTEGER', 'NULLABLE', None, None, (), None), bigquery.SchemaField('attempts', 'INTEGER', 'NULLABLE', None, None, (), None), bigquery.SchemaField('snet_premise_type_cd', 'STRING', 'NULLABLE', None, None, (), None)]

export_dataframe_to_bq(df, client, 'divg_compaign_element.bq_campaign_records_hcr', schema_list=schema_list, generate_schema=False, write='overwrite') 

### Pipeline

In [None]:
# library imports
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
@dsl.pipeline(
    name=SERVING_PIPELINE_NAME, 
    description=SERVING_PIPELINE_DESCRIPTION
    )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET, 
        file_bucket: str = FILE_BUCKET
    ):
    """Serving pipeline with a single step that runs the campaign data delivery stored procedure.

    Args:
        project_id: GCP project passed to the stored-procedure step.
        region: Region passed to the stored-procedure step.
        resource_bucket: Not used by any step in this pipeline.
        file_bucket: Not used by any step in this pipeline.
    """

    # ----- run campaign data delivery stored procedure --------
    # NOTE(review): `bq_run_cdd_sp` is neither defined nor imported in the
    # visible notebook cells; it must be in scope before this pipeline is
    # compiled.
    bq_run_cdd_sp_op = bq_run_cdd_sp(
        # The scoring date comes from the notebook-level SCORE_DATE_DASH; the
        # previously referenced lowercase `score_date_dash` was never defined.
        score_date_dash=SCORE_DATE_DASH,
        # Use the pipeline parameters so that values supplied at submission
        # time take effect instead of the notebook globals.
        project_id=project_id,
        dataset_id=DATASET_ID,
        region=region
    )

    bq_run_cdd_sp_op.set_memory_limit('32G')
    bq_run_cdd_sp_op.set_cpu_limit('4')
    

### Run the Pipeline Job

In [None]:
from kfp.v2 import compiler
from google.cloud.aiplatform import pipeline_jobs
import json

# Compile the pipeline function into a local job spec file.
compiler.Compiler().compile(
   pipeline_func=pipeline, package_path="pipeline.json"
)

# Submit the compiled spec as a pipeline job.
job = pipeline_jobs.PipelineJob(
                                   # SERVING_PIPELINE_NAME is the pipeline name defined in this
                                   # notebook; TRAIN_PIPELINE_NAME was never defined and raised
                                   # NameError here.
                                   display_name=SERVING_PIPELINE_NAME,
                                   template_path="pipeline.json",
                                   location=REGION,
                                   enable_caching=False,
                                   pipeline_root = PIPELINE_ROOT
                                )
job.run(service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com")
