### Import Libraries

In [None]:
# import required libraries
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


### Parameters

In [None]:
# Tag this cell with "parameters".
# Empty placeholder defaults; MODEL_ID is the only value pre-filled here.
# NOTE(review): presumably these are overridden by parameter injection at
# execution time -- confirm against how the notebook is launched.
PROJECT_ID =  ''
BUCKET_NAME=''
DATASET_ID = ''
RESOURCE_BUCKET = ''
FILE_BUCKET = ''
REGION = ''
MODEL_ID = '5090'

In [None]:
# Tag this cell with "parameters".
# Concrete values for one project. BUCKET_NAME, RESOURCE_BUCKET and
# FILE_BUCKET all point at the same bucket. REGION is not set in this cell.
PROJECT_ID =  'divg-josh-pr-d1cc3a'
BUCKET_NAME='divg-josh-pr-d1cc3a-default'
DATASET_ID = 'call_to_retention_dataset'
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'
MODEL_ID = '5090'

### Service Parameters

In [None]:
# Service identifiers.
SERVICE_TYPE = 'call_to_retention'       # underscore spelling
SERVICE_TYPE_NAME = 'call-to-retention'  # hyphenated spelling
TABLE_ID = 'bq_call_to_retention_targets'
REGION = 'northamerica-northeast1'

### Pulumi Parameters

In [None]:
# Stack name and the per-pipeline sub-paths used to build resource prefixes.
STACK_NAME = 'call_to_retention'
TRAIN_PIPELINE_NAME_PATH = 'train_pipeline'
PREDICT_PIPELINE_NAME_PATH = 'predict_pipeline'

# Pipeline names: keep these identical to the names in pulumi.yaml.
TRAIN_PIPELINE_NAME = 'call-to-retention-train-pipeline'
PREDICT_PIPELINE_NAME = 'call-to-retention-predict-pipeline'

# Pipeline descriptions (currently the same text as the names).
TRAIN_PIPELINE_DESCRIPTION = 'call-to-retention-train-pipeline'
PREDICT_PIPELINE_DESCRIPTION = 'call-to-retention-predict-pipeline'

REGION = 'northamerica-northeast1'

### Query + Pre-Processing Component Parameters

In [None]:
# Resource prefixes for the training pipeline, built from the stack name and
# the training pipeline sub-path.
TRAIN_QUERIES_PATH = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/queries/'
TRAIN_UTILS_FILE_PATH = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/utils'
UTILS_FILENAME = 'utils.py'

# Table names for serving data.
PROCESSED_SERVING_DATA_TABLENAME = 'processed_serving_data'
INPUT_SERVING_DATA_TABLENAME = 'input_serving_data'

# Yesterday's date as YYYY-MM-DD.
QUERY_DATE = (date.today() - relativedelta(days=1)).isoformat()
# Fully qualified target table: <project>.<dataset>.<table>.
TARGET_TABLE_REF = f'{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}'

QUERIES_PATH = 'call_to_retention/queries/'

# Query paths: one SQL file per input view, all under QUERIES_PATH.
ACCOUNT_PROMO_EXPIRY_LIST_QUERY_PATH = f'{QUERIES_PATH}create_input_account_promo_expiry_list_query.sql'
ACCOUNT_CONSL_QUERY_PATH = f'{QUERIES_PATH}create_input_account_consl_query.sql'
ACCOUNT_FFH_BILLING_QUERY_PATH = f'{QUERIES_PATH}create_input_account_ffh_billing_query.sql'
ACCOUNT_FFH_DISCOUNTS_QUERY_PATH = f'{QUERIES_PATH}create_input_account_ffh_discounts_query.sql'
ACCOUNT_HS_USAGE_QUERY_PATH = f'{QUERIES_PATH}create_input_account_hs_usage_query.sql'
ACCOUNT_DEMO_INCOME_QUERY_PATH = f'{QUERIES_PATH}create_input_account_demo_income_query.sql'
ACCOUNT_GPON_COPPER_QUERY_PATH = f'{QUERIES_PATH}create_input_account_gpon_copper_query.sql'
ACCOUNT_PRICE_PLAN_QUERY_PATH = f'{QUERIES_PATH}create_input_account_price_plan_query.sql'
ACCOUNT_CLCKSTRM_TELUS_QUERY_PATH = f'{QUERIES_PATH}create_input_account_clckstrm_telus_query.sql'
ACCOUNT_CALL_HISTORY_QUERY_PATH = f'{QUERIES_PATH}create_input_account_call_history_query.sql'

### Import Pipeline Components

In [None]:
# Download the pipeline component files from the resource bucket into a local
# `components/` package, then import that package.
prefix = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # objects whose name starts with `prefix`
for blob in blobs:
    if blob.name.endswith("/"):
        # A name ending in "/" has no file part to write locally.
        continue
    # Every listed name begins with `prefix`, so drop exactly that leading
    # part. split(prefix)[-1] would instead cut at the LAST occurrence if the
    # prefix text appeared again deeper in the object name.
    relative_name = blob.name[len(prefix):]
    local_path = Path(dl_dir) / relative_name
    # Mirror the object's sub-directories locally before downloading.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(local_path))

# import main pipeline components (must run after the download above)
import components


### Date Parameters

In [None]:
# Fixed reference dates for the training and validation sets. The trailing
# comments show the dynamic expressions these literals stand in for.
scoringDate = date(2022, 4, 1)  # date.today() - relativedelta(days=2)- relativedelta(months=30)
valScoringDate = date(2022, 5, 1)  # scoringDate - relativedelta(days=2)

# training views -- one view name per input query, prefixed with SERVICE_TYPE
PROMO_EXPIRY_LIST_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_promo_expiry_list_data_training_bi_layer'
CONSL_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_consl_data_training_bi_layer'
FFH_BILLING_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_ffh_billing_data_training_bi_layer'
FFH_DISCOUNTS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_ffh_discounts_data_training_bi_layer'
HS_USAGE_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_hs_usage_data_training_bi_layer'
DEMO_INCOME_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_demo_income_data_training_bi_layer'
GPON_COPPER_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_gpon_copper_data_training_bi_layer'
PRICE_PLAN_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_price_plan_data_training_bi_layer'
# NOTE(review): unlike its siblings this name has no "_data" segment -- kept as-is.
CLCKSTRM_TELUS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_clckstrm_telus_training_bi_layer'
CALL_HISTORY_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_call_history_data_training_bi_layer'

# validation views -- same set, with "validation" in place of "training"
PROMO_EXPIRY_LIST_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_promo_expiry_list_data_validation_bi_layer'
CONSL_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_consl_data_validation_bi_layer'
FFH_BILLING_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_ffh_billing_data_validation_bi_layer'
FFH_DISCOUNTS_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_ffh_discounts_data_validation_bi_layer'
HS_USAGE_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_hs_usage_data_validation_bi_layer'
DEMO_INCOME_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_demo_income_data_validation_bi_layer'
GPON_COPPER_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_gpon_copper_data_validation_bi_layer'
PRICE_PLAN_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_price_plan_data_validation_bi_layer'
# NOTE(review): no "_data" segment here either -- kept as-is.
CLCKSTRM_TELUS_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_clckstrm_telus_validation_bi_layer'
CALL_HISTORY_VIEW_VALIDATION_NAME = f'{SERVICE_TYPE}_pipeline_call_history_data_validation_bi_layer'

# training dates
# Anchors shared by the derived values below: first day of the scoring month
# and last day of the month before it.
_score_month_start = scoringDate.replace(day=1)
_score_prev_month_end = _score_month_start - timedelta(days=1)

SCORE_DATE = scoringDate.strftime('%Y%m%d')  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = scoringDate.strftime('%Y-%m-%d')
SCORE_DATE_MINUS_6_MOS_DASH = (scoringDate - relativedelta(months=6)).replace(day=1).strftime('%Y-%m-%d')
# Rendered as a string like every other *_DASH value (it used to be left as a
# date object, the only non-string among them).
SCORE_DATE_THIS_MONTH_START_DASH = _score_month_start.strftime('%Y-%m-%d')
# First day of next month minus one day == last day of the scoring month.
SCORE_DATE_THIS_MONTH_END_DASH = (_score_month_start + relativedelta(months=1) - timedelta(days=1)).strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_START_DASH = _score_prev_month_end.replace(day=1).strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_END_DASH = _score_prev_month_end.strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_YEAR = _score_prev_month_end.year
SCORE_DATE_LAST_MONTH_MONTH = _score_prev_month_end.month
# Promo-expiry window: first day of the 3rd and 4th month after the scoring month.
PROMO_EXPIRY_START = (_score_month_start + relativedelta(months=3)).strftime('%Y%m%d')
PROMO_EXPIRY_END = (_score_month_start + relativedelta(months=4)).strftime('%Y%m%d')

# validation dates -- same derivations, anchored on valScoringDate
_val_month_start = valScoringDate.replace(day=1)
_val_prev_month_end = _val_month_start - timedelta(days=1)

SCORE_DATE_VAL = valScoringDate.strftime('%Y%m%d')
SCORE_DATE_VAL_DASH = valScoringDate.strftime('%Y-%m-%d')
SCORE_DATE_VAL_MINUS_6_MOS_DASH = (valScoringDate - relativedelta(months=6)).replace(day=1).strftime('%Y-%m-%d')
# Rendered as a string, consistent with the other *_DASH values.
SCORE_DATE_VAL_THIS_MONTH_START_DASH = _val_month_start.strftime('%Y-%m-%d')
SCORE_DATE_VAL_THIS_MONTH_END_DASH = (_val_month_start + relativedelta(months=1) - timedelta(days=1)).strftime('%Y-%m-%d')
SCORE_DATE_VAL_LAST_MONTH_START_DASH = _val_prev_month_end.replace(day=1).strftime('%Y-%m-%d')
SCORE_DATE_VAL_LAST_MONTH_END_DASH = _val_prev_month_end.strftime('%Y-%m-%d')
SCORE_DATE_VAL_LAST_MONTH_YEAR = _val_prev_month_end.year
SCORE_DATE_VAL_LAST_MONTH_MONTH = _val_prev_month_end.month
PROMO_EXPIRY_START_VAL = (_val_month_start + relativedelta(months=3)).strftime('%Y%m%d')
PROMO_EXPIRY_END_VAL = (_val_month_start + relativedelta(months=4)).strftime('%Y%m%d')

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


### 1.create_input_account_promo_expiry_list_view

In [None]:
def create_input_account_promo_expiry_list_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    dataset_id: str,
    project_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
    promo_expiry_start: str,
    promo_expiry_end: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create; also passed to the template.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that receives the view; also passed to the template.
        project_id: Project of the BigQuery client; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.
        promo_expiry_start: Passed to the template as ``promo_expiry_start``.
        promo_expiry_end: Passed to the template as ``promo_expiry_end``.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        view_name=view_name,
        dataset_id=dataset_id,
        project_id=project_id,
        promo_expiry_start=promo_expiry_start,
        promo_expiry_end=promo_expiry_end,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)

### 2.create_input_account_consl_view

In [None]:
def create_input_account_consl_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    project_id: str,
    dataset_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create; also passed to the template.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client; also passed to the template.
        dataset_id: Dataset that receives the view; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        view_name=view_name,
        dataset_id=dataset_id,
        project_id=project_id,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)

### 3.create_input_account_ffh_billing_view

In [None]:
def create_input_account_ffh_billing_view(
    view_name: str,
    v_report_date: str,
    v_start_date: str,
    v_end_date: str,
    v_bill_year: str,
    v_bill_month: str,
    dataset_id: str,
    project_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create.
        v_report_date: Passed to the template as ``v_report_date``.
        v_start_date: Passed to the template as ``v_start_date``.
        v_end_date: Passed to the template as ``v_end_date``.
        v_bill_year: Passed to the template as ``v_bill_year``.
        v_bill_month: Passed to the template as ``v_bill_month``.
        dataset_id: Dataset that receives the view.
        project_id: Project of the BigQuery client.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        v_report_date=v_report_date,
        v_start_date=v_start_date,
        v_end_date=v_end_date,
        v_bill_year=v_bill_year,
        v_bill_month=v_bill_month,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)


### 4.create_input_account_ffh_discounts_view

In [None]:
def create_input_account_ffh_discounts_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    project_id: str,
    dataset_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create; also passed to the template.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client; also passed to the template.
        dataset_id: Dataset that receives the view; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it. The rendered text is
    # used as the view query directly (a single format pass).
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        view_name=view_name,
        dataset_id=dataset_id,
        project_id=project_id,
    )

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = create_base_feature_set_query
    bq_client.create_table(base_feature_set_view)


### 5.create_input_account_hs_usage_view

In [None]:
def create_input_account_hs_usage_view(
    view_name: str,
    v_report_date: str,
    v_start_date: str,
    v_end_date: str,
    v_bill_year: str,
    v_bill_month: str,
    dataset_id: str,
    project_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create.
        v_report_date: Passed to the template as ``v_report_date``.
        v_start_date: Passed to the template as ``v_start_date``.
        v_end_date: Passed to the template as ``v_end_date``.
        v_bill_year: Passed to the template as ``v_bill_year``.
        v_bill_month: Passed to the template as ``v_bill_month``.
        dataset_id: Dataset that receives the view.
        project_id: Project of the BigQuery client.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        v_report_date=v_report_date,
        v_start_date=v_start_date,
        v_end_date=v_end_date,
        v_bill_year=v_bill_year,
        v_bill_month=v_bill_month,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)


### 6.create_input_account_demo_income_view

In [None]:
def create_input_account_demo_income_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    dataset_id: str,
    project_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that receives the view. It is NOT the value given
            to the template, which always gets ``'common_dataset'``.
        project_id: Project of the BigQuery client; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    # NOTE(review): the template's dataset_id is hard-coded to
    # 'common_dataset' rather than the dataset_id argument -- presumably
    # intentional; confirm.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        project_id=project_id,
        dataset_id='common_dataset',
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)


### 7.create_input_account_gpon_copper_view

In [None]:
def create_input_account_gpon_copper_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    dataset_id: str,
    project_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that receives the view.
        project_id: Project of the BigQuery client.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)


### 8.create_input_account_price_plan_view

In [None]:
def create_input_account_price_plan_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    project_id: str,
    dataset_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create; also passed to the template.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client; also passed to the template.
        dataset_id: Dataset that receives the view; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        view_name=view_name,
        dataset_id=dataset_id,
        project_id=project_id,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)


### 9.create_input_account_clckstrm_telus_view

In [None]:
def create_input_account_clckstrm_telus_view(
    view_name: str,
    score_date: str,
    score_date_delta: str,
    project_id: str,
    dataset_id: str,
    region: str,
    resource_bucket: str,
    query_path: str,
):
    """(Re)create the BigQuery view ``view_name`` from a SQL template kept in GCS.

    The object ``query_path`` in ``resource_bucket`` is rendered with
    ``str.format`` and installed as the view query in dataset ``dataset_id``.
    A table or view with the same name is deleted first.

    Args:
        view_name: Name of the view to create; also passed to the template.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client; also passed to the template.
        dataset_id: Dataset that receives the view; also passed to the template.
        region: Not used by this function.
        resource_bucket: Bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If the template object does not exist in the bucket.
    """
    import os

    from google.cloud import bigquery
    from google.cloud import storage
    from google.oauth2 import credentials as oauth2_credentials

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Build exactly one client, and close the gcloud pipe once it is read.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        return bigquery.Client(project=project_id,
                               credentials=oauth2_credentials.Credentials(token))

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template from the resource bucket.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:  # get_blob signals a missing object with None
        raise FileNotFoundError(
            f'Query template gs://{resource_bucket}/{query_path} was not found')
    content = blob.download_as_string().decode('utf-8')

    # Render before touching BigQuery so a template error cannot leave the
    # existing view deleted with nothing to replace it.
    create_base_feature_set_query = content.format(
        score_date=score_date,
        score_date_delta=score_date_delta,
        view_name=view_name,
        dataset_id=dataset_id,
        project_id=project_id,
    )
    # NOTE(review): second, positional format pass kept from the original. It
    # only has an effect if braces remain after the first render -- confirm
    # whether any template relies on it.
    view_query = create_base_feature_set_query.format(project_id)

    bq_client.delete_table(view_ref, not_found_ok=True)

    base_feature_set_view = bigquery.Table(view_ref)
    base_feature_set_view.view_query = view_query
    bq_client.create_table(base_feature_set_view)



### 10.create_input_account_call_history_view

In [None]:
def create_input_account_call_history_view(view_name: str,
                                        score_date: str,
                                        score_date_delta: str,
                                        project_id: str,
                                        dataset_id: str,
                                        region: str,
                                        resource_bucket: str,
                                        query_path: str,
                                        ):
    """Recreate a BigQuery view from a SQL template stored in Cloud Storage.

    The template is downloaded from ``gs://{resource_bucket}/{query_path}``,
    filled in with ``str.format`` and installed as the view
    ``{project_id}.{dataset_id}.{view_name}``. Any existing table or view with
    that name is deleted first.

    Args:
        view_name: Name of the view to (re)create; also passed to the template.
        score_date: Value substituted for ``{score_date}`` in the template.
        score_date_delta: Value substituted for ``{score_date_delta}``.
        project_id: GCP project that owns the dataset; also passed to the template.
        dataset_id: BigQuery dataset that holds the view; also passed to the template.
        region: Not used by this function; kept for call compatibility.
        resource_bucket: Cloud Storage bucket that holds the query template.
        query_path: Object path of the query template inside ``resource_bucket``.

    Raises:
        FileNotFoundError: If the query template object does not exist.
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    # Imported locally so the function does not depend on notebook globals.
    import os

    import google.oauth2.credentials
    from google.cloud import bigquery
    from google.cloud import storage
    from google.cloud.exceptions import NotFound

    def if_tbl_exists(client, table_ref):
        """Return True when ``table_ref`` can be fetched, False on NotFound."""
        try:
            client.get_table(table_ref)
        except NotFound:
            return False
        return True

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default from the local gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Only shell out to gcloud when the token is actually needed, and do
        # not construct a default-credential client that is thrown away.
        token = os.popen('gcloud auth print-access-token').read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "check that gcloud is installed and authenticated")
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    table_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the query template from Cloud Storage.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    if blob is None:
        # get_blob() returns None for a missing object; fail with a clear message.
        raise FileNotFoundError(
            f"query template gs://{resource_bucket}/{query_path} not found")
    content = blob.download_as_string().decode('utf-8')

    # Render the SQL before touching BigQuery so that a bad template does not
    # leave the dataset without the view.
    view_sql = content.format(score_date=score_date,
                              score_date_delta=score_date_delta,
                              view_name=view_name,
                              dataset_id=dataset_id,
                              project_id=project_id,
                              )
    # NOTE(review): the original code formats the rendered SQL a second time
    # with project_id as a positional argument. It is kept in case a template
    # relies on it, but it means literal braces cannot appear in the rendered
    # SQL -- confirm against the query templates and drop it if unused.
    view_sql = view_sql.format(project_id)

    if if_tbl_exists(bq_client, table_ref):
        bq_client.delete_table(table_ref)

    view = bigquery.Table(table_ref)
    view.view_query = view_sql
    bq_client.create_table(view)


### Preprocess

In [None]:
def preprocess(
        promo_expiry_list_view: str,
        account_consl_view: str,
        account_bill_view: str,
        account_discounts_view: str,
        hs_usage_view: str,
        demo_income_view: str,
        gpon_copper_view: str,
        price_plan_view: str,
        clckstrm_telus_view: str,
        call_history_view: str,
        save_data_path: str,
        project_id: str,
        dataset_id: str
):
    """Join the per-account feature views into one table and save it as gzipped CSV.

    Each view is read in full from ``{project_id}.{dataset_id}`` and left-joined
    on ``ban`` onto the promo-expiry list, which therefore defines the set of
    rows in the output. Columns from most sources get a source prefix
    (``productMix_``, ``ffhBill_``, ``ffhDiscounts_``, ``hsiaUsage_``,
    ``demographics_``, ``infra_``, ``clckstrmData_``, ``callHistory_``).

    Args:
        promo_expiry_list_view: View providing the base list of accounts.
        account_consl_view: View with product-mix columns.
        account_bill_view: View with billing columns.
        account_discounts_view: View with discount columns.
        hs_usage_view: View with high-speed usage columns.
        demo_income_view: View with ``demo_sgname``, ``demo_lsname`` and
            ``demo_avg_income`` columns.
        gpon_copper_view: View with infrastructure columns.
        price_plan_view: View with a ``price_plan`` column.
        clckstrm_telus_view: View with clickstream columns.
        call_history_view: View with ``frequency``, ``have_called`` and
            ``recency`` columns.
        save_data_path: Destination passed to ``DataFrame.to_csv``.
        project_id: GCP project that owns the dataset.
        dataset_id: BigQuery dataset that holds the views.

    Raises:
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    # Imported locally so the function does not depend on notebook globals.
    import gc
    import os
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default from the local gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Only shell out to gcloud when the token is actually needed, and do
        # not construct a default-credential client that is thrown away.
        token = os.popen('gcloud auth print-access-token').read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "check that gcloud is installed and authenticated")
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    def load_view(view_name):
        """Read every row of a view in the feature dataset into a DataFrame."""
        table = f"{project_id}.{dataset_id}.{view_name}"
        return client.query(f"SELECT * FROM `{table}`").to_dataframe()

    def join_prefixed(df_left, view_name, prefix):
        """Left-join a view indexed by ban, with prefixed columns, filling gaps with 0."""
        df_right = load_view(view_name).set_index('ban').add_prefix(prefix)
        return df_left.join(df_right).fillna(0)

    def clean_dummy_columns(df_dummies):
        """Make one-hot column names identifier-friendly ('&' -> 'and', ' ' -> '_')."""
        df_dummies.columns = df_dummies.columns.str.replace('&', 'and')
        df_dummies.columns = df_dummies.columns.str.replace(' ', '_')
        return df_dummies

    #1.df_promo_expiry_list -- the base table every other source is joined onto
    df_join = load_view(promo_expiry_list_view).set_index('ban')
    print('......df_promo_expiry_list done')

    #2.df_consl
    df_consl = load_view(account_consl_view)
    df_mix = df_consl[[
        'ban',
        'customer_tenure',
        'product_mix_all',
        'hsic_count',
        'ttv_count',
        'sing_count',
        'mob_count',
        'shs_count',
        'new_hsic_ind',
        'new_ttv_ind',
        'new_sing_ind',
        'new_c_ind',
        'new_smhm_ind',
        'mnh_ind'
    ]]
    # Keep the first row per ban so the join cannot multiply rows.
    df_mix = df_mix.drop_duplicates(subset=['ban']).set_index('ban').add_prefix('productMix_').fillna(0)
    df_join = df_join.join(df_mix)
    # Release the full consolidated frame as well as the projection.
    del df_mix, df_consl
    gc.collect()
    print('......df_consl done')

    #3.df_bill
    df_join = join_prefixed(df_join, account_bill_view, 'ffhBill_')
    gc.collect()
    print('......df_bill done')

    #4.df_discounts
    df_join = join_prefixed(df_join, account_discounts_view, 'ffhDiscounts_')
    gc.collect()
    print('......df_discounts done')

    #5.df_hs_usage
    df_join = join_prefixed(df_join, hs_usage_view, 'hsiaUsage_')
    gc.collect()
    print('......df_hs_usage done')

    #6.df_income
    df_income = load_view(demo_income_view).set_index('ban')
    sgname_lower = df_income.demo_sgname.str.lower()
    lsname_lower = df_income.demo_lsname.str.lower()
    df_income['demo_urban_flag'] = sgname_lower.str.contains('urban').fillna(0).astype(int)
    df_income['demo_rural_flag'] = sgname_lower.str.contains('rural').fillna(0).astype(int)
    df_income['demo_family_flag'] = lsname_lower.str.contains('families').fillna(0).astype(int)
    df_income_dummies = clean_dummy_columns(pd.get_dummies(df_income[['demo_lsname']]))
    df_income = df_income[['demo_avg_income', 'demo_urban_flag', 'demo_rural_flag', 'demo_family_flag']].join(
        df_income_dummies)
    df_income.demo_avg_income = df_income.demo_avg_income.astype(float)
    df_income.demo_avg_income = df_income.demo_avg_income.fillna(df_income.demo_avg_income.median())
    # A ban may have several demographic rows; average them to one row per ban.
    df_group_income = df_income.groupby('ban').agg('mean')
    df_group_income = df_group_income.add_prefix('demographics_')
    # Unlike the other sources, bans without a match are left as NaN here and
    # are only zero-filled by the fillna(0) of the next join.
    df_join = df_join.join(df_group_income.fillna(df_group_income.median()))
    del df_group_income, df_income, df_income_dummies
    gc.collect()
    print('......df_income done')

    #7.df_gpon_copper
    df_join = join_prefixed(df_join, gpon_copper_view, 'infra_')
    gc.collect()
    print('......df_gpon_copper done')

    #8.df_price_plan
    df_price_plan = load_view(price_plan_view).set_index('ban')
    df_pp_dummies = clean_dummy_columns(pd.get_dummies(df_price_plan[['price_plan']]))
    df_price_plan = df_price_plan.join(df_pp_dummies)
    df_price_plan = df_price_plan.drop(columns=['price_plan'])
    print(df_price_plan.columns)
    # NOTE(review): price-plan columns share the 'infra_' prefix with the
    # gpon/copper columns; kept as-is because the prefix is part of the
    # feature names -- confirm this is intended.
    df_join = df_join.join(df_price_plan.add_prefix('infra_')).fillna(0)
    del df_price_plan, df_pp_dummies
    gc.collect()
    print('......df_price_plan done')

    #9.df_clckstrm_telus
    df_join = join_prefixed(df_join, clckstrm_telus_view, 'clckstrmData_')
    gc.collect()
    print('......df_clckstrm_telus done')

    #10.df_call_history
    df_call_history = load_view(call_history_view).set_index('ban').add_prefix('callHistory_')
    df_join = df_join.join(df_call_history)
    # Accounts without call history: zero frequency / have_called, and a
    # recency of 999.
    df_join[['callHistory_frequency', 'callHistory_have_called']] = df_join[['callHistory_frequency', 'callHistory_have_called']].fillna(0)
    df_join[['callHistory_recency']] = df_join[['callHistory_recency']].fillna(999)
    del df_call_history
    gc.collect()
    print('......df_call_history done')

    #column name clean-up
    df_join.columns = df_join.columns.str.replace(' ', '_')
    df_join.columns = df_join.columns.str.replace('-', '_')

    #df_final
    df_final = df_join.copy()
    del df_join
    gc.collect()
    print('......df_final done')

    # Rebuild every column from a plain Python list.
    # NOTE(review): presumably this coerces BigQuery-specific dtypes into
    # ordinary pandas dtypes before writing -- confirm.
    for f in df_final.columns:
        df_final[f] = list(df_final[f])

    df_final.to_csv(save_data_path, index=True, compression='gzip')
    del df_final
    gc.collect()
    print(f'......csv saved in {save_data_path}')
    # NOTE(review): fixed two-minute pause kept from the original; its purpose
    # is not visible in this function.
    time.sleep(120)


### Train and Save Model

In [None]:
def train_and_save_model(
            resource_bucket: str,
            service_type: str,
            score_date_dash: str,
            score_date_val_dash: str,
            project_id: str,
            dataset_id: str
            # metrics: Output[Metrics],
            # metricsc: Output[ClassificationMetrics]
):
    """Train an XGBoost classifier on the preprocessed features and store it in GCS.

    Reads ``gs://{resource_bucket}/{service_type}_train.csv.gz`` and
    ``..._validation.csv.gz``, attaches ``target_ind`` from
    ``{project_id}.{dataset_id}.bq_call_to_retention_targets`` for the month of
    each score date, fits the model on 80% of the training data, prints the AUC
    on the remaining 20%, writes a decile lift table for the validation file to
    ``gs://{resource_bucket}/lift_on_scoring_data_{create_time}.csv`` and
    uploads a pickle holding the model, its feature list and the creation time
    under ``{service_type}_xgb_models/``.

    Args:
        resource_bucket: Bucket holding the input CSVs and receiving the outputs.
        service_type: Prefix of the input CSV names and of the model folder.
        score_date_dash: Training score date, ``YYYY-MM-DD``; its ``YYYY-MM``
            part selects the training targets.
        score_date_val_dash: Validation score date, ``YYYY-MM-DD``; its
            ``YYYY-MM`` part selects the validation targets.
        project_id: GCP project that owns the targets table.
        dataset_id: BigQuery dataset that holds the targets table.

    Raises:
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    # Imported locally so the function does not depend on notebook globals.
    import gc
    import os
    import pickle
    import time
    from datetime import datetime

    import google.oauth2.credentials
    import pandas as pd
    import xgboost as xgb
    from google.cloud import bigquery
    from google.cloud import storage
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    def get_lift(prob, y_test, q):
        """Return per-quantile mean predicted rate, mean actual rate and lift."""
        result = pd.DataFrame(columns=['Prob', 'Call_To_Retention'])
        result['Prob'] = prob
        result['Call_To_Retention'] = y_test
        # Labels run q..1 so that quantile 1 holds the highest probabilities.
        result['Decile'] = pd.qcut(result['Prob'], q, labels=[i for i in range(q, 0, -1)])
        add = pd.DataFrame(result.groupby('Decile')['Call_To_Retention'].mean()).reset_index()
        add.columns = ['Decile', 'avg_real_call_to_retention_rate']
        lg = pd.DataFrame(result.groupby('Decile')['Prob'].mean()).reset_index()
        lg.columns = ['Decile', 'avg_model_pred_call_to_retention_rate']
        lg.sort_values('Decile', ascending=False, inplace=True)
        lg['avg_call_to_retention_rate_total'] = result['Call_To_Retention'].mean()
        lg = lg.merge(add, on='Decile', how='left')
        lg['lift'] = lg['avg_real_call_to_retention_rate'] / lg['avg_call_to_retention_rate_total']

        return lg

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default from the local gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Only shell out to gcloud when the token is actually needed, and do
        # not construct a default-credential client that is thrown away.
        token = os.popen('gcloud auth print-access-token').read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "check that gcloud is installed and authenticated")
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    def attach_target(df_features, df_targets, score_date):
        """Inner-match features with the targets of score_date's month as 'target'."""
        year_month = '-'.join(score_date.split('-')[:2])  # '2022-08-31' -> '2022-08'
        # .copy() so the dtype change below does not write into a slice.
        monthly = df_targets.loc[df_targets['YEAR_MONTH'] == year_month].copy()
        monthly['ban'] = monthly['ban'].astype('int64')
        # Keep the last target row per ban.
        monthly = monthly.groupby('ban').tail(1)
        merged = df_features.merge(monthly[['ban', 'target_ind']], on='ban', how='left')
        return (merged.rename(columns={'target_ind': 'target'})
                      .dropna(subset=['target'])
                      .astype({'target': int}))

    df_train = pd.read_csv('gs://{}/{}_train.csv.gz'.format(resource_bucket, service_type),
                           compression='gzip')
    df_test = pd.read_csv('gs://{}/{}_validation.csv.gz'.format(resource_bucket, service_type),
                          compression='gzip')

    client = get_gcp_bqclient(project_id)

    # The training and validation targets come from the same table, so it is
    # queried once and filtered per score date.
    sql_targets = ''' SELECT * FROM `{}.{}.bq_call_to_retention_targets` '''.format(project_id, dataset_id)
    df_targets = client.query(sql_targets).to_dataframe()

    #set up df_train
    df_train = attach_target(df_train, df_targets, score_date_dash)
    print(df_train.shape)

    #set up df_test
    df_test = attach_target(df_test, df_targets, score_date_val_dash)
    print(df_test.shape)

    del df_targets
    gc.collect()

    #set up features (list)
    # Use the columns common to both frames, in df_train's column order, so
    # the feature order is the same on every run (iterating a set is not).
    shared_cols = set(df_train.columns).intersection(df_test.columns)
    features = [c for c in df_train.columns
                if c in shared_cols and c not in ('ban', 'target')]

    #train test split
    df_train, df_val = train_test_split(df_train, shuffle=True, test_size=0.2, random_state=42,
                                        stratify=df_train['target']
                                        )

    X_train = df_train[features]
    y_train = df_train['target'].to_numpy()

    X_val = df_val[features]
    y_val = df_val['target'].to_numpy()

    X_test = df_test[features]
    y_test = df_test['target'].to_numpy()

    del df_train, df_val, df_test
    gc.collect()

    # build model and fit in training data
    xgb_model = xgb.XGBClassifier(
        learning_rate=0.01,
        n_estimators=100,
        max_depth=8,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1
        # seed=27
    )

    xgb_model.fit(X_train, y_train)
    print('xgb training done')

    #predictions on X_val
    # No early stopping is used, so predict with all trees; the former
    # ntree_limit=best_iteration argument is not valid in this setting.
    y_pred = xgb_model.predict_proba(X_val)[:, 1]
    # AUC is a ranking metric and must be computed on the probabilities, not
    # on labels thresholded at 0.5.
    auc = roc_auc_score(y_val, y_pred)
    # The KFP Metrics output is commented out of the signature, so report the
    # value on stdout instead of calling an undefined `metrics` object.
    print(f'validation AUC: {auc}')

    # Timestamp shared by the lift report and the saved model; it has to exist
    # before the report path is built.
    create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    pred_prb = xgb_model.predict_proba(X_test)[:, 1]
    lg = get_lift(pred_prb, y_test, 10)
    lg.to_csv('gs://{}/lift_on_scoring_data_{}.csv'.format(resource_bucket, create_time), index=False)

    # save the model in GCS
    models_dict = {}
    models_dict['create_time'] = create_time
    models_dict['model'] = xgb_model
    models_dict['features'] = features

    with open('model_dict.pkl', 'wb') as handle:
        pickle.dump(models_dict, handle)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)

    # Create an empty placeholder object for the model folder if it is missing.
    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    blob = bucket.blob(MODEL_PATH)
    if not blob.exists(storage_client):
        blob.upload_from_string('')

    model_name_onbkt = '{}{}_models_xgb_{}'.format(MODEL_PATH, service_type, models_dict['create_time'])
    blob = bucket.blob(model_name_onbkt)
    blob.upload_from_filename('model_dict.pkl')

    print(f"....model loaded to GCS done at {str(create_time)}")

    # NOTE(review): fixed two-minute pause kept from the original; its purpose
    # is not visible in this function.
    time.sleep(120)


### Pipeline

In [None]:

# @dsl.pipeline(
#     # A name for the pipeline.
#     name="{}-xgb-pipeline".format(SERVICE_TYPE_NAME),
#     description=' pipeline for training {} model'.format(SERVICE_TYPE_NAME)
# )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET
        # file_bucket: str = FILE_BUCKET
    ):
    # ------------- train view ops ---------------
    #1.create_input_account_promo_expiry_list_view
    create_input_account_promo_expiry_list_view_op = create_input_account_promo_expiry_list_view(
        view_name=PROMO_EXPIRY_LIST_VIEW_NAME,
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        promo_expiry_start = PROMO_EXPIRY_START, 
        promo_expiry_end = PROMO_EXPIRY_END,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_PROMO_EXPIRY_LIST_QUERY_PATH
    )
    # create_input_account_promo_expiry_list_view_op.set_memory_limit('16G')
    # create_input_account_promo_expiry_list_view_op.set_cpu_limit('4')

    #2.create_input_account_consl_view
    create_input_account_consl_view_op = create_input_account_consl_view(
        view_name=CONSL_VIEW_NAME,
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CONSL_QUERY_PATH
    )
    # create_input_account_consl_view_op.set_memory_limit('16G')
    # create_input_account_consl_view_op.set_cpu_limit('4')

    #3.create_input_account_ffh_billing_view
    create_input_account_ffh_billing_view_op = create_input_account_ffh_billing_view(
        v_report_date=SCORE_DATE_DASH,
        v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_LAST_MONTH_END_DASH,
        v_bill_year=SCORE_DATE_LAST_MONTH_YEAR,
        v_bill_month=SCORE_DATE_LAST_MONTH_MONTH,
        view_name=FFH_BILLING_VIEW_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_FFH_BILLING_QUERY_PATH 
    )

    # create_input_account_ffh_billing_view_op.set_memory_limit('16G')
    # create_input_account_ffh_billing_view_op.set_cpu_limit('4')

    #4.create_input_account_ffh_discounts_view
    create_input_account_ffh_discounts_view_op = create_input_account_ffh_discounts_view(
        view_name=FFH_DISCOUNTS_VIEW_NAME,
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_FFH_DISCOUNTS_QUERY_PATH
    )

    # create_input_account_ffh_discounts_view_op.set_memory_limit('16G')
    # create_input_account_ffh_discounts_view_op.set_cpu_limit('4')

    #5.create_input_account_hs_usage_view
    create_input_account_hs_usage_view_op = create_input_account_hs_usage_view(
        v_report_date=SCORE_DATE_DASH,
        v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_LAST_MONTH_END_DASH,
        v_bill_year=SCORE_DATE_LAST_MONTH_YEAR,
        v_bill_month=SCORE_DATE_LAST_MONTH_MONTH,
        view_name=HS_USAGE_VIEW_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_HS_USAGE_QUERY_PATH 
    )

    # create_input_account_hs_usage_view_op.set_memory_limit('16G')
    # create_input_account_hs_usage_view_op.set_cpu_limit('4')

    #6.create_input_account_demo_income_view
    create_input_account_demo_income_view_op = create_input_account_demo_income_view(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        view_name=DEMO_INCOME_VIEW_NAME ,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_DEMO_INCOME_QUERY_PATH 
    )

    # create_input_account_demo_income_view_op.set_memory_limit('16G')
    # create_input_account_demo_income_view_op.set_cpu_limit('4')

    #7.create_input_account_gpon_copper_view
    create_input_account_gpon_copper_view_op = create_input_account_gpon_copper_view(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        view_name=GPON_COPPER_VIEW_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_GPON_COPPER_QUERY_PATH 
    )

    # create_input_account_gpon_copper_view_op.set_memory_limit('16G')
    # create_input_account_gpon_copper_view_op.set_cpu_limit('4')

    #8.create_input_account_price_plan_view
    create_input_account_price_plan_view_op = create_input_account_price_plan_view(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        view_name=PRICE_PLAN_VIEW_NAME ,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_PRICE_PLAN_QUERY_PATH 
    )

    # create_input_account_price_plan_view_op.set_memory_limit('16G')
    # create_input_account_price_plan_view_op.set_cpu_limit('4')

    #9.create_input_account_clckstrm_telus_view
    create_input_account_clckstrm_telus_view_op = create_input_account_clckstrm_telus_view(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        view_name=CLCKSTRM_TELUS_VIEW_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CLCKSTRM_TELUS_QUERY_PATH
    )

    # create_input_account_clckstrm_telus_view_op.set_memory_limit('16G')
    # create_input_account_clckstrm_telus_view_op.set_cpu_limit('4')

    #10.create_input_account_call_history_view
    create_input_account_call_history_view_op = create_input_account_call_history_view(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
        view_name=CALL_HISTORY_VIEW_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CALL_HISTORY_QUERY_PATH 
    )

    # create_input_account_call_history_view_op.set_memory_limit('16G')
    # create_input_account_call_history_view_op.set_cpu_limit('4')

    # ----- preprocessing train data --------
    preprocess_train_op = preprocess(
        promo_expiry_list_view = PROMO_EXPIRY_LIST_VIEW_NAME, 
        account_consl_view=CONSL_VIEW_NAME,
        account_bill_view=FFH_BILLING_VIEW_NAME,
        account_discounts_view=FFH_DISCOUNTS_VIEW_NAME, 
        hs_usage_view=HS_USAGE_VIEW_NAME,
        demo_income_view=DEMO_INCOME_VIEW_NAME,
        gpon_copper_view=GPON_COPPER_VIEW_NAME,
        price_plan_view=PRICE_PLAN_VIEW_NAME,
        clckstrm_telus_view=CLCKSTRM_TELUS_VIEW_NAME, 
        call_history_view=CALL_HISTORY_VIEW_NAME, 
        save_data_path='gs://{}/{}_train.csv.gz'.format(RESOURCE_BUCKET, SERVICE_TYPE),
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID
    )

    # preprocess_train_op.set_memory_limit('128G')
    # preprocess_train_op.set_cpu_limit('32')

    create_input_account_promo_expiry_list_view_op
    create_input_account_consl_view_op
    create_input_account_ffh_billing_view_op
    create_input_account_ffh_discounts_view_op
    create_input_account_hs_usage_view_op
    create_input_account_demo_income_view_op
    create_input_account_gpon_copper_view_op
    create_input_account_price_plan_view_op
    create_input_account_clckstrm_telus_view_op
    create_input_account_call_history_view_op
    preprocess_train_op

    # --------------- validation view ops ---------------
    #1.create_input_account_promo_expiry_list_view
    create_input_account_promo_expiry_list_validation_view_op = create_input_account_promo_expiry_list_view(
        view_name=PROMO_EXPIRY_LIST_VIEW_VALIDATION_NAME,
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        promo_expiry_start = PROMO_EXPIRY_START_VAL, 
        promo_expiry_end = PROMO_EXPIRY_END_VAL, 
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_PROMO_EXPIRY_LIST_QUERY_PATH
    )
    # create_input_account_promo_expiry_list_validation_view_op.set_memory_limit('16G')
    # create_input_account_promo_expiry_list_validation_view_op.set_cpu_limit('4')

    #2.create_input_account_consl_view
    create_input_account_consl_validation_view_op = create_input_account_consl_view(
        view_name=CONSL_VIEW_VALIDATION_NAME, 
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CONSL_QUERY_PATH
    )
    # create_input_account_consl_validation_view_op.set_memory_limit('16G')
    # create_input_account_consl_validation_view_op.set_cpu_limit('4')

    #3.create_input_account_ffh_billing_view
    create_input_account_ffh_billing_validation_view_op = create_input_account_ffh_billing_view(
        v_report_date=SCORE_DATE_VAL_DASH,
        v_start_date=SCORE_DATE_VAL_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_VAL_LAST_MONTH_END_DASH,
        v_bill_year=SCORE_DATE_VAL_LAST_MONTH_YEAR,
        v_bill_month=SCORE_DATE_VAL_LAST_MONTH_MONTH,
        view_name=FFH_BILLING_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_FFH_BILLING_QUERY_PATH 
    )

    # create_input_account_ffh_billing_validation_view_op.set_memory_limit('16G')
    # create_input_account_ffh_billing_validation_view_op.set_cpu_limit('4')

    #4.create_input_account_ffh_discounts_view
    create_input_account_ffh_discounts_validation_view_op = create_input_account_ffh_discounts_view(
        view_name=FFH_DISCOUNTS_VIEW_VALIDATION_NAME, 
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_FFH_DISCOUNTS_QUERY_PATH
    )

    # create_input_account_ffh_discounts_validation_view_op.set_memory_limit('16G')
    # create_input_account_ffh_discounts_validation_view_op.set_cpu_limit('4')

    #5.create_input_account_hs_usage_view
    create_input_account_hs_usage_validation_view_op = create_input_account_hs_usage_view(
        v_report_date=SCORE_DATE_VAL_DASH,
        v_start_date=SCORE_DATE_VAL_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_VAL_LAST_MONTH_END_DASH,
        v_bill_year=SCORE_DATE_VAL_LAST_MONTH_YEAR,
        v_bill_month=SCORE_DATE_VAL_LAST_MONTH_MONTH,
        view_name=HS_USAGE_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_HS_USAGE_QUERY_PATH 
    )

    # create_input_account_hs_usage_validation_view_op.set_memory_limit('16G')
    # create_input_account_hs_usage_validation_view_op.set_cpu_limit('4')

    #6.create_input_account_demo_income_view
    create_input_account_demo_income_validation_view_op = create_input_account_demo_income_view(
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        view_name=DEMO_INCOME_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_DEMO_INCOME_QUERY_PATH
    )

    # create_input_account_demo_income_validation_view_op.set_memory_limit('16G')
    # create_input_account_demo_income_validation_view_op.set_cpu_limit('4')

    #7.create_input_account_gpon_copper_view
    create_input_account_gpon_copper_validation_view_op = create_input_account_gpon_copper_view(
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        view_name=GPON_COPPER_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_GPON_COPPER_QUERY_PATH
    )

    # create_input_account_gpon_copper_validation_view_op.set_memory_limit('16G')
    # create_input_account_gpon_copper_validation_view_op.set_cpu_limit('4')

    #8.create_input_account_price_plan_view
    create_input_account_price_plan_validation_view_op = create_input_account_price_plan_view(
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        view_name=PRICE_PLAN_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_PRICE_PLAN_QUERY_PATH
    )

    # create_input_account_price_plan_validation_view_op.set_memory_limit('16G')
    # create_input_account_price_plan_validation_view_op.set_cpu_limit('4')

    #9.create_input_account_clckstrm_telus_view
    create_input_account_clckstrm_telus_validation_view_op = create_input_account_clckstrm_telus_view(
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        view_name=CLCKSTRM_TELUS_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CLCKSTRM_TELUS_QUERY_PATH
    )

    # create_input_account_clckstrm_telus_validation_view_op.set_memory_limit('16G')
    # create_input_account_clckstrm_telus_validation_view_op.set_cpu_limit('4')

    #10.create_input_account_call_history_view
    create_input_account_call_history_validation_view_op = create_input_account_call_history_view(
        score_date=SCORE_DATE_VAL,
        score_date_delta=SCORE_DATE_VAL_DELTA,
        view_name=CALL_HISTORY_VIEW_VALIDATION_NAME,
        dataset_id=DATASET_ID,
        project_id=PROJECT_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=ACCOUNT_CALL_HISTORY_QUERY_PATH 
    )

    # create_input_account_call_history_validation_view_op.set_memory_limit('16G')
    # create_input_account_call_history_validation_view_op.set_cpu_limit('4')

    # ----- preprocessing validation data --------
    preprocess_validation_op = preprocess(
        promo_expiry_list_view = PROMO_EXPIRY_LIST_VIEW_VALIDATION_NAME, 
        account_consl_view=CONSL_VIEW_VALIDATION_NAME,
        account_bill_view=FFH_BILLING_VIEW_VALIDATION_NAME,
        account_discounts_view=FFH_DISCOUNTS_VIEW_VALIDATION_NAME, 
        hs_usage_view=HS_USAGE_VIEW_VALIDATION_NAME,
        demo_income_view=DEMO_INCOME_VIEW_VALIDATION_NAME,
        gpon_copper_view=GPON_COPPER_VIEW_VALIDATION_NAME,
        price_plan_view=PRICE_PLAN_VIEW_VALIDATION_NAME,
        clckstrm_telus_view=CLCKSTRM_TELUS_VIEW_VALIDATION_NAME, 
        call_history_view=CALL_HISTORY_VIEW_VALIDATION_NAME, 
        save_data_path='gs://{}/{}_validation.csv.gz'.format(RESOURCE_BUCKET, SERVICE_TYPE),
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID
    )

    # preprocess_validation_op.set_memory_limit('256G')
    # preprocess_validation_op.set_cpu_limit('32')

    create_input_account_promo_expiry_list_validation_view_op
    create_input_account_consl_validation_view_op
    create_input_account_ffh_billing_validation_view_op
    create_input_account_ffh_discounts_validation_view_op
    create_input_account_hs_usage_validation_view_op
    create_input_account_demo_income_validation_view_op
    create_input_account_gpon_copper_validation_view_op
    create_input_account_price_plan_validation_view_op
    create_input_account_clckstrm_telus_validation_view_op
    create_input_account_call_history_validation_view_op
    preprocess_train_op

    train_and_save_model_op = train_and_save_model(resource_bucket=RESOURCE_BUCKET,
                                                   service_type=SERVICE_TYPE,
                                                   score_date_dash=SCORE_DATE_DASH,
                                                   score_date_val_dash=SCORE_DATE_VAL_DASH,
                                                   project_id=PROJECT_ID,
                                                   dataset_id=DATASET_ID,
                                                   )
    
    train_and_save_model_op
    
#     train_and_save_model_op.set_memory_limit('256G')
#     train_and_save_model_op.set_cpu_limit('32')

#     train_and_save_model_op.after(preprocess_train_op)
#     train_and_save_model_op.after(preprocess_validation_op)


### Run the Pipeline Job

In [None]:
# pipeline(project_id = PROJECT_ID,
#         region = REGION,
#         resource_bucket = RESOURCE_BUCKET,
#         file_bucket = FILE_BUCKET)


# Invoke the pipeline function directly with the notebook-level parameters
# (the compile-and-submit path in the following cell is currently disabled).
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
)

In [None]:
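# Disabled cell: compiles `pipeline` to "pipeline.json" and submits it as a
# `pipeline_jobs.PipelineJob`.
# NOTE(review): `PIPELINE_NAME` used below is not among the parameters shown at
# the top of this notebook (TRAIN_PIPELINE_NAME / PREDICT_PIPELINE_NAME are) —
# confirm the intended name before re-enabling.
# from kfp.v2 import compiler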
# from google.cloud.aiplatform import pipeline_jobs

# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                display_name=PIPELINE_NAME,
#                                template_path="pipeline.json",
#                                location=REGION,
#                                enable_caching=False,
#                                pipeline_root = f"gs://{RESOURCE_BUCKET}"
# )
# job.run(
#    service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com"
# )