### Import Libraries

In [None]:
# import required libraries
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


### Parameters

In [None]:
# tag cell with parameters
# Placeholder values for the run parameters; only MODEL_ID has a non-empty default.
PROJECT_ID = ""
BUCKET_NAME = ""
DATASET_ID = ""
RESOURCE_BUCKET = ""
FILE_BUCKET = ""
REGION = ""
MODEL_ID = "5090"

In [None]:
# tag cell with parameters
# Concrete project/bucket/dataset values; the same bucket serves as the
# default, resource and file bucket.
PROJECT_ID = "divg-josh-pr-d1cc3a"
BUCKET_NAME = "divg-josh-pr-d1cc3a-default"
DATASET_ID = "call_to_retention_dataset"
RESOURCE_BUCKET = "divg-josh-pr-d1cc3a-default"
FILE_BUCKET = "divg-josh-pr-d1cc3a-default"
MODEL_ID = "5090"

### Service Parameters

In [None]:
# Service identifiers: underscore form, hyphen form, target table and region.
SERVICE_TYPE = "call_to_retention"
SERVICE_TYPE_NAME = "call-to-retention"
TABLE_ID = "bq_call_to_retention_targets"
REGION = 'northamerica-northeast1'

### Pulumi Parameters

In [None]:
# Stack name and the path segments used for the train / predict pipelines.
STACK_NAME = "call_to_retention"
TRAIN_PIPELINE_NAME_PATH = "train_pipeline"
PREDICT_PIPELINE_NAME_PATH = "predict_pipeline"

# Pipeline names -- same name as pulumi.yaml
TRAIN_PIPELINE_NAME = "call-to-retention-train-pipeline"
PREDICT_PIPELINE_NAME = "call-to-retention-predict-pipeline"

# Descriptions currently repeat the pipeline names.
TRAIN_PIPELINE_DESCRIPTION = "call-to-retention-train-pipeline"
PREDICT_PIPELINE_DESCRIPTION = "call-to-retention-predict-pipeline"

REGION = 'northamerica-northeast1'

### Query + Pre-Processing Component Parameters

In [None]:
# Locations derived from the stack name and the train-pipeline path segment.
TRAIN_QUERIES_PATH = "{}/{}/queries/".format(STACK_NAME, TRAIN_PIPELINE_NAME_PATH)
TRAIN_UTILS_FILE_PATH = "{}/{}/utils".format(STACK_NAME, TRAIN_PIPELINE_NAME_PATH)
UTILS_FILENAME = "utils.py"

PROCESSED_SERVING_DATA_TABLENAME = "processed_serving_data"
INPUT_SERVING_DATA_TABLENAME = "input_serving_data"

# Yesterday's date as YYYY-MM-DD.
QUERY_DATE = (date.today() - relativedelta(days=1)).strftime('%Y-%m-%d')
# Fully qualified <project>.<dataset>.<table> reference of the target table.
TARGET_TABLE_REF = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

QUERIES_PATH = "call_to_retention/queries/"

# Query paths: one SQL file per input view, all under QUERIES_PATH.
ACCOUNT_PROMO_EXPIRY_LIST_QUERY_PATH = f"{QUERIES_PATH}create_input_account_promo_expiry_list_query.sql"
ACCOUNT_CONSL_QUERY_PATH = f"{QUERIES_PATH}create_input_account_consl_query.sql"
ACCOUNT_FFH_BILLING_QUERY_PATH = f"{QUERIES_PATH}create_input_account_ffh_billing_query.sql"
ACCOUNT_FFH_DISCOUNTS_QUERY_PATH = f"{QUERIES_PATH}create_input_account_ffh_discounts_query.sql"
ACCOUNT_HS_USAGE_QUERY_PATH = f"{QUERIES_PATH}create_input_account_hs_usage_query.sql"
ACCOUNT_DEMO_INCOME_QUERY_PATH = f"{QUERIES_PATH}create_input_account_demo_income_query.sql"
ACCOUNT_GPON_COPPER_QUERY_PATH = f"{QUERIES_PATH}create_input_account_gpon_copper_query.sql"
ACCOUNT_PRICE_PLAN_QUERY_PATH = f"{QUERIES_PATH}create_input_account_price_plan_query.sql"
ACCOUNT_CLCKSTRM_TELUS_QUERY_PATH = f"{QUERIES_PATH}create_input_account_clckstrm_telus_query.sql"
ACCOUNT_CALL_HISTORY_QUERY_PATH = f"{QUERIES_PATH}create_input_account_call_history_query.sql"


### Import Pipeline Components

In [None]:
# Mirror every object stored under the components prefix of RESOURCE_BUCKET
# into the local "components/" directory, then import it as a package.
prefix = "{}/{}/components/".format(STACK_NAME, TRAIN_PIPELINE_NAME_PATH)
dl_dir = "components/"

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with "prefix"
for blob in blobs:
    # Names ending in "/" are folder placeholders; there is nothing to download.
    if not blob.name.endswith("/"):
        # Local path = dl_dir + the part of the object name after the prefix.
        file_path = dl_dir + blob.name.split(prefix)[-1]
        # Make sure the parent directory exists before writing the file.
        directory = file_path.rpartition("/")[0]
        Path(directory).mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(file_path)

# import main pipeline components (the package that was just downloaded)
import components


### Date Parameters

In [None]:
# Scoring is anchored three days before today.
scoringDate = date.today() - relativedelta(days=3)

# current views (all names are prefixed with SERVICE_TYPE)
PROMO_EXPIRY_LIST_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_promo_expiry_list_data_curr_bi_layer'
CONSL_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_consl_data_curr_bi_layer'
FFH_BILLING_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_ffh_billing_data_curr_bi_layer'
FFH_DISCOUNTS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_ffh_discounts_data_curr_bi_layer'
HS_USAGE_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_hs_usage_data_curr_bi_layer'
DEMO_INCOME_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_demo_income_data_curr_bi_layer'
GPON_COPPER_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_gpon_copper_data_curr_bi_layer'
PRICE_PLAN_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_price_plan_data_curr_bi_layer'
# NOTE(review): unlike its siblings this name has no "_data" segment -- confirm intended.
CLCKSTRM_TELUS_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_clckstrm_telus_curr_bi_layer'
CALL_HISTORY_VIEW_NAME = f'{SERVICE_TYPE}_pipeline_call_history_data_curr_bi_layer'

# Anchors shared by the constants below.
_month_start = scoringDate.replace(day=1)            # first day of the scoring month
_last_month_end = _month_start - timedelta(days=1)   # last day of the previous month

# training dates
SCORE_DATE = scoringDate.strftime('%Y%m%d')  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = scoringDate.strftime('%Y-%m-%d')
SCORE_DATE_MINUS_6_MOS_DASH = (scoringDate - relativedelta(months=6)).replace(day=1).strftime('%Y-%m-%d')
# Formatted as a string like every other *_DASH constant (it used to be left
# as a date object; str() of a date yields the same YYYY-MM-DD text).
SCORE_DATE_THIS_MONTH_START_DASH = _month_start.strftime('%Y-%m-%d')
# First day of next month minus one day = last day of the scoring month.
SCORE_DATE_THIS_MONTH_END_DASH = (_month_start + relativedelta(months=1) - timedelta(days=1)).strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_START_DASH = _last_month_end.replace(day=1).strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_END_DASH = _last_month_end.strftime('%Y-%m-%d')
SCORE_DATE_LAST_MONTH_YEAR = _last_month_end.year
SCORE_DATE_LAST_MONTH_MONTH = _last_month_end.month

#revert these changes after 2023-05-30
# PROMO_EXPIRY_START = (scoringDate.replace(day=1) + relativedelta(months=3)).replace(day=1).strftime('%Y%m%d')
# PROMO_EXPIRY_END = (scoringDate.replace(day=1) + relativedelta(months=4)).replace(day=1).strftime('%Y%m%d')
# Promo-expiry window: first day of month +4 up to first day of month +5.
PROMO_EXPIRY_START = (_month_start + relativedelta(months=4)).strftime('%Y%m%d')
PROMO_EXPIRY_END = (_month_start + relativedelta(months=5)).strftime('%Y%m%d')

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


### 1.create_input_account_promo_expiry_list_view

In [None]:
def create_input_account_promo_expiry_list_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        dataset_id: str,
        project_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
        promo_expiry_start: str,
        promo_expiry_end: str
):
    """(Re)create the promo-expiry-list BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that holds the view (also passed to the template).
        project_id: Project of the BigQuery client (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.
        promo_expiry_start: Passed to the template as ``promo_expiry_start``.
        promo_expiry_end: Passed to the template as ``promo_expiry_end``.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id,
                                promo_expiry_start=promo_expiry_start,
                                promo_expiry_end=promo_expiry_end)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)

### 2.create_input_account_consl_view

In [None]:
def create_input_account_consl_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        project_id: str,
        dataset_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
):
    """(Re)create the account-consolidation BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client (also passed to the template).
        dataset_id: Dataset that holds the view (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)

### 3.create_input_account_ffh_billing_view

In [None]:
def create_input_account_ffh_billing_view(
        view_name: str,
        v_report_date: str,
        v_start_date: str,
        v_end_date: str,
        v_bill_year: str,
        v_bill_month: str,
        dataset_id: str,
        project_id: str,
        region: str,
        resource_bucket: str,
        query_path: str
):
    """(Re)create the FFH billing BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create.
        v_report_date: Passed to the template as ``v_report_date``.
        v_start_date: Passed to the template as ``v_start_date``.
        v_end_date: Passed to the template as ``v_end_date``.
        v_bill_year: Passed to the template as ``v_bill_year``.
        v_bill_month: Passed to the template as ``v_bill_month``.
        dataset_id: Dataset that holds the view.
        project_id: Project of the BigQuery client; also the positional
            argument of the second formatting pass.
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(v_report_date=v_report_date,
                                v_start_date=v_start_date,
                                v_end_date=v_end_date,
                                v_bill_year=v_bill_year,
                                v_bill_month=v_bill_month)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 4.create_input_account_ffh_discounts_view

In [None]:
def create_input_account_ffh_discounts_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        project_id: str,
        dataset_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
):
    """(Re)create the FFH discounts BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    once with ``str.format`` and installed as the view
    ``dataset_id.view_name`` in ``project_id``. An existing table or view with
    that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client (also passed to the template).
        dataset_id: Dataset that holds the view (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 5.create_input_account_hs_usage_view

In [None]:
def create_input_account_hs_usage_view(
        view_name: str,
        v_report_date: str,
        v_start_date: str,
        v_end_date: str,
        v_bill_year: str,
        v_bill_month: str,
        dataset_id: str,
        project_id: str,
        region: str,
        resource_bucket: str,
        query_path: str
):
    """(Re)create the HS usage BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create.
        v_report_date: Passed to the template as ``v_report_date``.
        v_start_date: Passed to the template as ``v_start_date``.
        v_end_date: Passed to the template as ``v_end_date``.
        v_bill_year: Passed to the template as ``v_bill_year``.
        v_bill_month: Passed to the template as ``v_bill_month``.
        dataset_id: Dataset that holds the view.
        project_id: Project of the BigQuery client; also the positional
            argument of the second formatting pass.
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(v_report_date=v_report_date,
                                v_start_date=v_start_date,
                                v_end_date=v_end_date,
                                v_bill_year=v_bill_year,
                                v_bill_month=v_bill_month)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 6.create_input_account_demo_income_view

In [None]:
def create_input_account_demo_income_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        dataset_id: str,
        project_id: str,
        region: str,
        resource_bucket: str,
        query_path: str
):
    """(Re)create the demographics/income BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that holds the view. It is NOT forwarded to the
            template, which always receives ``'common_dataset'`` instead.
        project_id: Project of the BigQuery client (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement. The template's
    # dataset placeholder is hard-wired to 'common_dataset'.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                project_id=project_id,
                                dataset_id='common_dataset')
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 7.create_input_account_gpon_copper_view

In [None]:
def create_input_account_gpon_copper_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        dataset_id: str,
        project_id: str,
        region: str,
        resource_bucket: str,
        query_path: str
):
    """(Re)create the GPON/copper BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create.
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        dataset_id: Dataset that holds the view.
        project_id: Project of the BigQuery client; also the positional
            argument of the second formatting pass.
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 8.create_input_account_price_plan_view

In [None]:
def create_input_account_price_plan_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        project_id: str,
        dataset_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
):
    """(Re)create the price-plan BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client (also passed to the template).
        dataset_id: Dataset that holds the view (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### 9.create_input_account_clckstrm_telus_view

In [None]:
def create_input_account_clckstrm_telus_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        project_id: str,
        dataset_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
):
    """(Re)create the TELUS clickstream BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client (also passed to the template).
        dataset_id: Dataset that holds the view (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)



### 10.create_input_account_call_history_view

In [None]:
def create_input_account_call_history_view(
        view_name: str,
        score_date: str,
        score_date_delta: str,
        project_id: str,
        dataset_id: str,
        region: str,
        resource_bucket: str,
        query_path: str,
):
    """(Re)create the call-history BigQuery view from a SQL template in GCS.

    The template stored at ``query_path`` in ``resource_bucket`` is rendered
    with ``str.format`` and installed as the view ``dataset_id.view_name`` in
    ``project_id``. An existing table or view with that name is dropped first.

    Args:
        view_name: Name of the view to create (also passed to the template).
        score_date: Passed to the template as ``score_date``.
        score_date_delta: Passed to the template as ``score_date_delta``.
        project_id: Project of the BigQuery client (also passed to the template).
        dataset_id: Dataset that holds the view (also passed to the template).
        region: Not used by this function.
        resource_bucket: GCS bucket that holds the SQL template.
        query_path: Object name of the SQL template inside the bucket.

    Raises:
        FileNotFoundError: If ``query_path`` does not exist in the bucket.
    """
    from google.cloud import bigquery
    from google.cloud import storage

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud CLI token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)
        # Use a context manager so the pipe to gcloud is closed, not leaked.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read()
        token = re.sub('\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    bq_client = get_gcp_bqclient(project_id)
    view_ref = bq_client.dataset(dataset_id).table(view_name)

    # Load the SQL template; get_blob() returns None when the object is missing.
    blob = storage.Client().get_bucket(resource_bucket).get_blob(query_path)
    if blob is None:
        raise FileNotFoundError(
            f"Query file gs://{resource_bucket}/{query_path} not found")
    content = str(blob.download_as_string(), 'utf-8')

    # Render the query before touching BigQuery so a broken template cannot
    # leave the old view deleted without a replacement.
    view_query = content.format(score_date=score_date,
                                score_date_delta=score_date_delta,
                                view_name=view_name,
                                dataset_id=dataset_id,
                                project_id=project_id)
    # NOTE(review): second, positional pass kept from the original. It only
    # changes the text if the template escaped a positional placeholder
    # (e.g. "{{}}") -- confirm against the .sql file.
    view_query = view_query.format(project_id)

    # Drop any previous table/view in a single call, then create the view.
    bq_client.delete_table(view_ref, not_found_ok=True)
    view = bigquery.Table(view_ref)
    view.view_query = view_query
    bq_client.create_table(view)


### Preprocess

In [None]:
def preprocess(
        promo_expiry_list_view: str,
        account_consl_view: str,
        account_bill_view: str,
        account_discounts_view: str,
        hs_usage_view: str,
        demo_income_view: str,
        gpon_copper_view: str,
        # price_plan_view: str,
        clckstrm_telus_view: str,
        call_history_view: str,
        save_data_path: str,
        project_id: str,
        dataset_id: str
):
    """Join the per-account feature views into one frame and save it as a gzipped CSV.

    Every view is read in full from ``{project_id}.{dataset_id}.{view}``,
    indexed by ``ban`` and left-joined onto the promo-expiry list, which is
    the base of the join. Column names get a per-source prefix.

    Args:
        promo_expiry_list_view: Base view; its rows seed the joined frame.
        account_consl_view: Product-mix source (prefix ``productMix_``).
        account_bill_view: Billing source (prefix ``ffhBill_``).
        account_discounts_view: Discounts source (prefix ``ffhDiscounts_``).
        hs_usage_view: Usage source (prefix ``hsiaUsage_``).
        demo_income_view: Demographics source, averaged per ``ban``
            (prefix ``demographics_``).
        gpon_copper_view: Infrastructure source (prefix ``infra_``).
        clckstrm_telus_view: Clickstream source (prefix ``clckstrmData_``).
        call_history_view: Call-history source (prefix ``callHistory_``).
        save_data_path: Destination handed to ``DataFrame.to_csv``.
        project_id: Project used for the BigQuery client and the view names.
        dataset_id: Dataset that holds the views.

    Raises:
        RuntimeError: If ``gcloud auth print-access-token`` yields no token.
    """
    from google.cloud import bigquery
    import pandas as pd
    import gc
    import time

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Build exactly one client. Constructing a default-credential client
        # first (and discarding it) needs Application Default Credentials
        # even though only the gcloud token is used afterwards.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = re.sub('\n$', '', pipe.read())
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "is gcloud installed and logged in?"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    def load_view(view_name):
        """Read every row of one view in the configured project/dataset."""
        table = f"{project_id}.{dataset_id}.{view_name}"
        return client.query(f"SELECT * FROM `{table}`").to_dataframe()

    def join_view(frame, view_name, prefix):
        """Left-join one ban-indexed, prefixed view onto frame and zero-fill all NaNs."""
        df_view = load_view(view_name).set_index('ban').add_prefix(prefix)
        return frame.join(df_view).fillna(0)

    #1.df_promo_expiry_list (base of every join)
    df_join = load_view(promo_expiry_list_view).set_index('ban')
    print('......df_promo_expiry_list done')

    #2.df_consl
    df_consl = load_view(account_consl_view)
    df_mix = df_consl[[
        'ban',
        'customer_tenure',
        'product_mix_all',
        'hsic_count',
        'ttv_count',
        'sing_count',
        'mob_count',
        'shs_count',
        'new_hsic_ind',
        'new_ttv_ind',
        'new_sing_ind',
        'new_c_ind',
        'new_smhm_ind',
        'mnh_ind'
    ]]
    df_mix = df_mix.drop_duplicates(subset=['ban']).set_index('ban').add_prefix('productMix_').fillna(0)
    # No frame-wide fillna here; the next join's zero-fill covers unmatched bans.
    df_join = df_join.join(df_mix)
    del df_consl, df_mix
    gc.collect()
    print('......df_consl done')

    #3.df_bill
    df_join = join_view(df_join, account_bill_view, 'ffhBill_')
    gc.collect()
    print('......df_bill done')

    #4.df_discounts
    df_join = join_view(df_join, account_discounts_view, 'ffhDiscounts_')
    gc.collect()
    print('......df_discounts done')

    #5.df_hs_usage
    df_join = join_view(df_join, hs_usage_view, 'hsiaUsage_')
    gc.collect()
    print('......df_hs_usage done')

    #6.df_income
    df_income = load_view(demo_income_view).set_index('ban')
    df_income['demo_urban_flag'] = df_income.demo_sgname.str.lower().str.contains('urban').fillna(0).astype(int)
    df_income['demo_rural_flag'] = df_income.demo_sgname.str.lower().str.contains('rural').fillna(0).astype(int)
    df_income['demo_family_flag'] = df_income.demo_lsname.str.lower().str.contains('families').fillna(0).astype(int)
    # One-hot encode demo_lsname and make the generated column names identifier-friendly.
    df_income_dummies = pd.get_dummies(df_income[['demo_lsname']])
    df_income_dummies.columns = df_income_dummies.columns.str.replace('&', 'and')
    df_income_dummies.columns = df_income_dummies.columns.str.replace(' ', '_')
    df_income = df_income[['demo_avg_income', 'demo_urban_flag', 'demo_rural_flag', 'demo_family_flag']].join(
        df_income_dummies)
    df_income.demo_avg_income = df_income.demo_avg_income.astype(float)
    df_income.demo_avg_income = df_income.demo_avg_income.fillna(df_income.demo_avg_income.median())
    # Collapse to one row per ban by averaging, then impute remaining gaps with column medians.
    df_group_income = df_income.groupby('ban').agg('mean')
    df_group_income = df_group_income.add_prefix('demographics_')
    df_join = df_join.join(df_group_income.fillna(df_group_income.median()))
    del df_group_income
    del df_income
    gc.collect()
    print('......df_income done')

    #7.df_gpon_copper
    # The zero-fill in this join also covers bans that had no demographics row.
    df_join = join_view(df_join, gpon_copper_view, 'infra_')
    gc.collect()
    print('......df_gpon_copper done')

    #8.df_price_plan is disabled (see the commented-out price_plan_view parameter).

    #9.df_clckstrm_telus
    df_join = join_view(df_join, clckstrm_telus_view, 'clckstrmData_')
    gc.collect()
    print('......df_clckstrm_telus done')

    #10.df_call_history
    df_call_history = load_view(call_history_view).set_index('ban').add_prefix('callHistory_')
    df_join = df_join.join(df_call_history)
    # Bans with no call history: zero frequency / never called, recency sentinel of 999.
    df_join[['callHistory_frequency', 'callHistory_have_called']] = df_join[['callHistory_frequency', 'callHistory_have_called']].fillna(0)
    df_join[['callHistory_recency']] = df_join[['callHistory_recency']].fillna(999)
    del df_call_history
    gc.collect()
    print('......df_call_history done')

    #column name clean-up
    df_join.columns = df_join.columns.str.replace(' ', '_')
    df_join.columns = df_join.columns.str.replace('-', '_')

    #df_final
    df_final = df_join
    del df_join
    print('......df_final done')

    # Rebuild every column from a plain Python list before writing.
    # NOTE(review): presumably this normalises BigQuery/extension dtypes for to_csv - confirm.
    for f in df_final.columns:
        df_final[f] = list(df_final[f])

    df_final.to_csv(save_data_path, index=True, compression='gzip')
    del df_final
    gc.collect()
    print(f'......csv saved in {save_data_path}')
    # NOTE(review): fixed two-minute pause kept from the original; its purpose is not evident here.
    time.sleep(120)


### Batch Prediction

In [None]:
def batch_prediction(
        project_id: str,
        dataset_id: str,
        file_bucket: str,
        service_type: str,
        score_table: str,
        score_date_dash: str
):
    """Score the preprocessed accounts with the saved XGBoost model and upsert the scores.

    Reads ``gs://{file_bucket}/{service_type}_score.csv.gz``, loads the last
    model blob listed under ``{service_type}_xgb_models/``, writes all scores
    to ``gs://{file_bucket}/ucar/{service_type}_prediction.csv.gz`` and MERGEs
    them into ``{project_id}.{dataset_id}.{score_table}`` in batches of 1000
    rows, keyed on ``ban`` and ``score_date``.

    Args:
        project_id: Project used for BigQuery.
        dataset_id: Dataset that holds ``score_table``.
        file_bucket: Bucket holding the scoring data, the models and the output.
        service_type: Prefix used in the data, model and output object names.
        score_table: Target table of the MERGE.
        score_date_dash: Value written to the ``score_date`` column.

    Raises:
        FileNotFoundError: If no model blob exists under the expected prefix.
        RuntimeError: If ``gcloud auth print-access-token`` yields no token.
    """
    import time
    import pandas as pd
    import pickle
    from google.cloud import bigquery
    from google.cloud import storage

    MODEL_ID = '5090'

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Build exactly one client. Constructing a default-credential client
        # first (and discarding it) needs Application Default Credentials
        # even though only the gcloud token is used afterwards.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = re.sub('\n$', '', pipe.read())
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "is gcloud installed and logged in?"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    def row_format(row):
        """Render one result row as a parenthesised SQL tuple literal."""
        rendered = []
        for value in row.values:
            text = str(value) if not pd.isnull(value) else 'NULL'
            # Strings are single-quoted; everything else (numbers, NULL) is emitted bare.
            rendered.append(f"'{text}'" if isinstance(value, str) else text)
        return '(' + ','.join(rendered) + ')'

    def generate_sql_file(ll):
        """Build the MERGE template for the (column, type) pairs in ll.

        The returned string still contains the ``{proj_id}``, ``{dataset_id}``,
        ``{table_id}`` and ``{new_values}`` placeholders.
        """
        names = [name for name, _ in ll]
        struct_fields = ','.join("{} {}".format(name, sql_type) for name, sql_type in ll)
        update_pairs = ','.join("a.{}=b.{}".format(name, name) for name in names)
        insert_columns = ','.join(names)
        insert_values = ','.join("b.{}".format(name) for name in names)
        return ''.join([
            'MERGE INTO `{proj_id}.{dataset_id}.{table_id}` a',
            " USING UNNEST(",
            "[struct<", struct_fields, ">{new_values}]",
            ") b",
            " ON a.ban = b.ban and a.score_date = b.score_date",
            " WHEN MATCHED THEN",
            " UPDATE SET ", update_pairs,
            " WHEN NOT MATCHED THEN",
            " INSERT(", insert_columns, ")",
            " VALUES(", insert_values, ")",
        ])

    def upsert_table(project_id, dataset_id, table_id, sql, result):
        """MERGE the rows of result into the target table and wait for the job."""
        new_values = ',\n'.join(result.apply(row_format, axis=1))
        new_sql = sql.format(proj_id=project_id, dataset_id=dataset_id, table_id=table_id,
                             new_values=new_values)

        # A fresh client (and access token) per call, as in the original.
        # NOTE(review): presumably this keeps long runs from outliving one token - confirm.
        bq_client = get_gcp_bqclient(project_id)

        # Block until the MERGE finishes so a failed upsert raises here
        # instead of being silently lost behind a fixed sleep.
        bq_client.query(new_sql).result()

    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    df_score = pd.read_csv('gs://{}/{}_score.csv.gz'.format(file_bucket, service_type), compression='gzip')
    df_score.dropna(subset=['ban'], inplace=True)
    df_score.reset_index(drop=True, inplace=True)
    print('......scoring data loaded:{}'.format(df_score.shape))
    # NOTE(review): fixed pauses below are kept from the original; their purpose is not evident here.
    time.sleep(10)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    model_prefix = '{}{}_models_xgb_'.format(MODEL_PATH, service_type)
    model_lists = [blob.name for blob in storage_client.list_blobs(file_bucket, prefix=model_prefix)]
    if not model_lists:
        raise FileNotFoundError(
            'no model found under gs://{}/{}'.format(file_bucket, model_prefix)
        )

    # The last listed blob is used as the model to score with.
    blob = bucket.blob(model_lists[-1])
    blob_in = blob.download_as_string()
    # SECURITY: pickle.loads executes arbitrary code; only load models from a trusted bucket.
    model_dict = pickle.loads(blob_in)
    model_xgb = model_dict['model']
    features = model_dict['features']
    print('...... model loaded')
    time.sleep(10)

    ll = [('ban', 'string'), ('score_date', 'string'), ('model_id', 'string'), ('score', 'float64')]
    sql = generate_sql_file(ll)

    df_score['ban'] = df_score['ban'].astype(int)
    print('.... scoring for {} promo expiry bans base'.format(len(df_score)))

    # get full score to save into bucket
    # NOTE(review): `ntree_limit` is deprecated in recent XGBoost releases - confirm the pinned version.
    pred_prob = model_xgb.predict_proba(df_score[features], ntree_limit=model_xgb.best_iteration)[:, 1]
    result = pd.DataFrame(columns=['ban', 'score_date', 'model_id', 'score'])
    result['score'] = list(pred_prob)
    result['score'] = result['score'].fillna(0.0).astype('float64')
    result['ban'] = list(df_score['ban'])
    result['ban'] = result['ban'].astype('str')
    result['score_date'] = score_date_dash
    result['model_id'] = MODEL_ID

    result.to_csv('gs://{}/ucar/{}_prediction.csv.gz'.format(file_bucket, service_type), compression='gzip',
                  index=False)
    time.sleep(60)

    batch_size = 1000
    n_rows = len(result)
    # Ceiling division: an exact multiple of batch_size must not yield an empty extra batch.
    n_batchs = (n_rows + batch_size - 1) // batch_size
    print('...... will upsert {} batches'.format(n_batchs))

    # upsert the already-computed scores batch by batch
    for i, start in enumerate(range(0, n_rows, batch_size)):
        batch_result = result.iloc[start:start + batch_size]
        upsert_table(project_id,
                     dataset_id,
                     score_table,
                     sql,
                     batch_result,
                     )
        if i % 20 == 0:
            print('predict for batch {} done'.format(i), end=' ')

    time.sleep(120)
    
    

### Post Process

In [None]:
def postprocess(
        project_id: str,
        file_bucket: str,
        service_type: str,
        score_date_dash: str,
):
    """Reshape the prediction file into the delivery layout and attach a customer id.

    Reads ``gs://{file_bucket}/ucar/{service_type}_prediction.csv.gz``, renames
    and adds the delivery columns (decile, percentile, model metadata, empty
    placeholder columns), left-joins one ``cust_id`` per account from the
    enterprise product-instance snapshot, sorts by score descending and
    overwrites the same file.

    Args:
        project_id: Project used for the BigQuery client.
        file_bucket: Bucket that holds the prediction file.
        service_type: Prefix of the prediction file name.
        score_date_dash: Value written to the ``scoring_date`` column.

    Raises:
        RuntimeError: If ``gcloud auth print-access-token`` yields no token.
    """
    import time
    import pandas as pd
    from google.cloud import bigquery

    MODEL_ID = '5090'
    file_name = 'gs://{}/ucar/{}_prediction.csv.gz'.format(file_bucket, service_type)
    df_orig = pd.read_csv(file_name, compression='gzip')
    df_orig.dropna(subset=['ban'], inplace=True)
    df_orig.reset_index(drop=True, inplace=True)
    df_orig['scoring_date'] = score_date_dash
    df_orig['ban'] = df_orig['ban'].astype(int)
    df_orig = df_orig.rename(columns={'ban': 'bus_bacct_num', 'score': 'score_num'})
    df_orig['score_num'] = df_orig['score_num'].astype(float)
    # Labels run 10..1, so the highest-scoring bin is decile 1.
    # NOTE(review): pd.qcut raises if the bin edges are not unique (many tied scores).
    df_orig['decile_grp_num'] = pd.qcut(df_orig['score_num'], q=10, labels=[i for i in range(10, 0, -1)])
    df_orig['percentile_pct'] = df_orig.score_num.rank(pct=True)
    df_orig['predict_model_nm'] = 'FFH Call To Retention Model - DIVG'
    df_orig['model_type_cd'] = 'FFH'
    # Placeholder columns that are written empty.
    for blank_column in ('subscriber_no', 'prod_instnc_resrc_str', 'service_instnc_id',
                         'segment_nm', 'segment_id', 'classn_nm'):
        df_orig[blank_column] = ""
    df_orig['predict_model_id'] = MODEL_ID
    df_orig = df_orig.drop(columns=['model_id', 'score_date'])

    get_cust_id = """
    WITH bq_snpsht_max_date AS(
    SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS max_date
        FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.INFORMATION_SCHEMA.PARTITIONS` 
    WHERE table_name = 'bq_prod_instnc_snpsht' 
        AND partition_id <> '__NULL__'
    ),
    -- BANs can have multiple Cust ID. Create rank by product type and status, prioritizing ban/cust id with active FFH products
    rank_prod_type AS (
    SELECT DISTINCT
        bacct_bus_bacct_num,
        consldt_cust_bus_cust_id AS cust_id,
        CASE WHEN pi_prod_instnc_resrc_typ_cd IN ('SING', 'HSIC', 'TTV', 'SMHM', 'STV', 'DIIC') AND pi_prod_instnc_stat_cd = 'A' THEN 1
                WHEN pi_prod_instnc_resrc_typ_cd IN ('SING', 'HSIC', 'TTV', 'SMHM', 'STV', 'DIIC') THEN 2
                WHEN pi_prod_instnc_stat_cd = 'A' THEN 3
                ELSE 4
                END AS prod_rank
    FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht`
    CROSS JOIN bq_snpsht_max_date
    WHERE CAST(prod_instnc_ts AS DATE)=bq_snpsht_max_date.max_date
    AND bus_prod_instnc_src_id = 1001
    ),
    --Rank Cust ID
    rank_cust_id AS (
    SELECT DISTINCT
        bacct_bus_bacct_num,
        cust_id,
        RANK() OVER(PARTITION BY bacct_bus_bacct_num
                        ORDER BY prod_rank,
                                    cust_id) AS cust_id_rank               
    FROM rank_prod_type
    )
    --Select best cust id
    SELECT bacct_bus_bacct_num,
        cust_id
    FROM rank_cust_id
    WHERE cust_id_rank = 1
    """

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Return a BigQuery client, by default authenticated with the gcloud token."""
        if not use_local_credential:
            return bigquery.Client(project=project_id)

        # Build exactly one client. Constructing a default-credential client
        # first (and discarding it) needs Application Default Credentials
        # even though only the gcloud token is used afterwards.
        with os.popen('gcloud auth print-access-token') as pipe:
            token = re.sub('\n$', '', pipe.read())
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "is gcloud installed and logged in?"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    df_cust = client.query(get_cust_id).to_dataframe()
    # Left join on the account number: scored accounts without a match keep an empty cust_id.
    df_final = df_orig.set_index('bus_bacct_num').join(df_cust.set_index('bacct_bus_bacct_num')).reset_index()
    # A no-op when reset_index restores 'bus_bacct_num' and the query returns 'cust_id';
    # kept as a safeguard for the alternative column names.
    df_final = df_final.rename(columns={'index': 'bus_bacct_num', 'cust_bus_cust_id': 'cust_id'})
    df_final = df_final.sort_values(by=['score_num'], ascending=False)
    df_final.to_csv(file_name, compression='gzip', index=False)
    # NOTE(review): fixed five-minute pause kept from the original; its purpose is not evident here.
    time.sleep(300)


### Pipeline

In [None]:

# @dsl.pipeline(
#     # A name for the pipeline.
#     name="{}-xgb-pipeline".format(SERVICE_TYPE_NAME),
#     description=' pipeline for training {} model'.format(SERVICE_TYPE_NAME)
# )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET,
        file_bucket: str = FILE_BUCKET
    ):
    """Run the scoring flow: build feature views, preprocess, predict, post-process.

    The steps are called one after another in the order written. With the
    ``@dsl.pipeline`` decorator commented out this is an ordinary function,
    assuming the step functions are plain Python callables (as the ones
    visible in this notebook are).

    Args:
        project_id: Project passed to every step.
        region: Region passed to the view-creation steps.
        resource_bucket: Bucket from which the view-creation steps load their queries.
        file_bucket: Bucket used for the scoring data and the prediction output.
    """
    # Keyword arguments shared by every view-creation step.
    view_target = dict(
        dataset_id=DATASET_ID,
        project_id=project_id,
        region=region,
        resource_bucket=resource_bucket,
    )
    score_window = dict(
        score_date=SCORE_DATE,
        score_date_delta=SCORE_DATE_DELTA,
    )
    billing_window = dict(
        v_report_date=SCORE_DATE_DASH,
        v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
        v_end_date=SCORE_DATE_LAST_MONTH_END_DASH,
        v_bill_year=SCORE_DATE_LAST_MONTH_YEAR,
        v_bill_month=SCORE_DATE_LAST_MONTH_MONTH,
    )

    # ------------- feature view steps ---------------
    #1.create_input_account_promo_expiry_list_view
    create_input_account_promo_expiry_list_view(
        view_name=PROMO_EXPIRY_LIST_VIEW_NAME,
        promo_expiry_start=PROMO_EXPIRY_START,
        promo_expiry_end=PROMO_EXPIRY_END,
        query_path=ACCOUNT_PROMO_EXPIRY_LIST_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #2.create_input_account_consl_view
    create_input_account_consl_view(
        view_name=CONSL_VIEW_NAME,
        query_path=ACCOUNT_CONSL_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #3.create_input_account_ffh_billing_view
    create_input_account_ffh_billing_view(
        view_name=FFH_BILLING_VIEW_NAME,
        query_path=ACCOUNT_FFH_BILLING_QUERY_PATH,
        **billing_window,
        **view_target,
    )

    #4.create_input_account_ffh_discounts_view
    create_input_account_ffh_discounts_view(
        view_name=FFH_DISCOUNTS_VIEW_NAME,
        query_path=ACCOUNT_FFH_DISCOUNTS_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #5.create_input_account_hs_usage_view
    create_input_account_hs_usage_view(
        view_name=HS_USAGE_VIEW_NAME,
        query_path=ACCOUNT_HS_USAGE_QUERY_PATH,
        **billing_window,
        **view_target,
    )

    #6.create_input_account_demo_income_view
    create_input_account_demo_income_view(
        view_name=DEMO_INCOME_VIEW_NAME,
        query_path=ACCOUNT_DEMO_INCOME_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #7.create_input_account_gpon_copper_view
    create_input_account_gpon_copper_view(
        view_name=GPON_COPPER_VIEW_NAME,
        query_path=ACCOUNT_GPON_COPPER_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #8.create_input_account_price_plan_view is disabled, together with the
    #  matching price_plan_view input of preprocess.

    #9.create_input_account_clckstrm_telus_view
    create_input_account_clckstrm_telus_view(
        view_name=CLCKSTRM_TELUS_VIEW_NAME,
        query_path=ACCOUNT_CLCKSTRM_TELUS_QUERY_PATH,
        **score_window,
        **view_target,
    )

    #10.create_input_account_call_history_view
    create_input_account_call_history_view(
        view_name=CALL_HISTORY_VIEW_NAME,
        query_path=ACCOUNT_CALL_HISTORY_QUERY_PATH,
        **score_window,
        **view_target,
    )

    # ----- preprocessing scoring data --------
    preprocess(
        promo_expiry_list_view=PROMO_EXPIRY_LIST_VIEW_NAME,
        account_consl_view=CONSL_VIEW_NAME,
        account_bill_view=FFH_BILLING_VIEW_NAME,
        account_discounts_view=FFH_DISCOUNTS_VIEW_NAME,
        hs_usage_view=HS_USAGE_VIEW_NAME,
        demo_income_view=DEMO_INCOME_VIEW_NAME,
        gpon_copper_view=GPON_COPPER_VIEW_NAME,
        # price_plan_view=PRICE_PLAN_VIEW_NAME,
        clckstrm_telus_view=CLCKSTRM_TELUS_VIEW_NAME,
        call_history_view=CALL_HISTORY_VIEW_NAME,
        # Written to file_bucket because batch_prediction reads
        # gs://{file_bucket}/{service_type}_score.csv.gz.
        save_data_path='gs://{}/{}_score.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID
    )

    # ----- scoring and upsert --------
    batch_prediction(
        project_id=project_id,
        dataset_id=DATASET_ID,
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
        score_table='bq_call_to_retention_scores',
    )

    # ----- delivery formatting --------
    postprocess(
        project_id=project_id,
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
    )


### Run the Pipeline Job

In [None]:
# pipeline(project_id = PROJECT_ID,
#         region = REGION,
#         resource_bucket = RESOURCE_BUCKET,
#         file_bucket = FILE_BUCKET)


# Execute the scoring flow with the notebook-level configuration.
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
    file_bucket=FILE_BUCKET,
)

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs

# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                display_name=PIPELINE_NAME,
#                                template_path="pipeline.json",
#                                location=REGION,
#                                enable_caching=False,
#                                pipeline_root = f"gs://{RESOURCE_BUCKET}"
# )
# job.run(
#    service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com"
# )