### Import Libraries

In [None]:
# import required libraries
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output, OutputPath, ClassificationMetrics,
                        Metrics, component)
import os
import re
from pathlib import Path

from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from google_cloud_pipeline_components.v1.batch_predict_job import \
    ModelBatchPredictOp as batch_prediction_op


### Parameters

In [None]:
# Tag this cell with "parameters".
# Empty placeholders for the run-time settings; the cells below assign the
# concrete values.
PROJECT_ID = ''
BUCKET_NAME = ''
DATASET_ID = ''
RESOURCE_BUCKET = ''
FILE_BUCKET = ''
REGION = ''
MODEL_ID = '5090'

In [None]:
# Tag this cell with "parameters".
# Concrete project settings; all three bucket names point at the same
# project-default bucket.
PROJECT_ID = 'divg-josh-pr-d1cc3a'
BUCKET_NAME = 'divg-josh-pr-d1cc3a-default'
DATASET_ID = 'call_to_retention_dataset'
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'
MODEL_ID = '5090'

### Service Parameters

In [None]:
# Service identifiers: underscore form for file/table names, hyphen form for
# display names.
SERVICE_TYPE = 'call_to_retention'
SERVICE_TYPE_NAME = 'call-to-retention'
TABLE_ID = 'bq_call_to_retention_targets'
REGION = 'northamerica-northeast1'

### Pulumi Parameters

In [None]:
# Stack name and the per-pipeline sub-paths used to build resource locations.
STACK_NAME = 'call_to_retention'
TRAIN_PIPELINE_NAME_PATH = 'train_pipeline'
PREDICT_PIPELINE_NAME_PATH = 'predict_pipeline'

# Pipeline names must stay the same as in pulumi.yaml.
TRAIN_PIPELINE_NAME = 'call-to-retention-train-pipeline'
PREDICT_PIPELINE_NAME = 'call-to-retention-predict-pipeline'
TRAIN_PIPELINE_DESCRIPTION = 'call-to-retention-train-pipeline'
PREDICT_PIPELINE_DESCRIPTION = 'call-to-retention-predict-pipeline'

REGION = 'northamerica-northeast1'

### Query + Pre-Processing Component Parameters

In [None]:
# Locations of the shared query and utility files under the train pipeline path.
TRAIN_QUERIES_PATH = '{}/{}/queries/'.format(STACK_NAME, TRAIN_PIPELINE_NAME_PATH)
TRAIN_UTILS_FILE_PATH = '{}/{}/utils'.format(STACK_NAME, TRAIN_PIPELINE_NAME_PATH)
UTILS_FILENAME = 'utils.py'

# Table names for the serving data.
PROCESSED_SERVING_DATA_TABLENAME = 'processed_serving_data'
INPUT_SERVING_DATA_TABLENAME = 'input_serving_data'

# Yesterday's date (YYYY-MM-DD) and the fully-qualified targets table.
QUERY_DATE = f'{date.today() - timedelta(days=1):%Y-%m-%d}'
TARGET_TABLE_REF = f'{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}'

QUERIES_PATH = 'call_to_retention/queries/'


### Import Pipeline Components

In [None]:
# Download the pipeline component files from the resource bucket into a local
# "components/" package so they can be imported below.
# NOTE(review): the files are taken from the *train* pipeline path even though
# this notebook scores; confirm the components are meant to be shared.
prefix = f'{STACK_NAME}/{TRAIN_PIPELINE_NAME_PATH}/components/'
dl_dir = 'components/'

storage_client = storage.Client()
bucket = storage_client.bucket(RESOURCE_BUCKET)
blobs = bucket.list_blobs(prefix=prefix)  # every object whose name starts with "prefix"
for blob in blobs:
    if blob.name.endswith("/"):
        # Folder placeholder object: nothing to download.
        continue
    # Strip only the leading prefix. Splitting on the prefix would return the
    # wrong tail if the same text occurred again later in the object name.
    relative_name = blob.name[len(prefix):]
    file_path = Path(dl_dir) / relative_name
    file_path.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(file_path))

# import main pipeline components
import components


### Date Parameters

In [None]:
# The scoring date is three days before the run date.
scoringDate = date.today() + relativedelta(days=-3)

# Date strings derived from the scoring date. relativedelta's absolute
# ``day=1`` snaps to the first of the month after the month shift is applied.
SCORE_DATE = f'{scoringDate:%Y%m%d}'  # date.today().strftime('%Y%m%d')
SCORE_DATE_DASH = f'{scoringDate:%Y-%m-%d}'
# First day of the month six months before the scoring date.
SCORE_DATE_MINUS_6_MOS_DASH = f'{scoringDate + relativedelta(months=-6, day=1):%Y-%m-%d}'
# First and last day of the month preceding the scoring date.
SCORE_DATE_LAST_MONTH_START_DASH = f'{scoringDate + relativedelta(months=-1, day=1):%Y-%m-%d}'
SCORE_DATE_LAST_MONTH_END_DASH = f'{scoringDate + relativedelta(day=1, days=-1):%Y-%m-%d}'

# Promo-expiry window: first day of the 4th and 5th month after the scoring month.
# NOTE(review): the original comment said "revert these changes after
# 2023-05-30" -- confirm this window is still the intended one.
PROMO_EXPIRY_START = f'{scoringDate + relativedelta(months=4, day=1):%Y-%m-%d}'
PROMO_EXPIRY_END = f'{scoringDate + relativedelta(months=5, day=1):%Y-%m-%d}'

SCORE_DATE_DELTA = 0
SCORE_DATE_VAL_DELTA = 0
TICKET_DATE_WINDOW = 30  # Days of ticket data to be queried


In [None]:
# Bare expression: shows the computed promo-expiry window start as the cell output.
PROMO_EXPIRY_START

### bq_create_dataset.py

In [None]:
# import kfp
# from kfp import dsl
# # from kfp.v2.dsl import (Model, Input, component)
# from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,HTML,
#                         OutputPath, ClassificationMetrics, Metrics, component)
# from typing import NamedTuple
# # Create Training Dataset for training pipeline
# @component(
#     base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/pipelines/kubeflow-pycaret:latest",
#     output_component_file="bq_create_dataset.yaml",
# )
def bq_create_dataset(score_date: str,
                      score_date_delta: int,
                      project_id: str,
                      dataset_id: str,
                      region: str,
                      promo_expiry_start: str,
                      promo_expiry_end: str,
                      v_start_date: str,
                      v_end_date: str):
    """Rebuild the scoring dataset in BigQuery and return its column names.

    Calls the stored procedure ``{dataset_id}.bq_sp_ctr_pipeline_dataset`` with
    the supplied dates, logs the partition metadata of
    ``bq_ctr_pipeline_dataset`` and then reads that table's column names.

    Args:
        score_date: Scoring date, substituted into a SQL ``DATE`` declaration.
        score_date_delta: Not used by this function; kept for interface
            compatibility.
        project_id: GCP project the BigQuery client is created for.
        dataset_id: Dataset holding the stored procedure and the output table.
        region: Not used by this function; kept for interface compatibility.
        promo_expiry_start: Start of the promo-expiry window (SQL ``DATE``).
        promo_expiry_end: End of the promo-expiry window (SQL ``DATE``).
        v_start_date: Start date passed to the stored procedure.
        v_end_date: End date passed to the stored procedure.

    Returns:
        A one-element tuple ``(col_list,)`` where ``col_list`` is the list of
        column names of ``{dataset_id}.bq_ctr_pipeline_dataset``.

    Raises:
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    import logging
    import os

    import google.oauth2.credentials
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default using the local gcloud user token."""
        if not use_local_credential:
            # Application Default Credentials are resolved only when requested,
            # so a missing ADC setup no longer breaks the token-based path.
            return bigquery.Client(project=project_id)
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "authenticate with gcloud first"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    # Change dataset / table + sp table name to version in bi-layer
    query =\
        f'''
            DECLARE score_date DATE DEFAULT "{score_date}";
            DECLARE promo_expiry_start DATE DEFAULT "{promo_expiry_start}";
            DECLARE promo_expiry_end DATE DEFAULT "{promo_expiry_end}";
            DECLARE start_date DATE DEFAULT "{v_start_date}";
            DECLARE end_date DATE DEFAULT "{v_end_date}";

            -- Change dataset / sp name to the version in the bi_layer
            CALL {dataset_id}.bq_sp_ctr_pipeline_dataset(score_date, promo_expiry_start, promo_expiry_end, start_date, end_date);

            SELECT
                *
            FROM {dataset_id}.INFORMATION_SCHEMA.PARTITIONS
            WHERE table_name='bq_ctr_pipeline_dataset'

        '''

    df = client.query(query).to_dataframe()
    logging.info(df.to_string())

    if df.empty:
        # Without this guard the summary below would fail on an empty result.
        logging.warning(
            "No partition metadata returned for %s.bq_ctr_pipeline_dataset",
            dataset_id,
        )
    else:
        # Summarise the first metadata row, as the original log line did.
        first = df.iloc[0]
        logging.info(
            "Loaded %s rows into %s.%s.%s on %s !",
            first.total_rows,
            first.table_catalog,
            first.table_schema,
            first.table_name,
            first.last_modified_time.strftime('%Y-%m-%d %H:%M:%S'),
        )

    # Only the column names are needed, so LIMIT 0 avoids downloading every
    # row of the dataset just to inspect the header.
    query =\
        f'''
           SELECT
                *
            FROM {dataset_id}.bq_ctr_pipeline_dataset
            LIMIT 0
        '''

    col_list = list(client.query(query).to_dataframe().columns)
    return (col_list,)
    

### Preprocess

In [None]:
def preprocess(
        pipeline_dataset: str,
        save_data_path: str,
        project_id: str,
        dataset_id: str
):
    """Load the pipeline dataset from BigQuery, derive features and save a gzip CSV.

    The table ``{project_id}.{dataset_id}.{pipeline_dataset}`` is read in full
    and indexed by ``ban``. Three 0/1 flags are derived from ``demo_sgname`` /
    ``demo_lsname``, ``demo_lsname`` is one-hot encoded, both source columns
    are dropped, and spaces and hyphens in column names are replaced by
    underscores. The result is written to ``save_data_path`` with the index
    included.

    Args:
        pipeline_dataset: Table name inside ``dataset_id``.
        save_data_path: Destination path of the gzip-compressed CSV.
        project_id: GCP project of the table and the BigQuery client.
        dataset_id: BigQuery dataset containing the table.

    Raises:
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    import gc
    import os
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default using the local gcloud user token."""
        if not use_local_credential:
            # Application Default Credentials are resolved only when requested.
            return bigquery.Client(project=project_id)
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "authenticate with gcloud first"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    # Read the full pipeline dataset, keyed by account number.
    pipeline_dataset_name = f"{project_id}.{dataset_id}.{pipeline_dataset}"
    df_final = client.query(f'SELECT * FROM `{pipeline_dataset_name}`').to_dataframe()
    df_final = df_final.set_index('ban')

    # Demographic flags: 1 when the keyword occurs in the name, 0 otherwise
    # (missing names count as 0).
    segment_name = df_final.demo_sgname.str.lower()
    lifestage_name = df_final.demo_lsname.str.lower()
    df_final['demo_urban_flag'] = segment_name.str.contains('urban').fillna(0).astype(int)
    df_final['demo_rural_flag'] = segment_name.str.contains('rural').fillna(0).astype(int)
    df_final['demo_family_flag'] = lifestage_name.str.contains('families').fillna(0).astype(int)

    # One-hot encode demo_lsname, with SQL/CSV-friendly column names.
    df_income_dummies = pd.get_dummies(df_final[['demo_lsname']])
    df_income_dummies.columns = df_income_dummies.columns.str.replace('&', 'and')
    df_income_dummies.columns = df_income_dummies.columns.str.replace(' ', '_')

    df_final.drop(columns=['demo_sgname', 'demo_lsname'], inplace=True)
    # Joined in place of the original chain of full-frame .copy() calls, which
    # kept several copies of the data alive at once.
    df_final = df_final.join(df_income_dummies)
    del df_income_dummies
    gc.collect()

    # column name clean-up
    df_final.columns = df_final.columns.str.replace(' ', '_')
    df_final.columns = df_final.columns.str.replace('-', '_')
    print('......df_final done')

    # Re-assigning each column from a plain list makes pandas re-infer its dtype.
    # NOTE(review): kept from the original, presumably to normalise BigQuery
    # extension dtypes before writing -- confirm it is still needed.
    for f in df_final.columns:
        df_final[f] = list(df_final[f])

    df_final.to_csv(save_data_path, index=True, compression='gzip')
    del df_final
    gc.collect()
    print(f'......csv saved in {save_data_path}')
    # NOTE(review): fixed two-minute pause kept from the original; nothing in
    # this function depends on it.
    time.sleep(120)


### Batch Prediction

In [None]:
def batch_prediction(
        project_id: str,
        dataset_id: str,
        file_bucket: str,
        service_type: str,
        score_table: str,
        score_date_dash: str
):
    """Score the serving data with the stored XGBoost model and upsert the scores.

    Steps:
      1. Read ``gs://{file_bucket}/{service_type}_score.csv.gz`` and drop rows
         without a ``ban``.
      2. Load the pickled model dict (keys ``model`` and ``features``) from the
         last blob listed under ``{service_type}_xgb_models/``.
      3. Predict the positive-class probability for every row and write the
         full result to ``gs://{file_bucket}/ucar/{service_type}_prediction.csv.gz``.
      4. MERGE the result into ``{project_id}.{dataset_id}.{score_table}`` in
         batches of 1000 rows, matching on ``ban`` and ``score_date``.

    Args:
        project_id: GCP project of the score table.
        dataset_id: BigQuery dataset of the score table.
        file_bucket: GCS bucket holding the scoring CSV and the model files.
        service_type: Prefix used in the CSV and model object names.
        score_table: Name of the table the scores are merged into.
        score_date_dash: Value written to the ``score_date`` column.

    Raises:
        FileNotFoundError: If no model blob exists under the model prefix.
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    import os
    import pickle
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery
    from google.cloud import storage

    MODEL_ID = '5090'
    BATCH_SIZE = 1000

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default using the local gcloud user token."""
        if not use_local_credential:
            # Application Default Credentials are resolved only when requested.
            return bigquery.Client(project=project_id)
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "authenticate with gcloud first"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    def row_format(row):
        """Render one row as a SQL tuple: strings quoted, nulls as NULL."""
        cells = []
        for value in row.values:
            if pd.isnull(value):
                cells.append('NULL')
            elif isinstance(value, str):
                # Escape backslashes and quotes so a value cannot terminate
                # the string literal early.
                escaped = value.replace('\\', '\\\\').replace("'", "\\'")
                cells.append(f"'{escaped}'")
            else:
                cells.append(str(value))
        return '(' + ','.join(cells) + ')'

    def generate_sql_file(ll):
        """Build the MERGE template for a list of ``(column, sql_type)`` pairs.

        The returned text still contains the ``{proj_id}``, ``{dataset_id}``,
        ``{table_id}`` and ``{new_values}`` placeholders for ``str.format``.
        """
        names = [name for name, _ in ll]
        parts = [
            'MERGE INTO `{proj_id}.{dataset_id}.{table_id}` a',
            ' USING UNNEST([struct<',
            ','.join('{} {}'.format(name, sql_type) for name, sql_type in ll),
            '>{new_values}]) b',
            ' ON a.ban = b.ban and a.score_date = b.score_date',
            ' WHEN MATCHED THEN UPDATE SET ',
            ','.join('a.{0}=b.{0}'.format(name) for name in names),
            ' WHEN NOT MATCHED THEN INSERT(',
            ','.join(names),
            ') VALUES(',
            ','.join('b.{}'.format(name) for name in names),
            ')',
        ]
        return ''.join(parts)

    def upsert_table(project_id, dataset_id, table_id, sql, result):
        """MERGE the rows of ``result`` into the target table and wait for the job."""
        if result.empty:
            # An empty value list would produce invalid SQL.
            return
        new_values = ',\n'.join(row_format(row) for _, row in result.iterrows())
        new_sql = sql.format(proj_id=project_id, dataset_id=dataset_id, table_id=table_id,
                             new_values=new_values)
        # The client is rebuilt per batch, as in the original: the raw gcloud
        # access token is short-lived and a credentials object built from a
        # bare token cannot refresh itself.
        bq_client = get_gcp_bqclient(project_id)
        # result() blocks until the job finishes and raises if it failed;
        # previously the job was fired and never checked.
        bq_client.query(new_sql).result()

    MODEL_PATH = '{}_xgb_models/'.format(service_type)
    df_score = pd.read_csv('gs://{}/{}_score.csv.gz'.format(file_bucket, service_type), compression='gzip')
    df_score.dropna(subset=['ban'], inplace=True)
    df_score.reset_index(drop=True, inplace=True)
    print('......scoring data loaded:{}'.format(df_score.shape))
    # NOTE(review): the fixed pauses in this function are kept from the
    # original; nothing here depends on them.
    time.sleep(10)

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(file_bucket)
    model_prefix = '{}{}_models_xgb_'.format(MODEL_PATH, service_type)
    model_names = [blob.name for blob in storage_client.list_blobs(file_bucket, prefix=model_prefix)]
    if not model_names:
        raise FileNotFoundError(
            'no model found under gs://{}/{}'.format(file_bucket, model_prefix)
        )

    # The last listed blob is used; presumably the newest model by name --
    # confirm against the model naming scheme.
    # NOTE(security): pickle.loads executes arbitrary code from the file; this
    # is only safe while the bucket contents are trusted.
    model_dict = pickle.loads(bucket.blob(model_names[-1]).download_as_string())
    model_xgb = model_dict['model']
    features = model_dict['features']
    print('...... model loaded')
    time.sleep(10)

    ll = [('ban', 'string'), ('score_date', 'string'), ('model_id', 'string'), ('score', 'float64')]
    sql = generate_sql_file(ll)

    df_score['ban'] = df_score['ban'].astype(int)
    print('.... scoring for {} promo expiry bans base'.format(len(df_score)))

    # Score everything once, then save the full result to the bucket.
    # NOTE(review): `ntree_limit` is deprecated in newer xgboost releases
    # (replaced by `iteration_range`); verify against the pinned version.
    pred_prob = model_xgb.predict_proba(df_score[features], ntree_limit=model_xgb.best_iteration)[:, 1]
    result = pd.DataFrame({
        'ban': df_score['ban'].astype(str).tolist(),
        'score_date': score_date_dash,
        'model_id': MODEL_ID,
        'score': pd.Series(pred_prob).fillna(0.0).astype('float64').tolist(),
    })

    result.to_csv('gs://{}/ucar/{}_prediction.csv.gz'.format(file_bucket, service_type), compression='gzip',
                  index=False)
    time.sleep(60)

    # Ceiling division: no empty trailing batch when the row count is an exact
    # multiple of the batch size (or zero).
    n_rows = len(result)
    n_batchs = (n_rows + BATCH_SIZE - 1) // BATCH_SIZE
    print('...... will upsert {} batches'.format(n_batchs))

    # Upsert slices of the already-built result instead of rebuilding an
    # identical frame for every batch.
    for i, start in enumerate(range(0, n_rows, BATCH_SIZE)):
        upsert_table(project_id,
                     dataset_id,
                     score_table,
                     sql,
                     result.iloc[start:start + BATCH_SIZE],
                     )
        if i % 20 == 0:
            print('predict for batch {} done'.format(i), end=' ')

    time.sleep(120)
    
    

### Post Process

In [None]:
def postprocess(
        project_id: str,
        file_bucket: str,
        service_type: str,
        score_date_dash: str,
):
    """Reshape the prediction file into the delivery layout and attach customer ids.

    Reads ``gs://{file_bucket}/ucar/{service_type}_prediction.csv.gz``, renames
    ``ban``/``score`` to ``bus_bacct_num``/``score_num``, adds decile,
    percentile and constant descriptor columns, left-joins one ``cust_id`` per
    account from the product-instance snapshot, sorts by descending score and
    writes the result back to the SAME file.

    NOTE: because the input file is overwritten with the renamed columns,
    running this function twice without regenerating the prediction file
    fails on the missing ``ban`` column.

    Args:
        project_id: GCP project the BigQuery client is created for.
        file_bucket: GCS bucket holding the prediction file.
        service_type: Prefix used in the prediction file name.
        score_date_dash: Value written to the ``scoring_date`` column.

    Raises:
        RuntimeError: If ``gcloud`` does not return an access token.
    """
    import os
    import time

    import google.oauth2.credentials
    import pandas as pd
    from google.cloud import bigquery

    MODEL_ID = '5090'
    file_name = 'gs://{}/ucar/{}_prediction.csv.gz'.format(file_bucket, service_type)
    df_orig = pd.read_csv(file_name, compression='gzip')
    df_orig.dropna(subset=['ban'], inplace=True)
    df_orig.reset_index(drop=True, inplace=True)
    df_orig['scoring_date'] = score_date_dash
    df_orig.ban = df_orig.ban.astype(int)
    df_orig = df_orig.rename(columns={'ban': 'bus_bacct_num', 'score': 'score_num'})
    df_orig.score_num = df_orig.score_num.astype(float)

    # Decile 1 holds the highest scores, decile 10 the lowest.
    decile_labels = list(range(10, 0, -1))
    try:
        df_orig['decile_grp_num'] = pd.qcut(df_orig['score_num'], q=10, labels=decile_labels)
    except ValueError:
        # Many tied scores give duplicate quantile edges, which qcut rejects.
        # Binning the first-occurrence rank instead always yields ten bins.
        df_orig['decile_grp_num'] = pd.qcut(
            df_orig['score_num'].rank(method='first'), q=10, labels=decile_labels
        )
    df_orig['percentile_pct'] = df_orig.score_num.rank(pct=True)
    df_orig['predict_model_nm'] = 'FFH Call To Retention Model - DIVG'
    df_orig['model_type_cd'] = 'FFH'
    df_orig['subscriber_no'] = ""
    df_orig['prod_instnc_resrc_str'] = ""
    df_orig['service_instnc_id'] = ""
    df_orig['segment_nm'] = ""
    df_orig['segment_id'] = ""
    df_orig['classn_nm'] = ""
    df_orig['predict_model_id'] = MODEL_ID
    df_orig.drop(columns=['model_id', 'score_date'], inplace=True)

    get_cust_id = """
    WITH bq_snpsht_max_date AS(
    SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS max_date
        FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.INFORMATION_SCHEMA.PARTITIONS` 
    WHERE table_name = 'bq_prod_instnc_snpsht' 
        AND partition_id <> '__NULL__'
    ),
    -- BANs can have multiple Cust ID. Create rank by product type and status, prioritizing ban/cust id with active FFH products
    rank_prod_type AS (
    SELECT DISTINCT
        bacct_bus_bacct_num,
        consldt_cust_bus_cust_id AS cust_id,
        CASE WHEN pi_prod_instnc_resrc_typ_cd IN ('SING', 'HSIC', 'TTV', 'SMHM', 'STV', 'DIIC') AND pi_prod_instnc_stat_cd = 'A' THEN 1
                WHEN pi_prod_instnc_resrc_typ_cd IN ('SING', 'HSIC', 'TTV', 'SMHM', 'STV', 'DIIC') THEN 2
                WHEN pi_prod_instnc_stat_cd = 'A' THEN 3
                ELSE 4
                END AS prod_rank
    FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht`
    CROSS JOIN bq_snpsht_max_date
    WHERE CAST(prod_instnc_ts AS DATE)=bq_snpsht_max_date.max_date
    AND bus_prod_instnc_src_id = 1001
    ),
    --Rank Cust ID
    rank_cust_id AS (
    SELECT DISTINCT
        bacct_bus_bacct_num,
        cust_id,
        RANK() OVER(PARTITION BY bacct_bus_bacct_num
                        ORDER BY prod_rank,
                                    cust_id) AS cust_id_rank               
    FROM rank_prod_type
    )
    --Select best cust id
    SELECT bacct_bus_bacct_num,
        cust_id
    FROM rank_cust_id
    WHERE cust_id_rank = 1
    """

    def get_gcp_bqclient(project_id, use_local_credential=True):
        """Build a BigQuery client, by default using the local gcloud user token."""
        if not use_local_credential:
            # Application Default Credentials are resolved only when requested.
            return bigquery.Client(project=project_id)
        with os.popen('gcloud auth print-access-token') as pipe:
            token = pipe.read().strip()
        if not token:
            raise RuntimeError(
                "'gcloud auth print-access-token' returned no token; "
                "authenticate with gcloud first"
            )
        credentials = google.oauth2.credentials.Credentials(token)
        return bigquery.Client(project=project_id, credentials=credentials)

    client = get_gcp_bqclient(project_id)

    # Left join on the account number: every scored account is kept, with a
    # missing cust_id when the snapshot has no match.
    df_cust = client.query(get_cust_id).to_dataframe()
    df_final = df_orig.set_index('bus_bacct_num').join(df_cust.set_index('bacct_bus_bacct_num')).reset_index()
    df_final = df_final.rename(columns={'index': 'bus_bacct_num', 'cust_bus_cust_id': 'cust_id'})
    df_final = df_final.sort_values(by=['score_num'], ascending=False)
    df_final.to_csv(file_name, compression='gzip', index=False)
    # NOTE(review): fixed five-minute pause kept from the original; nothing in
    # this function depends on it.
    time.sleep(300)


### Pipeline

In [None]:

# @dsl.pipeline(
#     # A name for the pipeline.
#     name="{}-xgb-pipeline".format(SERVICE_TYPE_NAME),
#     description=' pipeline for training {} model'.format(SERVICE_TYPE_NAME)
# )
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET,
        file_bucket: str = FILE_BUCKET
    ):
    """Run the scoring steps in order: dataset build, preprocess, predict, postprocess.

    The steps are plain function calls executed one after another. Dates, the
    dataset id and the service type come from module-level settings.

    Args:
        project_id: GCP project passed to every step.
        region: Region passed to the dataset-creation step.
        resource_bucket: Not used by any step; kept for interface compatibility.
        file_bucket: GCS bucket for the scoring CSV and the prediction file.
    """
    # The arguments are now honoured; previously the module-level constants
    # were used and the parameters were ignored.

    # ----- create scoring dataset --------
    bq_create_dataset(score_date=SCORE_DATE_DASH,
                      score_date_delta=SCORE_DATE_DELTA,
                      project_id=project_id,
                      dataset_id=DATASET_ID,
                      region=region,
                      promo_expiry_start=PROMO_EXPIRY_START,
                      promo_expiry_end=PROMO_EXPIRY_END,
                      v_start_date=SCORE_DATE_MINUS_6_MOS_DASH,
                      v_end_date=SCORE_DATE_LAST_MONTH_END_DASH)

    # ----- preprocess scoring data --------
    preprocess(
        pipeline_dataset='bq_ctr_pipeline_dataset',
        save_data_path='gs://{}/{}_score.csv.gz'.format(file_bucket, SERVICE_TYPE),
        project_id=project_id,
        dataset_id=DATASET_ID
    )

    # ----- score and upsert predictions --------
    batch_prediction(
        project_id=project_id,
        dataset_id=DATASET_ID,
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
        score_table='bq_call_to_retention_scores',
    )

    # ----- reshape the prediction file for delivery --------
    postprocess(
        project_id=project_id,
        file_bucket=file_bucket,
        service_type=SERVICE_TYPE,
        score_date_dash=SCORE_DATE_DASH,
    )


### Run the Pipeline Job

In [None]:
# pipeline(project_id = PROJECT_ID,
#         region = REGION,
#         resource_bucket = RESOURCE_BUCKET,
#         file_bucket = FILE_BUCKET)


# Run all scoring steps with the notebook-level settings.
pipeline(
    project_id=PROJECT_ID,
    region=REGION,
    resource_bucket=RESOURCE_BUCKET,
    file_bucket=FILE_BUCKET,
)

In [None]:
# from kfp.v2 import compiler
# from google.cloud.aiplatform import pipeline_jobs

# import json

# compiler.Compiler().compile(
#    pipeline_func=pipeline, package_path="pipeline.json"
# )

# job = pipeline_jobs.PipelineJob(
#                                display_name=PIPELINE_NAME,
#                                template_path="pipeline.json",
#                                location=REGION,
#                                enable_caching=False,
#                                pipeline_root = f"gs://{RESOURCE_BUCKET}"
# )
# job.run(
#    service_account = f"bilayer-sa@{PROJECT_ID}.iam.gserviceaccount.com"
# )