In [1]:


def main(mapping):
    """Define the wHSIA churn KFP pipeline and return the pipeline function.

    The pipeline has two steps:

    1. ``create_wHSIA_view`` - reads a query template from GCS and (re)creates a
       BigQuery view from it.
    2. ``wHSIA_processing`` - reads that view, writes the rows to a gzipped CSV
       in GCS, and MERGEs them into the score table in batches.

    Args:
        mapping: dict-like configuration. The keys read here are ``PROJECT_ID``,
            ``resources_bucket``, ``gcs_csv_bucket`` and ``REGION``.

    Returns:
        The ``@dsl.pipeline``-decorated ``pipeline`` function. Nothing is
        compiled or submitted by this function.
    """
    print(mapping)
    from kfp import dsl
    from kfp.v2.dsl import component
    from datetime import date
    from dateutil.relativedelta import relativedelta

    DATASET_ID = 'whsia_churn_dataset'
    PROJECT_ID = mapping['PROJECT_ID']
    RESOURCE_BUCKET = mapping['resources_bucket']
    FILE_BUCKET = mapping['gcs_csv_bucket']
    REGION = mapping['REGION']
    FOLDER_NAME = 'whsia_churn_deploy'
    TABLE_ID = 'bq_whsia_churn_score'
    QUERIES_PATH = 'vertex_pipelines/' + FOLDER_NAME + '/queries/'

    # Yesterday's date (relative to when main() is called), as YYYY-MM-DD.
    QUERY_DATE = (date.today() - relativedelta(days=1)).strftime('%Y-%m-%d')
    wHSIA_QUERY_VIEW_NAME = 'whsia_query_path_view'
    wHSIA_QUERY_PATH = QUERIES_PATH + 'create_input_account_active_whsia_bans_query.txt'

    # Both components run on the same container image.
    BASE_IMAGE = "northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/bi-platform/vertex_pipelines/kfp-preprocess-slim:latest"

    @component(
        base_image=BASE_IMAGE,
        output_component_file="whsia_churn_model_bans_list.yaml",
    )
    def create_wHSIA_view(view_name: str,
                          query_date: str,
                          project_id: str,
                          dataset_id: str,
                          region: str,
                          resource_bucket: str,
                          query_path: str,
                          ):
        """(Re)create a BigQuery view from a query template stored in GCS.

        Args:
            view_name: name of the view to create inside ``dataset_id``.
            query_date: value substituted for ``{query_date}`` in the template.
            project_id: project used for the BigQuery client.
            dataset_id: dataset that holds the view.
            region: not used in the body; kept so the component interface is unchanged.
            resource_bucket: GCS bucket that holds the query template.
            query_path: object path of the query template inside ``resource_bucket``.

        Raises:
            FileNotFoundError: if the query template object does not exist.
        """
        # Imports and helpers stay inside the component body, as in the original
        # (presumably because KFP function-based components must be self-contained).
        from google.cloud import bigquery
        from google.cloud import storage

        # Load and render the query template BEFORE touching the existing view,
        # so a missing or malformed template does not leave the view deleted.
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(resource_bucket)
        blob = bucket.get_blob(query_path)
        if blob is None:
            # get_blob returns None for a missing object; fail with a clear message
            # instead of an AttributeError on None.
            raise FileNotFoundError(
                'query file gs://{}/{} not found'.format(resource_bucket, query_path))
        query_template = blob.download_as_string().decode('utf-8')
        view_query = query_template.format(query_date=query_date)

        bq_client = bigquery.Client(project=project_id)
        view_ref = bq_client.dataset(dataset_id).table(view_name)

        # Drop any previous version of the view; a missing view is not an error.
        bq_client.delete_table(view_ref, not_found_ok=True)

        view = bigquery.Table(view_ref)
        view.view_query = view_query
        bq_client.create_table(view)

    @component(
        base_image=BASE_IMAGE,
        output_component_file="whsia_churn_process.yaml",
    )
    def wHSIA_processing(wHSIA_view: str,
                         project_id: str,
                         dataset_id: str,
                         table_id: str,
                         query_date: str,
                         file_bucket: str,
                         ):
        """Export the view's rows to GCS and upsert them into the score table.

        Reads ``ban``, ``score_date``, ``model_id`` and ``score`` from the view,
        writes them to ``gs://<file_bucket>/ucar/wHSIA_churn.csv.gz`` and MERGEs
        them into ``project_id.dataset_id.table_id`` keyed on (score_date, ban).

        Args:
            wHSIA_view: name of the view (in ``dataset_id``) to read from.
            project_id: project used for the BigQuery client and table names.
            dataset_id: dataset holding both the view and the target table.
            table_id: target table of the MERGE.
            query_date: not used in the body; kept so the component interface is unchanged.
            file_bucket: GCS bucket that receives the CSV export.
        """
        # Imports and helpers stay inside the component body, as in the original
        # (presumably because KFP function-based components must be self-contained).
        from google.cloud import bigquery
        import pandas as pd

        def sql_literal(value):
            """Render one cell as a SQL literal: NULL, a quoted string, or str(value)."""
            if pd.isnull(value):
                return 'NULL'
            if isinstance(value, str):
                # Escape characters that would otherwise terminate or corrupt
                # the single-quoted SQL string literal.
                escaped = (value.replace('\\', '\\\\')
                           .replace("'", "\\'")
                           .replace('\n', '\\n')
                           .replace('\r', '\\r'))
                return "'{}'".format(escaped)
            return str(value)

        def row_format(row):
            """Render one DataFrame row as a parenthesised, comma-separated tuple."""
            return '(' + ','.join(sql_literal(v) for v in row.values) + ')'

        def generate_sql_file(columns):
            """Build the MERGE template for ``columns``, a list of (name, sql_type).

            The returned string still contains the ``{proj_id}``, ``{dataset_id}``,
            ``{table_id}`` and ``{new_values}`` placeholders.
            """
            names = [name for name, _ in columns]
            struct_fields = ','.join('{} {}'.format(name, sql_type) for name, sql_type in columns)
            update_set = ','.join('a.{0}=b.{0}'.format(name) for name in names)
            insert_cols = ','.join(names)
            insert_vals = ','.join('b.{}'.format(name) for name in names)
            return (
                'MERGE INTO `{proj_id}.{dataset_id}.{table_id}` a'
                ' USING UNNEST([struct<' + struct_fields + '>{new_values}]) b'
                ' ON a.score_date = b.score_date and a.ban = b.ban'
                ' WHEN MATCHED THEN UPDATE SET ' + update_set +
                ' WHEN NOT MATCHED THEN INSERT(' + insert_cols + ')'
                ' VALUES(' + insert_vals + ')'
            )

        def upsert_table(bq_client, project_id, dataset_id, table_id, sql, result):
            """MERGE the rows of ``result`` (a non-empty DataFrame) into the target table."""
            new_values = ',\n'.join(result.apply(row_format, axis=1))
            new_sql = sql.format(proj_id=project_id, dataset_id=dataset_id, table_id=table_id,
                                 new_values=new_values)
            # Block until the job finishes so failures raise here instead of
            # being dropped; this replaces the fixed sleeps used previously.
            bq_client.query(new_sql).result()

        client = bigquery.Client(project=project_id)
        wHSIA_data = f"{project_id}.{dataset_id}.{wHSIA_view}"

        df_wHSIA = client.query('SELECT * FROM `{}`'.format(wHSIA_data)).to_dataframe()
        cols = ['ban', 'score_date', 'model_id', 'score']
        df_wHSIA = df_wHSIA[cols]
        print('......wHSIA table generated with {} samples'.format(df_wHSIA.shape[0]))

        # save current results to bucket for UCAR inputs
        # (written before the string casts below, i.e. with the dtypes returned by the query)
        file_name = 'gs://{}/ucar/wHSIA_churn.csv.gz'.format(file_bucket)
        df_wHSIA.to_csv(file_name, compression='gzip', index=False)

        # FLOAT64 is the canonical GoogleSQL name for the floating point type.
        columns = [('ban', 'string'), ('score_date', 'string'), ('model_id', 'string'), ('score', 'float64')]
        sql = generate_sql_file(columns)

        # Key/id columns are sent as SQL strings to match the struct declaration above.
        df_upsert = df_wHSIA.astype({'ban': str, 'model_id': str, 'score_date': str})

        batch_size = 2000
        n_rows = df_upsert.shape[0]
        # Stepping by batch_size yields exactly ceil(n_rows / batch_size) batches,
        # so no empty batch is produced for 0 rows or an exact multiple.
        batch_starts = range(0, n_rows, batch_size)
        print('...... will upsert {} batches'.format(len(batch_starts)))

        for i, start in enumerate(batch_starts):
            df_temp = df_upsert.iloc[start:start + batch_size]

            upsert_table(client,
                         project_id,
                         dataset_id,
                         table_id,
                         sql,
                         df_temp,
                         )
            if i % 20 == 0:
                print('predict for batch {} done'.format(i), end=' ')

    @dsl.pipeline(
        # A name for the pipeline.
        name="whsia-churn-base-table",
        description='pipeline for whsia churn - part 1'
    )
    def pipeline(
            project_id: str = PROJECT_ID,
            region: str = REGION,
            resource_bucket: str = RESOURCE_BUCKET,
            file_bucket: str = FILE_BUCKET
    ):
        # NOTE(review): the pipeline parameters above are declared but the ops
        # below are wired to the enclosing constants (unchanged from the
        # original), so overriding a parameter at submission has no effect on
        # the ops. Confirm whether that is intended.

        # -------------  create ops ---------------
        create_wHSIA_view_op = create_wHSIA_view(
            view_name=wHSIA_QUERY_VIEW_NAME,
            query_date=QUERY_DATE,
            project_id=PROJECT_ID,
            dataset_id=DATASET_ID,
            region=REGION,
            resource_bucket=RESOURCE_BUCKET,
            query_path=wHSIA_QUERY_PATH,
        )
        create_wHSIA_view_op.set_memory_limit('32G')
        create_wHSIA_view_op.set_cpu_limit('4')

        wHSIA_processing_op = wHSIA_processing(
            wHSIA_view=wHSIA_QUERY_VIEW_NAME,
            project_id=PROJECT_ID,
            dataset_id=DATASET_ID,
            table_id=TABLE_ID,
            query_date=QUERY_DATE,
            file_bucket=FILE_BUCKET,
        )
        wHSIA_processing_op.set_memory_limit('32G')
        wHSIA_processing_op.set_cpu_limit('4')
        # The view must exist before it is read. (The original referenced an
        # undefined name, `create_wHSIA_op`, here.)
        wHSIA_processing_op.after(create_wHSIA_view_op)

    return pipeline

In [7]:
from datetime import date
from dateutil.relativedelta import relativedelta

# --- identifiers (PROJECT_ID holds the literal placeholder string 'project_id') ---
SERVICE_TYPE = 'whsia-churn'
PROJECT_ID = 'project_id'
DATASET_ID = 'whsia_churn_dataset'
TABLE_ID = 'bq_whsia_churn_score'

# --- location of the query text files ---
FOLDER_NAME = 'whsia_churn_deploy'
QUERIES_PATH = ''.join(['vertex_pipelines/', FOLDER_NAME, '/queries/'])

# --- query inputs ---
# Yesterday's date relative to when this cell runs, formatted YYYY-MM-DD.
QUERY_DATE = (date.today() + relativedelta(days=-1)).strftime('%Y-%m-%d')
wHSIA_QUERY_PATH_NAME = 'whsia_query_path_view'
wHSIA_QUERY_PATH = ''.join([QUERIES_PATH, 'wHSIA.txt'])

In [8]:
# Display the current value of QUERY_DATE (it depends on the day the defining cell was run).
QUERY_DATE

'2023-03-23'

In [9]:
# Display the current value of wHSIA_QUERY_PATH_NAME.
wHSIA_QUERY_PATH_NAME

'whsia_query_path_view'

In [10]:
# Display the current value of wHSIA_QUERY_PATH (QUERIES_PATH plus the query file name).
wHSIA_QUERY_PATH

'vertex_pipelines/whsia_churn_deploy/queries/wHSIA.txt'