In [1]:

import os
import re
import google
from google.oauth2 import credentials
from google.oauth2 import service_account
from google.oauth2.service_account import Credentials
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta

SERVICE_TYPE = 'whsia-churn'
PROJECT_ID = 'divg-josh-pr-d1cc3a'
DATASET_ID = 'whsia_churn_dataset'
TABLE_ID = 'bq_whsia_churn_score'
RESOURCE_BUCKET = 'divg-josh-pr-d1cc3a-default'
FILE_BUCKET = 'divg-josh-pr-d1cc3a-default'
REGION = 'northamerica-northeast1'
FOLDER_NAME = 'whsia_churn_deploy'
QUERIES_PATH = 'vertex_pipelines/' + FOLDER_NAME + '/queries/'

QUERY_DATE = (date.today() - relativedelta(days=1)).strftime('%Y-%m-%d')
wHSIA_QUERY_VIEW_NAME = 'whsia_query_path_view'
wHSIA_QUERY_PATH = QUERIES_PATH + 'whsia_bans_query.txt'
TARGET_TABLE_REF = '{}.{}.{}'.format(PROJECT_ID, DATASET_ID, TABLE_ID)


In [2]:
def create_wHSIA_view(view_name: str,
                    query_date: str,
                    project_id: str,
                    dataset_id: str,
                    region: str,
                    resource_bucket: str,
                    query_path: str,
                    ):

    from google.cloud import bigquery
    from google.cloud import storage

    def if_tbl_exists(client, table_ref):
        from google.cloud.exceptions import NotFound
        try:
            client.get_table(table_ref)
            return True
        except NotFound:
            return False

    def get_gcp_bqclient(project_id, use_local_credential=True):
        token = os.popen('gcloud auth print-access-token').read()
        token = re.sub(f'\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)

        bq_client = bigquery.Client(project=project_id)
        if use_local_credential:
            bq_client = bigquery.Client(project=project_id, credentials=credentials)
        return bq_client

    # bq_client = bigquery.Client(project=project_id)
    bq_client = get_gcp_bqclient(project_id)
    dataset = bq_client.dataset(dataset_id)
    table_ref = dataset.table(view_name)

    # load query from .txt file
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(resource_bucket)
    blob = bucket.get_blob(query_path)
    content = blob.download_as_string()
    content = str(content, 'utf-8')

    if if_tbl_exists(bq_client, table_ref):
        bq_client.delete_table(table_ref)

    create_wHSIA_query = content.format(query_date=query_date)
    shared_dataset_ref = bq_client.dataset(dataset_id)
    base_feature_set_view_ref = shared_dataset_ref.table(view_name)
    base_feature_set_view = bigquery.Table(base_feature_set_view_ref)
    base_feature_set_view.view_query = create_wHSIA_query
    base_feature_set_view = bq_client.create_table(base_feature_set_view)


In [3]:
def wHSIA_processing(wHSIA_view: str,
                   project_id: str,
                   dataset_id: str,
                   table_id: str,
                   query_date: str,
                   file_bucket: str,
                   ):

    from google.cloud import bigquery
    import pandas as pd
    import numpy as np
    import time

    def upsert_table(project_id, dataset_id, table_id, sql, result):
        new_values = ',\n'.join(result.apply(lambda row: row_format(row), axis=1))
        
        new_sql = sql.format(proj_id=project_id, dataset_id=dataset_id, table_id=table_id,
                             new_values=new_values)
        
        def get_gcp_bqclient(project_id, use_local_credential=True):
            token = os.popen('gcloud auth print-access-token').read()
            token = re.sub(f'\n$', '', token)
            credentials = google.oauth2.credentials.Credentials(token)

            bq_client = bigquery.Client(project=project_id)
            if use_local_credential:
                bq_client = bigquery.Client(project=project_id, credentials=credentials)
            return bq_client

        # bq_client = bigquery.Client(project=project_id)
        bq_client = get_gcp_bqclient(project_id)
        
        code = bq_client.query(new_sql)
        time.sleep(5)

    def row_format(row):
        values = row.values
        new_values = ""
        v = str(values[0]) if not pd.isnull(values[0]) else 'NULL'
        if 'str' in str(type(values[0])):
            new_values += f"'{v}'"
        else:
            new_values += f"{v}"

        for i in range(1, len(values)):
            v = str(values[i]) if not pd.isnull(values[i]) else 'NULL'
            if 'str' in str(type(values[i])):
                new_values += f",'{v}'"
            else:
                new_values += f",{v}"
        return '(' + new_values + ')'

    def generate_sql_file(ll):
        s = 'MERGE INTO `{proj_id}.{dataset_id}.{table_id}` a'
        s += " USING UNNEST("
        s += "[struct<"
        for i in range(len(ll) - 1):
            v = ll[i]
            s += "{} {},".format(v[0], v[1])
        s += "{} {}".format(ll[-1][0], ll[-1][1])
        s += ">{new_values}]"
        s += ") b"
        s += " ON a.score_date = b.score_date and a.ban = b.ban"
        s += " WHEN MATCHED THEN"
        s += " UPDATE SET "
        s += "a.{}=b.{},".format(ll[0][0], ll[0][0])
        for i in range(1, len(ll) - 1):
            v = ll[i]
            s += "a.{}=b.{},".format(v[0], v[0])
        s += "a.{}=b.{}".format(ll[-1][0], ll[-1][0])
        s += " WHEN NOT MATCHED THEN"
        s += " INSERT("
        for i in range(len(ll) - 1):
            v = ll[i]
            s += "{},".format(v[0])
        s += "{})".format(ll[-1][0])
        s += " VALUES("
        for i in range(len(ll) - 1):
            s += "b.{},".format(ll[i][0])
        s += "b.{}".format(ll[-1][0])
        s += ")"

        return s 

    MODEL_ID = '5070'
    
    def get_gcp_bqclient(project_id, use_local_credential=True):
        token = os.popen('gcloud auth print-access-token').read()
        token = re.sub(f'\n$', '', token)
        credentials = google.oauth2.credentials.Credentials(token)

        bq_client = bigquery.Client(project=project_id)
        if use_local_credential:
            bq_client = bigquery.Client(project=project_id, credentials=credentials)
        return bq_client

    # bq_client = bigquery.Client(project=project_id)
    bq_client = get_gcp_bqclient(project_id)
    
    wHSIA_data = f"{project_id}.{dataset_id}.{wHSIA_view}"

    wHSIA = '''SELECT * FROM `{wHSIA_data}`'''.format(wHSIA_data=wHSIA_data)
    df_wHSIA = bq_client.query(wHSIA).to_dataframe()
    cols = ['ban', 'score_date', 'model_id', 'score']
    df_wHSIA = df_wHSIA[cols]
    print('......wHSIA table generated with {} samples'.format(df_wHSIA.shape[0]))

    # save current results to bucket for UCAR inputs
    file_name = 'gs://{}/ucar/wHSIA_churn.csv.gz'.format(file_bucket)
    results = df_wHSIA
    results.to_csv(file_name, compression='gzip', index=False)

    ll = [('ban', 'string'), ('score_date', 'string'), ('model_id', 'string'), ('score', 'float64')]
    sql = generate_sql_file(ll)

    batch_size = 2000
    n_batchs = int(df_wHSIA.shape[0] / batch_size) + 1
    print('...... will upsert {} batches'.format(n_batchs))
    df_wHSIA['ban'] = df_wHSIA['ban'].astype(str)
    df_wHSIA['model_id'] = df_wHSIA['model_id'].astype(str)
    df_wHSIA['score_date'] = df_wHSIA['score_date'].astype(str)
    
    print(df_wHSIA.head())
    print(df_wHSIA.shape)

    # all_scores = np.array(df_wHSIA['score'].values)
    for i in range(n_batchs):
        s, e = i * batch_size, (i + 1) * batch_size
        if e >= df_wHSIA.shape[0]:
            e = df_wHSIA.shape[0]

        df_temp = df_wHSIA.iloc[s:e]
#         pred_prob = all_scores[s:e]
        
#         batch_result = pd.DataFrame(columns=['ban', 'score_date', 'model_id', 'score'])
#         batch_result['score'] = list(pred_prob)
#         batch_result['score'] = batch_result['score'].fillna(0.0).astype('float64')
#         batch_result['ban'] = list(df_temp['ban'])
#         batch_result['ban'] = batch_result['ban'].astype('str')
#         batch_result['score_date'] = score_date_dash
#         batch_result['model_id'] = MODEL_ID
        
        upsert_table(project_id,
                     dataset_id,
                     table_id,
                     sql,
                     df_temp,
                     )
        if i % 20 == 0:
            print('predict for batch {} done'.format(i), end=' ')


In [4]:
def pipeline(
        project_id: str = PROJECT_ID,
        region: str = REGION,
        resource_bucket: str = RESOURCE_BUCKET
):

    # -------------  create ops ---------------
    create_wHSIA_view_op = create_wHSIA_view(
        view_name=wHSIA_QUERY_VIEW_NAME,
        query_date=QUERY_DATE,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        region=REGION,
        resource_bucket=RESOURCE_BUCKET,
        query_path=wHSIA_QUERY_PATH
    )        

    wHSIA_processing_op = wHSIA_processing(
        wHSIA_view=wHSIA_QUERY_VIEW_NAME,
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        table_id=TABLE_ID,
        query_date=QUERY_DATE,
        file_bucket=FILE_BUCKET
    )
    
    create_wHSIA_view_op
    wHSIA_processing_op

In [5]:
pipeline(project_id = PROJECT_ID, region = REGION, resource_bucket = RESOURCE_BUCKET, file_bucket = FILE_BUCKET)

......wHSIA table generated with 10000 samples
...... will upsert 6 batches
         ban  score_date model_id     score
0  214240481  2023-03-15     5060  0.188538
1  605221661  2023-03-15     5060  0.191681
2  603041261  2023-03-15     5060  0.192536
3  604181301  2023-03-15     5060  0.192536
4  603936692  2023-03-15     5060  0.188538
(10000, 4)
predict for batch 0 done 