# Vertex pipeline

This notebook takes the work of defining the BQML PMI matrix factorization and ANN deployment found in notebooks 01 and 02, and will then create a queryable ScaNN index as seen in notebook 05. If the Keras model needs to be converted, the steps below should explain how to productionize it. Many of these steps are explained in this [repo](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/master/notebooks/official/pipelines/pipelines_intro_kfp.ipynb).

The goals of this notebook are:
1. Define the tasks that will be encapsulated in pipeline components
2. Define the pipeline
3. Run and monitor the pipeline

In [1]:
from datetime import datetime
import os

# Project-level settings (the "@param" markers are kept for notebook form fields).
PROJECT_ID = "rec-ai-demo-326116"  # @param {type:"string"}
REGION = "us-central1"  # @param {type: "string"}

# Timestamp of this session, formatted as YYYYmmddHHMMSS.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Cloud Storage locations.
BUCKET = 'rec_bq_jsw'  # Change to the bucket you created.
BUCKET_NAME = 'gs://' + BUCKET
output_dir = f'{BUCKET_NAME}/bqml/item_embeddings'
temp_location = os.path.join(output_dir, 'tmp')

# BigQuery dataset and the table that holds the item embeddings.
DATASET_NAME = "css_retail"
embeddings_table_name = 'item_embeddings'


In [2]:
# Look up the credentialed account reported by gcloud; it is used below as the
# service account that is granted access to the bucket.
# NOTE(review): this relies on the account being on the third line of the
# `gcloud auth list` output — confirm for your environment.
shell_output = !gcloud auth list 2>/dev/null
SERVICE_ACCOUNT = shell_output[2].strip()
print("Service Account:", SERVICE_ACCOUNT)

Service Account: 733956866731-compute@developer.gserviceaccount.com


### Set service account access for Vertex Pipelines
Run the following commands to grant your service account access to read and write pipeline artifacts in the bucket that you created in the previous step -- you only need to run these once per service account.

In [3]:
# Grant the service account the storage.objectCreator role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_NAME

# Grant the service account the storage.objectViewer role on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_NAME

No changes made to gs://rec_bq_jsw/
No changes made to gs://rec_bq_jsw/


In [4]:
import google.cloud.aiplatform as aip
# Regional API service endpoint, derived from REGION.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Folder inside the bucket that is passed to the pipeline as its root.
PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/intro"

In [5]:
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component

## Initialize Vertex SDK for Python
Initialize the Vertex SDK for Python for your project and corresponding bucket.

In [6]:
# Pass the region explicitly so the SDK targets the location configured in
# REGION instead of falling back to its built-in default.
aip.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

### Set up BigQuery DDLs for the pipelines
These functions will be leveraged to make BigQuery calls and train the BQML models. A guide can be found [here](https://medium.com/google-cloud/using-bigquery-and-bigquery-ml-from-kubeflow-pipelines-991a2fa4bea8); this will be a reusable component to interface with BigQuery via query strings.

### Define Python function-based pipeline components
In this tutorial, you define a pipeline whose steps are built from two Python function-based components: a generic BigQuery query runner and a stored-procedure loader.

### First component: run a BigQuery query (used, for example, to call the `sp_ComputePMI` stored proc)

In [7]:
from typing import NamedTuple

In [8]:
@component(
    output_component_file="bqml_scann_pipeline.yaml",
    base_image="python:3.7",
    packages_to_install=['google-cloud-bigquery'],
)
def run_a_bq_call(
    project: str, query: str
) -> NamedTuple('Outputs', [('COMPLETE_CODE', str)]):
    """Submit one query to BigQuery and wait for its result.

    Args:
        project: Project passed to the BigQuery client.
        query: SQL text submitted as a single query.

    Returns:
        A one-element tuple whose ``COMPLETE_CODE`` value is the string "200".
    """
    # Imported inside the body because the component function is executed in
    # its own container image.
    from google.cloud import bigquery

    bigquery.Client(project=project).query(query).result()

    return ("200",)

## Declare the pipeline

In [9]:
# scratch area

In [10]:
# Loads the stored procedures found in {bucket}/sql_scripts (uploaded in the
# 00_prep_bq notebook) into BigQuery.

@component(packages_to_install=['google-cloud-bigquery', 'apache-beam[gcp]', 'gsutil']
          )
def load_sprocs(
  bucket: str, dataset_name: str) -> NamedTuple('Outputs', [('COMPLETE_CODE', str)]):
    """Download the ``*.sql`` scripts from the bucket and execute them in BigQuery.

    Every ``@DATASET_NAME`` placeholder in a script is replaced with
    ``dataset_name`` before the script is submitted.

    Args:
        bucket: Bucket URI (e.g. ``gs://my-bucket``) containing ``sql_scripts/``.
        dataset_name: BigQuery dataset name substituted into the scripts.

    Returns:
        A one-element tuple whose ``COMPLETE_CODE`` value is the string "200".

    Raises:
        subprocess.CalledProcessError: If the ``gsutil cp`` download fails.
    """
    import os
    import subprocess

    from google.cloud import bigquery

    downloads_dir = "downloads_"
    # exist_ok keeps a retried or re-run step from failing on the leftover folder.
    os.makedirs(downloads_dir, exist_ok=True)

    # Argument list (no shell) so the bucket value is never shell-interpreted;
    # the wildcard is part of the gs:// URL handed to gsutil. check=True makes a
    # failed download stop the step instead of silently loading nothing.
    subprocess.run(
        ["gsutil", "cp", f"{bucket}/sql_scripts/*.sql", downloads_dir + "/"],
        check=True,
    )

    sql_scripts = {}
    for script_file in sorted(os.listdir(downloads_dir)):
        if not script_file.endswith('.sql'):
            continue
        with open(os.path.join(downloads_dir, script_file), 'r') as handle:
            sql_scripts[script_file] = handle.read().replace('@DATASET_NAME', dataset_name)

    # NOTE(review): Vertex AI jobs are expected to expose the user project via
    # CLOUD_ML_PROJECT_ID — confirm. When it is unset, project=None lets the
    # client infer the project from the environment.
    client = bigquery.Client(project=os.environ.get("CLOUD_ML_PROJECT_ID"))

    for script_file, query in sql_scripts.items():
        print(f'Executing {script_file} script...')
        client.query(query).result()

    return (
    str(200),
    )


In [18]:
# All statements target DATASET_NAME, so that changing the dataset in the
# configuration cell is enough to retarget every query.

# Creates a placeholder co-occurrence table if it does not exist yet.
create_cooc_matrix_query = f"""
CREATE TABLE IF NOT EXISTS {DATASET_NAME}.item_cooc
AS SELECT 0 AS item1_Id, 0 AS item2_Id, 0 AS cooc, 0 AS pmi;
"""

# Creates a placeholder matrix-factorization model if it does not exist yet.
create_bqml_model_query = f"""
CREATE MODEL IF NOT EXISTS {DATASET_NAME}.item_matching_model
OPTIONS(
    MODEL_TYPE='matrix_factorization', 
    USER_COL='item1_Id', 
    ITEM_COL='item2_Id',
    RATING_COL='score'
)
AS
SELECT 0 AS item1_Id, 0 AS item2_Id, 0 AS score;
"""

# Calls the PMI stored procedure with its two threshold arguments.
compute_PMI_query = f"""
DECLARE min_item_frequency INT64;
DECLARE max_group_size INT64;

SET min_item_frequency = 15;
SET max_group_size = 100;

CALL {DATASET_NAME}.sp_ComputePMI(min_item_frequency, max_group_size);
"""

# Calls the training stored procedure with the embedding dimension count.
train_item_matching_query = f"""
DECLARE dimensions INT64 DEFAULT 50;
CALL {DATASET_NAME}.sp_TrainItemMatchingModel(dimensions)
"""

# NOTE(review): "sp_ExractEmbeddings" is kept exactly as written; it must match
# the name of the stored procedure created from the SQL scripts — verify.
extract_embeddings_query = f"""
CALL {DATASET_NAME}.sp_ExractEmbeddings() 
"""

# Exports item_id plus the embedding (joined into one comma-separated string)
# as CSV files under the bucket.
export_embeddings_query = f"""
CREATE TEMP FUNCTION array_int_to_string(int_array ARRAY<FLOAT64>) 
  RETURNS ARRAY<STRING> LANGUAGE js as "return int_array.map(x => x+'')";
  
EXPORT DATA
OPTIONS (uri='{BUCKET_NAME}/bqml/item_embeddings/*.csv',
  format='CSV',
  overwrite=true) AS
select item_id, array_to_string(array_int_to_string(embedding), ',')  as embedding_string 
from `{PROJECT_ID}.{DATASET_NAME}.item_embeddings`
"""

In [19]:
# Show the rendered export query to check the interpolated bucket and table names.
export_embeddings_query

'\nCREATE TEMP FUNCTION array_int_to_string(int_array ARRAY<FLOAT64>) \n  RETURNS ARRAY<STRING> LANGUAGE js as "return int_array.map(x => x+\'\')";\n  \nEXPORT DATA\nOPTIONS (uri=\'gs://rec_bq_jsw/bqml/item_embeddings/*.csv\',\n  format=\'CSV\',\n  overwrite=true) AS\nselect item_id, array_to_string(array_int_to_string(embedding), \',\')  as embedding_string \nfrom `rec-ai-demo-326116.css_retail.item_embeddings`\n'

### Dataflow beam component

In [12]:
# # # Notebook 2 task - export the trained embeddings to cloud storage
# @component(base_image="apache/beam_python3.7_sdk", 
#            packages_to_install=['google-cloud-bigquery']
#           )
# def beam_export_to_GCS(project: str, bq_dataset_name: str, embeddings_table_name: str, 
#         output_dir: str, region: str)-> NamedTuple('Outputs', [('COMPLETE_CODE', str)]):
#     import os
#     import apache_beam as beam
#     from apache_beam.runners.dataflow import dataflow_runner
    
#     EMBEDDING_FILE_PREFIX = 'embeddings'

#     def to_csv(entry):
#         item_Id = entry['item_Id']
#         embedding = entry['embedding']
#         csv_string = f'{item_Id},'
#         csv_string += ','.join([str(value) for value in embedding])
#         return csv_string
    
#     def get_query(dataset_name, table_name):
#         query = f'''
#         SELECT 
#           item_Id,
#           embedding
#         FROM 
#           `{dataset_name}.{table_name}`;
#         '''
#         return query


#     pipeline_options = beam.options.pipeline_options.PipelineOptions(runner='dataflow',
#                                                                      project=project,
#                                                                      job_name="copy-embeddings",
#                                                                      region='US',
#                                                                      temp_location=output_dir+"/tmp"
#                                                                     )
#     proj = pipeline_options.get_all_options()['project']
    
#     with beam.Pipeline(options=pipeline_options) as pipeline:

#         query = get_query(bq_dataset_name, embeddings_table_name)
#         output_prefix = os.path.join(output_dir, EMBEDDING_FILE_PREFIX)

#         _ = (
#         pipeline
#         | 'ReadFromBigQuery' >> beam.io.ReadFromBigQuery(
#             project=proj, query=query, use_standard_sql=True, flatten_results=False)
#         | 'ConvertToCsv' >> beam.Map(to_csv)
#         | 'WriteToCloudStorage' >> beam.io.WriteToText(
#             file_path_prefix = output_prefix,
#             file_name_suffix = ".csv")
#         )
#     return (
#     str(200),
#     )

In [22]:
@dsl.pipeline(name="bqml-scann-demo",
             description="a bqml matching engine demo",
             pipeline_root=PIPELINE_ROOT)
def pipeline(project: str=PROJECT_ID, compute_PMI_query: str=compute_PMI_query,
            train_item_matching_query: str=train_item_matching_query,
            extract_embeddings_query: str=extract_embeddings_query,
            dataset_name: str=DATASET_NAME, embeddings_table_name: str=embeddings_table_name,
             output_dir: str=output_dir, region: str=REGION,
             create_cooc_matrix_query: str = create_cooc_matrix_query,
             create_bqml_model_query: str = create_bqml_model_query,
             bucket_name: str = BUCKET_NAME,
             export_embeddings_query: str = export_embeddings_query
):
    """Chain the BigQuery steps into one strictly sequential pipeline.

    Order: create the placeholder table and model, load the stored procedures,
    compute PMI, train the model, extract the embeddings, export them to the
    bucket. Each step waits for the previous one through ``.after(...)``.

    ``embeddings_table_name``, ``output_dir`` and ``region`` are accepted but
    not used by any task in this function.
    """

    ###### NOTEBOOK 0 TASKS
    # Components are called with keyword arguments, which makes the
    # parameter-to-argument mapping explicit.
    create_cooc_matrix = run_a_bq_call(project=project, query=create_cooc_matrix_query)

    create_bqml_model = run_a_bq_call(
        project=project, query=create_bqml_model_query
    ).after(create_cooc_matrix)

    loaded_sprocs = load_sprocs(
        bucket=bucket_name, dataset_name=dataset_name
    ).after(create_bqml_model)

    ###### NOTEBOOK 1 TASKS
    compute_bq_pmi_task = run_a_bq_call(
        project=project, query=compute_PMI_query
    ).after(loaded_sprocs)

    # Training must wait for the PMI step rather than run next to it.
    # NOTE(review): presumably sp_TrainItemMatchingModel reads the table that
    # sp_ComputePMI populates — confirm against the stored procedures.
    train_bq_item_match_task = run_a_bq_call(
        project=project, query=train_item_matching_query
    ).after(compute_bq_pmi_task)

    ###### NOTEBOOK 2 TASKS
    create_bq_embeddings_task = run_a_bq_call(
        project=project, query=extract_embeddings_query
    ).after(train_bq_item_match_task)

    export_emb_gcs = run_a_bq_call(
        project=project, query=export_embeddings_query
    ).after(create_bq_embeddings_task)
    

### Now that the reusable components are set in a pipeline, declare the parameters, queries, logic, etc.

In [23]:
from kfp.v2 import compiler  # noqa: F811

# Compile the pipeline function into the job spec file "intro_pipeline.json".
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path="intro_pipeline.json")

In [24]:
# Display name for this run: "intro_" followed by the session timestamp.
DISPLAY_NAME = f"intro_{TIMESTAMP}"

# Create the job from the compiled spec file and submit it.
job = aip.PipelineJob(
    template_path="intro_pipeline.json",
    display_name=DISPLAY_NAME,
    pipeline_root=PIPELINE_ROOT,
)
job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/733956866731/locations/us-central1/pipelineJobs/bqml-scann-demo-20210923190217
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/733956866731/locations/us-central1/pipelineJobs/bqml-scann-demo-20210923190217')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/bqml-scann-demo-20210923190217?project=733956866731
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/733956866731/locations/us-central1/pipelineJobs/bqml-scann-demo-20210923190217 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/733956866731/locations/us-central1/pipelineJobs/bqml-

In [None]:
def run_bigquery_ddl(project_id: str, query_string: str, 
    location: str):
    """Run a BigQuery statement, wait until it stops running, and return the job.

    Args:
        project_id: Project passed to the BigQuery client.
        query_string: SQL text to submit; it is printed before submission.
        location: BigQuery location passed to the client (e.g. ``'US'``).

    Returns:
        The job object returned by ``Client.query``, so the caller can inspect
        attributes such as ``error_result`` once it has finished.
    """
    from time import sleep

    from google.cloud import bigquery

    print(query_string)

    bqclient = bigquery.Client(project=project_id, location=location)
    # Client.query already applies the library's default retry policy, so no
    # retry object is passed and no private job attribute is touched.
    job = bqclient.query(query_string)

    # Poll once per second; a shorter interval only floods the cell output.
    while job.running():
        print('Running ...')
        sleep(1)

    print("Job complete!")
    return job

In [61]:
# Run the export query directly from the notebook, using the 'US' location.
j = run_bigquery_ddl(
    project_id=PROJECT_ID,
    query_string=export_embeddings_query,
    location='US',
)


EXPORT DATA
OPTIONS (uri='gs://rec_bq_jsw/bqml/item_embeddings/*.csv',
  format='CSV',
  overwrite=true) AS
SELECT * FROM `css_retail.item_embeddings`

Running ...
Job complete!


In [None]:
# Guard against a missing job handle before reading error_result, and report
# failures instead of staying silent.
if j is None:
    print("run_bigquery_ddl returned no job handle; cannot inspect the result.")
elif j.error_result:
    print("Export job failed:", j.error_result)
else:
    print("gj!")

In [None]:
# Show the display name of the PipelineJob created above.
job.display_name