In [2]:
# Google Cloud project handed to the BigQuery client and to aiplatform.init.
PROJECT_ID = "cpg-cdp"
# Location handed to aiplatform.init.
REGION = "us-central1"

In [6]:
%%time
# BigQuery client bound to the notebook's project.
bq_client = bigquery.Client(PROJECT_ID)

# Inner-join the weekly embedding table with the label table on the term text
# (a.sentences = b.terms) and materialise the result as the labeled training
# table, leaving out the category_rank and split_col columns.
# Plain string literal: the statement contains no interpolated values.
sql = """ CREATE OR REPLACE TABLE
  trendspotting.labeled_training_jw_pl_png_hair_22 AS (
  SELECT
    * EXCEPT(category_rank,
      split_col)
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms )
"""

# Submit the statement, then block until BigQuery reports completion.
create_table_job = bq_client.query(sql)
create_table_job.result()

CPU times: user 28.2 ms, sys: 6.15 ms, total: 34.3 ms
Wall time: 15.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fda71358e90>

### Create a managed dataset for an AutoML model to classify the new categories

In [4]:
from typing import List, Union

from google.cloud import aiplatform


# Point the Vertex AI SDK at the project and region configured for this notebook.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Name under which the managed dataset is registered.
dataset_display_name = "classification_jw_pl_png_hair_22"
# BigQuery table backing the dataset.
# NOTE(review): this is a `labeled_distinct_training_...` table, whereas the
# CREATE OR REPLACE TABLE statement in this notebook writes
# `labeled_training_...` -- confirm the de-duplicated table is produced elsewhere.
path = "bq://cpg-cdp.trendspotting.labeled_distinct_training_jw_pl_png_hair_22"

# Create the tabular dataset synchronously from the BigQuery source.
ds = aiplatform.TabularDataset.create(
    bq_source=path,
    display_name=dataset_display_name,
    sync=True,
)
ds.wait()

# Report the identifiers of the dataset that was just created.
for attribute in ("display_name", "resource_name"):
    print(getattr(ds, attribute))

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/939655404703/locations/us-central1/datasets/8568206043211890688/operations/4500666473944973312
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/939655404703/locations/us-central1/datasets/8568206043211890688
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/939655404703/locations/us-central1/datasets/8568206043211890688')
classification_jw_pl_png_hair_22
projects/939655404703/locations/us-central1/datasets/8568206043211890688


#### Train the model

The column specs below can be written by hand, or copied and pasted from the job definition when manually creating an AutoML job from the dataset.

In [7]:
# Columns handed to AutoML: the raw text, the 20 embedding dimensions
# (emb1 .. emb20), and the matched term.
feature_columns = ['sentences', *(f'emb{i}' for i in range(1, 21)), 'terms']

# Each column gets an 'auto' transformation entry keyed by its column name.
col_spec = [{'auto': {'columnName': column}} for column in feature_columns]

In [None]:
# Display name shared by the training job and the model it produces.
model_display_name = 'class_jw_pl_png_hair_22_v2'
# Training budget, expressed in milli node hours.
train_budget_milli_node_hours = 1000

# AutoML tabular classification job using the column transformations in col_spec.
tabular_classification_job = aiplatform.AutoMLTabularTrainingJob(
    display_name=model_display_name,
    optimization_prediction_type="classification",
    column_transformations=col_spec,
)

# Arguments for the training run: predict the 'label' column, split rows by the
# values in the 'dataframe' column, and leave early stopping enabled.
run_settings = dict(
    dataset=ds,
    target_column='label',
    predefined_split_column_name='dataframe',
    budget_milli_node_hours=train_budget_milli_node_hours,
    model_display_name=model_display_name,
    disable_early_stopping=False,
    sync=True,
)

model = tabular_classification_job.run(**run_settings)
model.wait()

# Report the identifiers of the trained model.
for attribute in ("display_name", "resource_name", "uri"):
    print(getattr(model, attribute))

  column_specs, column_transformations


### Create a batch prediction job to run against the `cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_{SUFFIX}` table

In [None]:
# Submit a Vertex AI batch prediction job that reads instances from BigQuery
# and writes predictions (with explanations) back to BigQuery.
#
# Imports are kept local to this cell so it does not rely on names brought in
# by another cell.
from google.cloud import aiplatform_v1beta1
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Value

# --- Job parameters ----------------------------------------------------------
# Label for the job. NOTE(review): placeholder name -- adjust as needed.
display_name = "batch_predict_jw_pl_png_hair_22"
# Format: 'projects/{project}/locations/{location}/models/{model_id}'
# Uses the model returned by the training cell.
model_name = model.resource_name
# Both the instances and the predictions live in BigQuery.
instances_format = "bigquery"
predictions_format = "bigquery"
# Table to score: the weekly embedding table named in this section's heading.
bigquery_source_input_uri = (
    "bq://cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22"
)
# NOTE(review): project-level destination, so the service picks the output
# dataset/table -- confirm, or point this at a specific dataset or table.
bigquery_destination_output_uri = f"bq://{PROJECT_ID}"

# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client_v1beta = aiplatform_v1beta1.JobServiceClient(client_options=client_options)

# No model-specific parameters are passed; an empty dict is converted to the
# protobuf Value the request expects.
model_parameters_dict = {}
model_parameters = json_format.ParseDict(model_parameters_dict, Value())

batch_prediction_job = {
    "display_name": display_name,
    "model": model_name,
    "model_parameters": model_parameters,
    "input_config": {
        "instances_format": instances_format,
        "bigquery_source": {"input_uri": bigquery_source_input_uri},
    },
    "output_config": {
        "predictions_format": predictions_format,
        "bigquery_destination": {"output_uri": bigquery_destination_output_uri},
    },
    # optional
    "generate_explanation": True,
}

# Use the notebook-level project/region constants for the parent resource, and
# send the request through the client created above (the original referenced
# an undefined `client`, `project` and `location`).
parent = f"projects/{PROJECT_ID}/locations/{REGION}"
response = client_v1beta.create_batch_prediction_job(
    parent=parent, batch_prediction_job=batch_prediction_job
)
print("response:", response)