In [24]:
# !pip install pandas-gbq==0.12.0 keras-tuner --user

In [1]:
import pandas as pd
from google.cloud import bigquery
import time

PROJECT_ID = 'cpg-cdp'
REGION = 'us-central1'

### Download your data after manually labeling
Process: BQ -> sheets (label) -> csv -> upload to notebook

In [2]:
labels = pd.read_csv('P&G Trendspotting Term Classification - Export2BQ.csv')

In [3]:
labels

Unnamed: 0,CENTROID_ID,top_terms.sentences,top_terms.n,JW Categorization
0,20,curtain bangs,12541,Hair Styling
1,20,haircut,12533,Hair Styling
2,20,mullet haircut,12457,Hair Styling
3,20,cute hairstyles,12444,Hair Styling
4,20,shag haircut,12295,Hair Styling
...,...,...,...,...
703,2,light brown hair with highlights,9625,Hair Dyes & Coloring
704,2,short blonde hair,9591,Hair Styling
705,2,gray color,9501,Hair Dyes & Coloring
706,2,wedding hair styles,9431,Hair Styling


In [4]:
### Upload the data to BQ

In [5]:
#fix column names

col_names = ['CENTROID_ID', 'terms', 'count', 'label']
labels.columns = col_names
labels.to_gbq(
destination_table=f'trendspotting.labels_jw_pl_png_hair_22', 
project_id=PROJECT_ID, # TODO: param
location='US', 
progress_bar=True, 
reauth=True, 
if_exists='append'
) 

1it [00:04,  4.21s/it]


In [6]:
%%time
bq_client = bigquery.Client(PROJECT_ID)

sql = f""" CREATE OR REPLACE TABLE
  trendspotting.labeled_training_jw_pl_png_hair_22 AS (
  SELECT
    * EXCEPT(category_rank,
      split_col)
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms )
"""

bq_client.query(sql).result()

CPU times: user 28.2 ms, sys: 6.15 ms, total: 34.3 ms
Wall time: 15.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fda71358e90>

### Create a managed dataset for an autoML model to classify the new categories

In [49]:
from typing import List, Union

from google.cloud import aiplatform


dataset_display_name = "classification_jw_pl_png_hair_22"
path = "bq://cpg-cdp.trendspotting.labeled_training_jw_pl_png_hair_22"

aiplatform.init(project=PROJECT_ID, location=REGION)

ds = aiplatform.TabularDataset.create(
    display_name=dataset_display_name,
    bq_source=path,
    sync=True,
)

ds.wait()

print(ds.display_name)
print(ds.resource_name)

INFO:google.cloud.aiplatform.datasets.dataset:Creating TabularDataset
INFO:google.cloud.aiplatform.datasets.dataset:Create TabularDataset backing LRO: projects/939655404703/locations/us-central1/datasets/3949834994087624704/operations/531240381673635840
INFO:google.cloud.aiplatform.datasets.dataset:TabularDataset created. Resource name: projects/939655404703/locations/us-central1/datasets/3949834994087624704
INFO:google.cloud.aiplatform.datasets.dataset:To use this TabularDataset in another session:
INFO:google.cloud.aiplatform.datasets.dataset:ds = aiplatform.TabularDataset('projects/939655404703/locations/us-central1/datasets/3949834994087624704')
classification_jw_pl_png_hair_22
projects/939655404703/locations/us-central1/datasets/3949834994087624704


#### Train the model

Below we get the column specs - manually created or you can copy/paste when manually creating an automl job from the dataset

In [55]:
from src.colspecs import col_spec
col_spec

[{'auto': {'columnName': 'date'}},
 {'auto': {'columnName': 'geo_id'}},
 {'auto': {'columnName': 'sentences'}},
 {'auto': {'columnName': 'series_id'}},
 {'auto': {'columnName': 'emb1'}},
 {'auto': {'columnName': 'emb2'}},
 {'auto': {'columnName': 'emb3'}},
 {'auto': {'columnName': 'emb4'}},
 {'auto': {'columnName': 'emb5'}},
 {'auto': {'columnName': 'emb6'}},
 {'auto': {'columnName': 'emb7'}},
 {'auto': {'columnName': 'emb8'}},
 {'auto': {'columnName': 'emb9'}},
 {'auto': {'columnName': 'emb10'}},
 {'auto': {'columnName': 'emb11'}},
 {'auto': {'columnName': 'emb12'}},
 {'auto': {'columnName': 'emb13'}},
 {'auto': {'columnName': 'emb14'}},
 {'auto': {'columnName': 'emb15'}},
 {'auto': {'columnName': 'emb16'}},
 {'auto': {'columnName': 'emb17'}},
 {'auto': {'columnName': 'emb18'}},
 {'auto': {'columnName': 'emb19'}},
 {'auto': {'columnName': 'emb20'}},
 {'auto': {'columnName': 'CENTROID_ID'}},
 {'auto': {'columnName': 'terms'}}]

In [None]:
model_display_name = 'class_jw_pl_png_hair_22_v2'
train_budget_milli_node_hours = 1000



tabular_classification_job = aiplatform.AutoMLTabularTrainingJob(
        display_name=model_display_name, optimization_prediction_type="classification",
        column_transformations=col_spec
    )


model = tabular_classification_job.run(
        dataset=ds,
        target_column='label',
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1,
        budget_milli_node_hours=train_budget_milli_node_hours,
        model_display_name=model_display_name,
        disable_early_stopping=False,
        sync=True,
    )

model.wait()

print(model.display_name)
print(model.resource_name)
print(model.uri)

### Create a batch prediction job to run against the `cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_{SUFFIX}` table

In [None]:
# The AI Platform services require regional API endpoints.
client_options = {"api_endpoint": "us-central1-aiplatform.googleapis.com"}
# Initialize client that will be used to create and send requests.
# This client only needs to be created once, and can be reused for multiple requests.
client_v1beta = aiplatform_v1beta1.JobServiceClient(client_options=client_options)
model_parameters_dict = {}
model_parameters = json_format.ParseDict(model_parameters_dict, Value())

batch_prediction_job = {
    "display_name": display_name,
    # Format: 'projects/{project}/locations/{location}/models/{model_id}'
    "model": model_name,
    "model_parameters": model_parameters,
    "input_config": {
        "instances_format": instances_format,
        "bigquery_source": {"input_uri": bigquery_source_input_uri},
    },
    "output_config": {
        "predictions_format": predictions_format,
        "bigquery_destination": {"output_uri": bigquery_destination_output_uri},
    },
    # optional
    "generate_explanation": True,
}
parent = f"projects/{project}/locations/{location}"
response = client.create_batch_prediction_job(
    parent=parent, batch_prediction_job=batch_prediction_job
)
print("response:", response)

## Unique data - possibly better model

In [6]:
%%time
bq_client = bigquery.Client(PROJECT_ID)

sql = f""" CREATE OR REPLACE TABLE
  trendspotting.labeled_distinct_training_jw_pl_png_hair_22 AS (
  SELECT DISTINCT
    * EXCEPT(category_rank,
      split_col, geo_id, date, count, CENTROID_ID)
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms )
"""

bq_client.query(sql).result()

CPU times: user 31 ms, sys: 0 ns, total: 31 ms
Wall time: 9.42 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f5a6821bc90>

In [4]:
### BQML for classification

In [9]:
sql = f""" CREATE OR REPLACE TABLE
  trendspotting.labeled_distinct_training_jw_pl_png_hair_22 AS (
  with distinct_data as (
  SELECT DISTINCT
    * EXCEPT(category_rank,
      split_col, geo_id, date, count, CENTROID_ID),
      rand() as rand_digit
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms )
  select * EXCEPT(rand_digit),
  case when rand_digit > 0.9 then 'test' when rand_digit > 0.8 then 'valid' else 'train' end as dataframe
  from distinct_data
    )
"""

bq_client.query(sql).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f5a68214850>

In [11]:
%%bigquery
CREATE OR REPLACE MODEL
  `trendspotting.bqml_distinct_pl_png_hair_22`
OPTIONS
  ( model_type='LOGISTIC_REG',
    auto_class_weights=TRUE,
    input_label_cols=['label']
  ) AS
SELECT
  * EXCEPT(dataframe)
FROM
  `trendspotting.labeled_distinct_training_jw_pl_png_hair_22`
WHERE
  dataframe = 'train'

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 2134.51query/s]                        


In [15]:
%%bigquery
SELECT
  *
FROM
  ML.EVALUATE (MODEL `trendspotting.bqml_distinct_pl_png_hair_22`,
    (
    SELECT
      *
    FROM
      `trendspotting.labeled_distinct_training_jw_pl_png_hair_22`
    WHERE
      dataframe = 'test'
    )
  )

Query complete after 0.00s: 100%|██████████| 8/8 [00:00<00:00, 5260.96query/s]                        
Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.12s/rows]


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,1.0,1.0,1.0,1.0,7.694364e-07,1.0


## Now predict on the entire dataset

In [21]:
%%bigquery
CREATE OR REPLACE TABLE cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 as (
SELECT
  *
FROM
  ML.PREDICT (MODEL `trendspotting.bqml_distinct_pl_png_hair_22`,
    (
    SELECT
      *,
        sentences as terms
    FROM
      `cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22`
     )
  )
)

Query complete after 0.00s: 100%|██████████| 30/30 [00:00<00:00, 13854.78query/s]                      


### Analysis this to see how well it classified terms

### Basic counts

In [24]:
%%bigquery
select 
predicted_label, 
count(1) as count 
from cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 group by 1

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1206.13query/s]                        
Downloading: 100%|██████████| 7/7 [00:01<00:00,  5.71rows/s]


Unnamed: 0,predicted_label,count
0,Hair Styling,295783310
1,Hair Straighteners and Relaxers,1783415
2,Scalp/Anti-Dandruff Products,891217
3,Hair Dyes & Coloring,99345003
4,Damaged Hair,141862
5,Shampoos & Conditioners,4544975
6,Hair Loss Products,15387134


### Examples
Top n terms

In [37]:
%%bigquery
with distinct_counts as (select distinct predicted_label, sentences, count(1) as count from cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 group by 1,2)
select
predicted_label, sentences, count
from distinct_counts where predicted_label = 'Hair Styling' order by count desc limit 20

Query complete after 0.00s: 100%|██████████| 5/5 [00:00<00:00, 2734.23query/s]                        
Downloading: 100%|██████████| 20/20 [00:01<00:00, 18.71rows/s]


Unnamed: 0,predicted_label,sentences,count
0,Hair Styling,hair salons near me,12361
1,Hair Styling,curtain bangs,12331
2,Hair Styling,haircut near me,12331
3,Hair Styling,mullet,12328
4,Hair Styling,mens haircuts,12325
5,Hair Styling,haircut,12324
6,Hair Styling,barber shops near me,12319
7,Hair Styling,barbershop near me,12319
8,Hair Styling,sallys,12317
9,Hair Styling,short haircuts for women,12312


### Examine the probabilities 

In [39]:
%%bigquery predicted_probs
WITH
  distinct_counts AS (
  SELECT
    DISTINCT predicted_label,
    sentences,
    COUNT(1) AS count
  FROM
    cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22
  GROUP BY
    1,
    2)
SELECT
  a.predicted_label,
  a.sentences,
  count,
  b.predicted_label_probs
FROM
  distinct_counts a
INNER JOIN
  cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 b
ON
  a.sentences = b.sentences
WHERE
  a.predicted_label = 'Hair Styling'
ORDER BY
  count DESC
LIMIT
  20

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 927.12query/s] 
Downloading: 100%|██████████| 20/20 [00:01<00:00, 19.66rows/s]


In [46]:
for i in range(5):
    print(predicted_probs.sentences[i], predicted_probs.predicted_label_probs[i])

hair salons near me [{'label': 'Hair Styling', 'prob': 0.6659966652121528}
 {'label': 'Scalp/Anti-Dandruff Products', 'prob': 0.06550108975260646}
 {'label': 'Damaged Hair', 'prob': 0.06313380901178574}
 {'label': 'Hair Dyes & Coloring', 'prob': 0.05953301587697524}
 {'label': 'Hair Straighteners and Relaxers', 'prob': 0.05402233680822926}
 {'label': 'Shampoos & Conditioners', 'prob': 0.048485963395612244}
 {'label': 'Hair Loss Products', 'prob': 0.043327119942638236}]
hair salons near me [{'label': 'Hair Styling', 'prob': 0.6659966652121528}
 {'label': 'Scalp/Anti-Dandruff Products', 'prob': 0.06550108975260646}
 {'label': 'Damaged Hair', 'prob': 0.06313380901178574}
 {'label': 'Hair Dyes & Coloring', 'prob': 0.05953301587697524}
 {'label': 'Hair Straighteners and Relaxers', 'prob': 0.05402233680822926}
 {'label': 'Shampoos & Conditioners', 'prob': 0.048485963395612244}
 {'label': 'Hair Loss Products', 'prob': 0.043327119942638236}]
hair salons near me [{'label': 'Hair Styling', 'prob