In [24]:
# !pip install pandas-gbq==0.12.0 keras-tuner --user

In [None]:
# # ### In order to get to this step - run the train_k_means_op, get the top n terms by cluster to genrate a labeling datset

# WITH
#   unique_terms AS (
#   SELECT
#     DISTINCT CENTROID_ID,
#     sentences,
#     COUNT(1) AS n
#   FROM
#     `cpg-cdp.trends_pipeline.ETL_futurama_weekly_embed_cluster_100_elc_skincare_v1_22`
#   GROUP BY
#     1,
#     2 )
# SELECT CENTROID_ID, b.* FROM (SELECT
#   CENTROID_ID,
#   ARRAY_AGG(STRUCT(sentences,
#       n)
#   ORDER BY
#     n DESC
#   LIMIT
#     50) AS top_terms,
# FROM
#   unique_terms
# GROUP BY
#   1
# ), unnest(top_terms) as b

In [1]:
import pandas as pd
from google.cloud import bigquery
import time

PROJECT_ID = 'cpg-cdp'
REGION = 'us-central1'

### Download your data after manually labeling
Process: BQ -> sheets (label) -> csv -> upload to notebook

In [2]:
labels = pd.read_csv('P&G Trendspotting Term Classification - PivotData (1).csv')

In [3]:
col_names = ['terms', 'count', 'label']
labels.columns = col_names
labels

Unnamed: 0,terms,count,label
0,curtain bangs,12541.0,Hair Styling
1,haircut,12533.0,Hair Styling
2,mullet haircut,12457.0,Hair Styling
3,cute hairstyles,12444.0,Hair Styling
4,shag haircut,12295.0,Hair Styling
...,...,...,...
1271,,,
1272,,,
1273,,,
1274,,,


In [4]:
### Upload the data to BQ

In [9]:
#fix column names


labels.to_gbq(
destination_table=f'trendspotting.labels_jw_pl_png_hair_22', 
project_id=PROJECT_ID, # TODO: param
location='US', 
progress_bar=True, 
reauth=True, 
if_exists='replace'
) 

1it [00:04,  4.42s/it]


In [11]:
bq_client = bigquery.Client(PROJECT_ID)


In [10]:
%%time

sql = f""" CREATE OR REPLACE TABLE
  trendspotting.labeled_training_jw_pl_png_hair_22 AS (
  SELECT
    * EXCEPT(category_rank,
      split_col)
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms and label IS NOT NULL )
  
"""

bq_client.query(sql).result()

CPU times: user 40.2 ms, sys: 0 ns, total: 40.2 ms
Wall time: 16.7 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f47e3f29dd0>

## Unique data - possibly better model

In [11]:
### BQML for classification

In [13]:
sql = f""" CREATE OR REPLACE TABLE
  trendspotting.labeled_distinct_training_jw_pl_png_hair_22 AS (
  with distinct_data as (
  SELECT DISTINCT
    * EXCEPT(category_rank, series_id, date)
  FROM
    cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22 a
  INNER JOIN
    trendspotting.labels_jw_pl_png_hair_22 b
  ON
    a.sentences = b.terms )
  select distinct *,
  case when rand() > 0.9 then 'VALIDATE' when rand() > 0.8 then 'TEST' else 'TRAIN' end as dataframe
  from distinct_data 
  WHERE label is not null
    )
"""

bq_client.query(sql).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f47e3f3e3d0>

In [14]:
%%bigquery
CREATE OR REPLACE MODEL
  `trendspotting.bqml_distinct_pl_png_hair_22`
OPTIONS
  ( model_type='AUTOML_CLASSIFIER',
    BUDGET_HOURS=1.0,
    input_label_cols=['label']
  ) AS
SELECT
  * EXCEPT(dataframe, count)
FROM
  `trendspotting.labeled_distinct_training_jw_pl_png_hair_22`
WHERE
  dataframe = 'TRAIN'

Executing query with job ID: 43101f4b-27bf-438b-864e-68d0403cfc38
Query executing: 2478.08s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Query executing: 4751.06s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [22]:
%%bigquery
SELECT
  *
FROM
  ML.EVALUATE (MODEL `trendspotting.bqml_distinct_pl_png_hair_22`,
    (
    SELECT
      *
    FROM
      `trendspotting.labeled_distinct_training_jw_pl_png_hair_22`
    WHERE
      dataframe = 'VALIDATE'
    )
  )

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 975.87query/s] 
Downloading: 100%|██████████| 1/1 [00:01<00:00,  1.25s/rows]


Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.999052,0.999685,0.998477,0.999366,0.000694,1.0


## Now predict on the entire dataset

In [23]:
%%bigquery
CREATE OR REPLACE TABLE cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 as (
SELECT
  *
FROM
  ML.PREDICT (MODEL `trendspotting.bqml_distinct_pl_png_hair_22`,
    (
    SELECT
      *,
        sentences as terms
    FROM
      `cpg-cdp.trendspotting.ETL_futurama_weekly_embed_aml_pl_png_hair_22`
     )
  )
)

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1672.81query/s]                        


### Analyze this to see how well it classified terms

### Basic counts

In [24]:
%%bigquery
select 
predicted_label, 
count(1) as count 
from cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 group by 1

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1006.07query/s]                        
Downloading: 100%|██████████| 9/9 [00:01<00:00,  7.76rows/s]


Unnamed: 0,predicted_label,count
0,Hair Straighteners and Relaxers,15760900
1,Scalp/Anti-Dandruff Products,766387
2,Hair Loss Products,20570349
3,Hair Dyes & Coloring,60284082
4,Shampoos & Conditioners,23981229
5,Hair Styling,251579841
6,Lice,2954776
7,Near me,35924485
8,Damaged Hair,6054867


### get the distinct labels for later

In [4]:
%%bigquery labels
select distinct predicted_label from cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22

Query complete after 0.01s: 100%|██████████| 2/2 [00:00<00:00, 1243.49query/s]                        
Downloading: 100%|██████████| 9/9 [00:01<00:00,  7.03rows/s]


In [5]:
list(labels.predicted_label)

['Scalp/Anti-Dandruff Products',
 'Hair Straighteners and Relaxers',
 'Hair Loss Products',
 'Hair Dyes & Coloring',
 'Damaged Hair',
 'Shampoos & Conditioners',
 'Lice',
 'Hair Styling',
 'Near me']

### Examples
Top n terms

In [6]:
%%bigquery
with distinct_counts as (select distinct predicted_label, sentences, count(1) as count from cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 group by 1,2)
select
predicted_label, sentences, count
from distinct_counts where predicted_label = 'Hair Dyes & Coloring' order by count desc limit 20

Query complete after 0.00s: 100%|██████████| 4/4 [00:00<00:00, 1970.31query/s]                        
Downloading: 100%|██████████| 20/20 [00:01<00:00, 15.37rows/s]


Unnamed: 0,predicted_label,sentences,count
0,Hair Dyes & Coloring,hair,12278
1,Hair Dyes & Coloring,hair color ideas,12252
2,Hair Dyes & Coloring,blonde hair,12217
3,Hair Dyes & Coloring,burgundy color,12086
4,Hair Dyes & Coloring,red hair,12048
5,Hair Dyes & Coloring,brown hair with blonde highlights,12046
6,Hair Dyes & Coloring,auburn hair,12019
7,Hair Dyes & Coloring,platinum blonde,12005
8,Hair Dyes & Coloring,strawberry blonde,12004
9,Hair Dyes & Coloring,light brown hair,11992


### Examine the probabilities 

In [27]:
%%bigquery predicted_probs
WITH
  distinct_counts AS (
  SELECT
    DISTINCT predicted_label,
    sentences,
    COUNT(1) AS count
  FROM
    cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22
  GROUP BY
    1,
    2)
SELECT
  a.predicted_label,
  a.sentences,
  count,
  b.predicted_label_probs
FROM
  distinct_counts a
INNER JOIN
  cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22 b
ON
  a.sentences = b.sentences
WHERE
  a.predicted_label = 'Hair Styling'
ORDER BY
  count DESC
LIMIT
  20

Query complete after 0.00s: 100%|██████████| 14/14 [00:00<00:00, 6583.73query/s]                       
Downloading: 100%|██████████| 20/20 [00:01<00:00, 16.88rows/s]


In [28]:
for i in range(5):
    print(predicted_probs.sentences[i], predicted_probs.predicted_label_probs[i])

curtain bangs [{'label': 'Hair Styling', 'prob': 0.9999998807907104}
 {'label': 'Hair Dyes & Coloring', 'prob': 8.527867995100635e-11}
 {'label': 'Near me', 'prob': 2.677499021108698e-11}
 {'label': 'Hair Loss Products', 'prob': 2.2754183953699325e-11}
 {'label': 'Hair Straighteners and Relaxers', 'prob': 1.1360073472188592e-11}
 {'label': 'Shampoos & Conditioners', 'prob': 7.19084782453496e-17}
 {'label': 'Damaged Hair', 'prob': 5.196926222694742e-12}
 {'label': 'Lice', 'prob': 5.057575791001835e-14}
 {'label': 'Scalp/Anti-Dandruff Products', 'prob': 1.8769749209751607e-12}]
curtain bangs [{'label': 'Hair Styling', 'prob': 0.9999998807907104}
 {'label': 'Hair Dyes & Coloring', 'prob': 7.213465935684837e-11}
 {'label': 'Near me', 'prob': 1.9501619069606235e-11}
 {'label': 'Hair Loss Products', 'prob': 2.5467298409020955e-11}
 {'label': 'Hair Straighteners and Relaxers', 'prob': 1.535932839247245e-11}
 {'label': 'Shampoos & Conditioners', 'prob': 6.677604753758468e-17}
 {'label': 'Damag

## Create subcategories for each category by using BQML Kmeans

Starting with this table `classified_terms_bqml_aml_pl_png_hair_22`

In [7]:
labels.predicted_label[0]

'Scalp/Anti-Dandruff Products'

In [8]:
## Automate label pivots - this gets the probabilaties for each class


prob_pivot_sql = ""
for i, l in enumerate(labels.predicted_label):
    prob_pivot_sql += (f"(select max(probs.prob) from UNNEST(t.predicted_label_probs) probs where probs.label = '{l}') as _{i}_prob, ")

In [9]:
label = labels.predicted_label[2]

table_sql_for_clustering = f"""
SELECT * except(predicted_label_probs),
{prob_pivot_sql}
FROM `cpg-cdp.trendspotting.classified_terms_bqml_aml_pl_png_hair_22` t 
"""

In [12]:
%%time
kmeans_table_sql = f"""
create or replace table trendspotting.cat_clus_train_png_hair_22 as (
select distinct * EXCEPT(date, geo_id, series_id, terms, category_rank, split_col) from (
    {table_sql_for_clustering})
    )
    """
bq_client.query(kmeans_table_sql).result()

CPU times: user 40.5 ms, sys: 4.69 ms, total: 45.2 ms
Wall time: 38.9 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f5ff447f710>

In [13]:
## use this function to get the name of the topic in the clustering
def only_upper(s: str):
    upper_chars = ""
    for char in s:
        if char.isupper():
            upper_chars += char
    return upper_chars

label_upper = only_upper(label)

In [14]:
## we use this to find where the DB index flattens for n_clusters and use that for optimal number of clusters per topic

def loop_n_clus_and_get_db_index(cluster_min: int, cluster_max: int, label: str):
    
    label_upper = only_upper(label) #get only the upper case letters to denote the model name
    return_data = {label: []}
    for n_clusters in range(cluster_min, cluster_max+1):
        print(f"Training for {n_clusters} clusters")
        # return_data[label].append({'model_name': f'trendspotting.cat_clus_{label_upper}_{n_clusters}_png_hair_22'})
        kmeans_sql = f"""
        CREATE OR REPLACE MODEL trendspotting.cat_clus_{label_upper}_{n_clusters}_png_hair_22
        OPTIONS(model_type='kmeans', num_clusters={n_clusters}, standardize_features = true) AS
        select * EXCEPT(predicted_label, sentences) from trendspotting.cat_clus_train_png_hair_22 
        WHERE predicted_label = '{label}'
        """
        bq_client.query(kmeans_sql).result()
        #next, get the DB index to assess the cluster quality
        sql = f"""
        SELECT
          *
        FROM
          ML.EVALUATE (MODEL trendspotting.cat_clus_{label_upper}_{n_clusters}_png_hair_22)
          """
        data = bq_client.query(sql).to_dataframe()
        print(f"DB Index: {data.davies_bouldin_index[0]}")
        return_data[label].append({f'trendspotting.cat_clus_{label_upper}_{n_clusters}_png_hair_22': data.davies_bouldin_index[0]})

        time.sleep(60)
        
    return(return_data)

## Now run training over the loop, running 2 - 8 clusters for now

In [None]:
%%time

MIN_CLUS, MAX_CLUS = 2, 10

data_dict = {}
for label in labels.predicted_label:
    print(f"Tranining for label: {label}")
    cluster_data = loop_n_clus_and_get_db_index(MIN_CLUS, MAX_CLUS, label)
    data_dict.update(cluster_data) #update with the results
    time.sleep(60)
    

Tranining for label: Scalp/Anti-Dandruff Products
Training for 2 clusters
DB Index: 2.0578861778655475
Training for 3 clusters
DB Index: 1.9488913347883503
Training for 4 clusters
DB Index: 2.227226202805161
Training for 5 clusters
DB Index: 2.046736520725745
Training for 6 clusters
DB Index: 1.921297893157773
Training for 7 clusters
DB Index: 1.8793843303105808
Training for 8 clusters
DB Index: 1.8544206346006729
Training for 9 clusters
DB Index: 1.8492236557752448
Training for 10 clusters
DB Index: 1.976174263194094
Tranining for label: Hair Straighteners and Relaxers
Training for 2 clusters
DB Index: 2.430281557288552
Training for 3 clusters
DB Index: 3.0065136672546235
Training for 4 clusters
DB Index: 2.49605379520802
Training for 5 clusters
DB Index: 2.4249209372597305
Training for 6 clusters
DB Index: 2.516017956793506
Training for 7 clusters
DB Index: 2.5831602027223517
Training for 8 clusters
DB Index: 2.367515588140149


'trendspotting.cat_clus_HS_2_png_hair_22'

In [80]:
del optimal_models_by_label, optimal_model

In [82]:
# find the min DB score cluster for each topic, delete the other models and then score based on topic id

optimal_models_by_label = {}
for label in labels.predicted_label:
    prior_db=999 # set this high
    for c in data_dict[label]:
        if list(c.values())[0] < prior_db:
            prior_db = list(c.values())[0]
            optimal_model = list(c.keys())[0]         
    print(optimal_model)
    optimal_models_by_label.update({label: optimal_model})
optimal_models_by_label

trendspotting.cat_clus_HSR_6_png_hair_22
trendspotting.cat_clus_N_5_png_hair_22
trendspotting.cat_clus_SADP_3_png_hair_22
trendspotting.cat_clus_HDC_2_png_hair_22
trendspotting.cat_clus_DH_8_png_hair_22
trendspotting.cat_clus_HS_2_png_hair_22
trendspotting.cat_clus_HLP_4_png_hair_22
trendspotting.cat_clus_L_2_png_hair_22
trendspotting.cat_clus_SC_6_png_hair_22


{'Hair Straighteners and Relaxers': 'trendspotting.cat_clus_HSR_6_png_hair_22',
 'Near me': 'trendspotting.cat_clus_N_5_png_hair_22',
 'Scalp/Anti-Dandruff Products': 'trendspotting.cat_clus_SADP_3_png_hair_22',
 'Hair Dyes & Coloring': 'trendspotting.cat_clus_HDC_2_png_hair_22',
 'Damaged Hair': 'trendspotting.cat_clus_DH_8_png_hair_22',
 'Hair Styling': 'trendspotting.cat_clus_HS_2_png_hair_22',
 'Hair Loss Products': 'trendspotting.cat_clus_HLP_4_png_hair_22',
 'Lice': 'trendspotting.cat_clus_L_2_png_hair_22',
 'Shampoos & Conditioners': 'trendspotting.cat_clus_SC_6_png_hair_22'}

In [37]:
#delete the sub-optimal models
def delete_model_sql(model_name):
    return f"DROP MODEL IF EXISTS {model_name}"

for label in labels.predicted_label:
    optimal_model_for_label = optimal_models_by_label[label]
    for c in data_dict[label]:
        if list(c.keys())[0] != optimal_model_for_label:
            sub_optimal_model = list(c.keys())[0]
            bq_client.query(delete_model_sql(sub_optimal_model)).result() #clean up the models

In [38]:
optimal_models_by_label

{'Hair Styling': 'trendspotting.cat_clus_HS_2_png_hair_22',
 'Hair Loss Products': 'trendspotting.cat_clus_HLP_4_png_hair_22',
 'Hair Dyes & Coloring': 'trendspotting.cat_clus_HDC_2_png_hair_22',
 'Damaged Hair': 'trendspotting.cat_clus_DH_8_png_hair_22',
 'Lice': 'trendspotting.cat_clus_L_2_png_hair_22',
 'Shampoos & Conditioners': 'trendspotting.cat_clus_SC_6_png_hair_22',
 'Hair Straighteners and Relaxers': 'trendspotting.cat_clus_HSR_6_png_hair_22',
 'Scalp/Anti-Dandruff Products': 'trendspotting.cat_clus_SADP_3_png_hair_22',
 'Near me': 'trendspotting.cat_clus_N_5_png_hair_22'}

In [63]:
# save optimal model dictionary to gcs
from google.cloud import storage

import pickle

with open('./optimal_models.dict', 'wb') as file:
    pickle.dump(optimal_models_by_label, file)

bucket_name = 'trendspotting-pipeline'
    
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob('optimal_models.dict')

blob.upload_from_filename('optimal_models.dict')

In [65]:
#last, score using a union query for each label

def score_cluster(label, model_name):
    predict_sql = f"""
            SELECT
              *
            FROM
              ML.PREDICT (MODEL {model_name},
              (SELECT * EXCEPT(predicted_label, sentences), 
              sentences as terms, 
              predicted_label as category 
              from trendspotting.cat_clus_train_png_hair_22 
              where predicted_label = '{label}'))
              """
    return(predict_sql)

predict_sql = ""
for i, label in enumerate(labels.predicted_label):
    predict_sql += score_cluster(label, optimal_models_by_label[label])
    if len(labels.predicted_label)-1 == i:
        break
    else:
        predict_sql += """
        UNION ALL
        """
    
def score_table(predict_sql, target_table="trendspotting.full_cat_clus_png_hair_22"):
    return(f"CREATE OR REPLACE TABLE {target_table} AS ({predict_sql})")

segment_score_sql = score_table(predict_sql)

bq_client.query(segment_score_sql).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f0bf6ec69d0>

### Pipeline version - to be ported to main section

In [20]:
# run pipeline version 

from datetime import datetime
import json
import os
import time
from typing import Any, Callable, Dict, NamedTuple, Optional
from IPython.display import clear_output

import google.cloud.aiplatform as aiplatform
from google.cloud import bigquery
from google.cloud import storage

import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client

from matplotlib import dates as mdates
from matplotlib import pyplot as plt

import pandas as pd
import seaborn as sns

from IPython.display import Image
from IPython.core.display import HTML 

from typing import Dict, List, Optional, Sequence, Tuple, Union
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Input, Model
from kfp.v2.components.types.type_utils import artifact_types
from src.components import components


In [19]:
del components

In [21]:
PIPELINES = {}

PIPELINES_FILEPATH = 'gs://trendspotting-pipeline' # <--- TODO: CHANGE THIS; can be blank json file

if os.path.isfile(PIPELINES_FILEPATH):
    with open(PIPELINES_FILEPATH) as f:
        PIPELINES = json.load(f)
else:
    PIPELINES = {}

def save_pipelines():
    with open(PIPELINES_FILEPATH, 'w') as f:
        json.dump(PIPELINES, f)
        
VERSION = 'v1'
SUFFIX = "png_hair_22"
PIPELINE_TAG = f'{SUFFIX}-autocluster-pipeline-{VERSION}' # <--- TODO; optionally name pipeline
@kfp.v2.dsl.pipeline(
  name=f'{VERSION}-{PIPELINE_TAG}'.replace('_', '-'),
        pipeline_root=PIPELINES_FILEPATH,

)
def pipeline(project_id: str,
            labels: list):
    auto_cluster = components.auto_cluster(2, 8, labels)

In [22]:
kfp.v2.compiler.Compiler().compile(
  pipeline_func=pipeline, 
  package_path='auto-cluster.json',
)

In [23]:
PIPELINE_PARAMETERS = { 'project_id': PROJECT_ID,
                       'labels': list(labels.predicted_label)
}

job = aiplatform.PipelineJob(display_name = f'trendspotting_auto_cluster',
                             template_path = 'auto-cluster.json',
                             pipeline_root = PIPELINES_FILEPATH,
                             parameter_values = PIPELINE_PARAMETERS,
                             project = PROJECT_ID,
                             location = REGION,
                              enable_caching=True)

job.submit()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/939655404703/locations/us-central1/pipelineJobs/v1-png-hair-22-autocluster-pipeline-v1-20220902214314
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/939655404703/locations/us-central1/pipelineJobs/v1-png-hair-22-autocluster-pipeline-v1-20220902214314')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/v1-png-hair-22-autocluster-pipeline-v1-20220902214314?project=939655404703
