# Trendspotting POC

The goal of this notebook is to:
* Load signals data into a managed vertex dataset for time series forecasting
* Create a forecast prediction model for each term, geo and category combination
* Project the forecasts on a holdout set of data to assess performance and trends
* Clean up results of test predictions
* Cluster test predictions
* Create dashboard for backtesting

[Source Control Link](https://source.cloud.google.com/cpg-cdp/trendspotting/+/master:pipeline_train.ipynb)

When run, the pipeline will look something like this:

![pipeline example](img/pipeline_example.png)

Todo: Integration of ingredients, flagging of trends

## Install packages, create bucket (only run once)

In [158]:
# # New
# ! pip3 install -U google-cloud-storage --user
# # ! pip3 install $USER kfp google-cloud-pipeline-components --upgrade
# !git clone https://github.com/kubeflow/pipelines.git
# !pip install pipelines/components/google-cloud/.
# !pip install google-cloud-aiplatform

In [116]:
# ! gsutil mb -l us-central1 gs://trendspotting-pipeline

### Import libs and types for KFP pipeline

In [159]:
from datetime import datetime
import json
import os
import time
from typing import Any, Callable, Dict, NamedTuple, Optional
from IPython.display import clear_output

from google import auth
from google.api_core import exceptions as google_exceptions
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.experimental import forecasting as gcc_aip_forecasting
import google.cloud.aiplatform
from google.cloud import bigquery
from google.cloud import storage

import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client

from matplotlib import dates as mdates
from matplotlib import pyplot as plt

import pandas as pd
import seaborn as sns

from IPython.display import Image
from IPython.core.display import HTML 

from typing import Dict, List, Optional, Sequence, Tuple, Union
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Input, Model

In [160]:
# GCP project id and region; both are passed to the pipeline and to the
# PipelineJob further down in this notebook.
PROJECT_ID = 'cpg-cdp'
LOCATION = 'us-central1'

In [161]:
# Location that the pipeline definition and the job submission below both use
# as their pipeline root.
PIPELINES_FILEPATH = 'gs://trendspotting-pipeline' # <--- TODO: CHANGE THIS; can be blank json file

# Start with an empty registry and replace it only when a JSON file is found at
# PIPELINES_FILEPATH on the local filesystem.
# NOTE(review): os.path.isfile() and open() only understand local paths, so with
# a gs:// URI the load branch looks unreachable - confirm the registry is still needed.
PIPELINES = {}
if os.path.isfile(PIPELINES_FILEPATH):
    with open(PIPELINES_FILEPATH) as registry_file:
        PIPELINES = json.load(registry_file)


def save_pipelines():
    """Write the PIPELINES registry as JSON to PIPELINES_FILEPATH."""
    with open(PIPELINES_FILEPATH, 'w') as registry_file:
        json.dump(PIPELINES, registry_file)

### KFP Custom Component - training data query

Details: From `futurama_weekly` pull data between 7/20 - 12/21 (100 gb limit for automl tables). Automatically set testing and validation as follows:
    
* Train: 2/20-4/21
* Validate: 5/21-6/21
* Test: 6/21-12/21
    
Also set `series_id` to be a concat of the term and the geo: `concat(term, geo_id) as series_id`

In [182]:
@kfp.v2.dsl.component(
  base_image='python:3.9',
  packages_to_install=['google-cloud-bigquery==2.18.0'],
)
def create_prediction_dataset_term_level(
      target_table: str,
      source_table_uri: str,
      train_st: str,
      train_end: str,
      valid_st: str,
      valid_end: str,
      subcat_id: int = 10047,
      override: str = 'False',
      project_id: str = 'cpg-cdp'
    ) -> NamedTuple('Outputs', [('training_data_table_uri', str)]):
    """Build the embedded, split-labelled term table in BigQuery.

    Runs ML.PREDICT with the `trendspotting.swivel_text_embed` model over the
    rows of `source_table_uri` that match `subcat_id`, packs the first 20
    elements of `output_0` into a struct column named `embed`, and labels each
    row TRAIN / VALIDATE / TEST in `split_col` based on its `date`. The result
    is written with CREATE OR REPLACE TABLE, so any existing `target_table` is
    overwritten.

    Args:
        target_table: Table id to create or replace (interpolated inside
            backticks, so no `bq://` prefix).
        source_table_uri: Table id to read the weekly rows from.
        train_st: First date labelled TRAIN (inclusive).
        train_end: Last date labelled TRAIN (inclusive).
        valid_st: First date labelled VALIDATE (inclusive).
        valid_end: Last date labelled VALIDATE (inclusive).
        subcat_id: Value matched against `subcategory_id` in the source table.
        override: Accepted for interface compatibility; not used by the body.
        project_id: Project the BigQuery client runs the query in.

    Returns:
        A one-element tuple holding `target_table`.
    """
    from google.cloud import bigquery

    # The previous `override = bool(override)` was dropped: bool() of any
    # non-empty string (including 'False') is True, and the value was never read.
    bq_client = bigquery.Client(project=project_id)

    # Table ids, dates and the subcategory are interpolated directly into the
    # script, so they must come from trusted pipeline parameters.
    embed_sql = f"""CREATE TEMPORARY FUNCTION arr_to_input_20(arr ARRAY<FLOAT64>)
        RETURNS 
        STRUCT<p1 FLOAT64, p2 FLOAT64, p3 FLOAT64, p4 FLOAT64,
               p5 FLOAT64, p6 FLOAT64, p7 FLOAT64, p8 FLOAT64, 
               p9 FLOAT64, p10 FLOAT64, p11 FLOAT64, p12 FLOAT64, 
               p13 FLOAT64, p14 FLOAT64, p15 FLOAT64, p16 FLOAT64,
               p17 FLOAT64, p18 FLOAT64, p19 FLOAT64, p20 FLOAT64>
        AS (
        STRUCT(
            arr[OFFSET(0)]
            , arr[OFFSET(1)]
            , arr[OFFSET(2)]
            , arr[OFFSET(3)]
            , arr[OFFSET(4)]
            , arr[OFFSET(5)]
            , arr[OFFSET(6)]
            , arr[OFFSET(7)]
            , arr[OFFSET(8)]
            , arr[OFFSET(9)]
            , arr[OFFSET(10)]
            , arr[OFFSET(11)]
            , arr[OFFSET(12)]
            , arr[OFFSET(13)]
            , arr[OFFSET(14)]
            , arr[OFFSET(15)]
            , arr[OFFSET(16)]
            , arr[OFFSET(17)]
            , arr[OFFSET(18)]
            , arr[OFFSET(19)]    
        ));


        CREATE OR REPLACE TABLE `{target_table}` as (
            SELECT * except(output_0), case when date between "{train_st}" and "{train_end}" then 'TRAIN'
                  when date between "{valid_st}" and "{valid_end}" then 'VALIDATE'
                 else 'TEST' end as split_col,
            arr_to_input_20(output_0) as embed
        FROM ML.PREDICT(MODEL trendspotting.swivel_text_embed,(
          SELECT date, geo_id, term AS sentences, category_rank, concat( term, geo_id) as series_id
          FROM `{source_table_uri}` where subcategory_id = {subcat_id}
        ))
        )
          """

    # Block until the script finishes so downstream steps see the new table.
    embed_job = bq_client.query(embed_sql)
    embed_job.result()

    return (target_table,)

### Preparing data for k-means clustering

In [208]:
@kfp.v2.dsl.component(
  base_image='python:3.9',
  packages_to_install=['google-cloud-bigquery==2.18.0'],
)
def prep_forecast_term_level(
    source_table: str,
    target_table: str,
    override: str = 'False',
    project_id: str = 'cpg-cdp'
    ) -> NamedTuple('Outputs', [('term_train_table', str)]):
    """Flatten the `embed` struct of `source_table` into 20 scalar columns.

    Creates or replaces `target_table` with every column of `source_table`
    except `embed`, plus `emb1` .. `emb20` taken from `embed.p1` .. `embed.p20`.

    Args:
        source_table: Table id to read (must expose an `embed` struct column).
        target_table: Table id to create or replace.
        override: Accepted for interface compatibility; not used by the body.
        project_id: Project the BigQuery client runs the query in.

    Returns:
        A one-element tuple holding `target_table` prefixed with `bq://`.
    """
    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)

    flatten_sql = f"""
            CREATE OR REPLACE TABLE `{target_table}` as (
        SELECT * except(embed), 
        embed.p1 as emb1, 
        embed.p2 as emb2,
        embed.p3 as emb3,
        embed.p4 as emb4,
        embed.p5 as emb5,
        embed.p6 as emb6,
        embed.p7 as emb7,
        embed.p8 as emb8,
        embed.p9 as emb9,
        embed.p10 as emb10,
        embed.p11 as emb11,
        embed.p12 as emb12,
        embed.p13 as emb13,
        embed.p14 as emb14,
        embed.p15 as emb15,
        embed.p16 as emb16,
        embed.p17 as emb17,
        embed.p18 as emb18,
        embed.p19 as emb19,
        embed.p20 as emb20

        FROM `{source_table}` )
          """

    # Wait for the statement to finish before reporting the table downstream.
    flatten_job = client.query(flatten_sql)
    flatten_job.result()

    term_train_table = f'bq://{target_table}'
    return (term_train_table,)

### Produce top-mover table

In [209]:
@kfp.v2.dsl.component(
  base_image='python:3.9',
  packages_to_install=['google-cloud-bigquery==2.18.0'],
)
def create_top_mover_table(
    source_table: str,
    target_table: str,
    predict_on_dt: str, #uses the last validation date,
    six_month_dt: str,
    trained_model: Input[Artifact],
    top_n_results: int,
    override: str = 'False',
    project_id: str = 'cpg-cdp'
    ) -> NamedTuple('Outputs', [('term_train_table', str)]):
    """Create the top-mover table from exported forecast rows.

    Joins the rows of `source_table` dated `predict_on_dt` with the forecast
    made on `predict_on_dt` for `six_month_dt` (same `series_id`), attaches the
    geo name from `cpg-cdp.trendspotting.futurama_weekly`, keeps rows with
    `current_rank > 500` and `six_mo_forecast < 600`, and stores the
    `top_n_results` rows with the largest `six_delta_rank` in `target_table`.

    Args:
        source_table: Table holding the exported forecasts; an optional leading
            `bq://` is removed before it is used in SQL.
        target_table: Table id to create or replace.
        predict_on_dt: Date the forecast was made on (also the "current" date).
        six_month_dt: Forecast target date compared against the current rank.
        trained_model: Not read by the body. NOTE(review): presumably passed
            only so this step is scheduled after training - confirm.
        top_n_results: Maximum number of rows kept.
        override: Accepted for interface compatibility; not used by the body.
        project_id: Project the BigQuery client runs the query in.

    Returns:
        A one-element tuple holding `target_table`.
    """
    from google.cloud import bigquery

    # Remove only a literal leading 'bq://'. str.strip('bq://') treats its
    # argument as a set of characters and would also eat leading/trailing
    # 'b', 'q', ':' or '/' that belong to the table id itself.
    bq_prefix = 'bq://'
    if source_table.startswith(bq_prefix):
        source_table_no_bq = source_table[len(bq_prefix):]
    else:
        source_table_no_bq = source_table

    bq_client = bigquery.Client(project=project_id)

    top_mover_sql = f"""
            CREATE OR REPLACE TABLE {target_table} as (
    select * from
      (with six_mo_val as (select *, predicted_category_rank.value as six_mo_forecast from `{source_table_no_bq}` 
        where predicted_on_date = '{predict_on_dt}' and date = '{six_month_dt}'),
         geo_id as (select distinct geo_id, geo_name from `cpg-cdp.trendspotting.futurama_weekly`)
    SELECT a.date, 
       geo_id.geo_name, 
       a.sentences, 
       cast(a.category_rank as int64) as current_rank, 
       cast(a.category_rank as int64) - b.six_mo_forecast as six_delta_rank,
       cast(b.category_rank as int64) as six_mo_rank, 
       six_mo_forecast
      FROM `{source_table_no_bq}` a INNER JOIN 
       six_mo_val b on a.series_id = b.series_id 
       inner join 
       geo_id on cast(a.geo_id as int64) = geo_id.geo_id
      WHERE a.date = '{predict_on_dt}'
      ) where current_rank > 500 and six_mo_forecast < 600 order by six_delta_rank desc limit {top_n_results} 
)
          """

    # Block until the table has been written.
    top_mover_job = bq_client.query(top_mover_sql)
    top_mover_job.result()

    return (target_table,)

In [210]:
from google_cloud_pipeline_components.types import artifact_types

# Version tag used in the pipeline name, and the display name given to the
# trained forecasting model.
VERSION = 'poc_rmse'
rmse_model_version = 'poc_rmse'

# Per-column transformation specs handed to the forecasting training op:
# four hand-written entries followed by one numeric entry for each of the
# flattened embedding columns emb1 .. emb20.
COLUMN_TRANSFORMATIONS = [
    {"timestamp": {"columnName": "date"}},
    {"categorical": {"columnName": "geo_id"}},
    {"text": {"columnName": "sentences"}},
    {"numeric": {"columnName": "category_rank"}},
    *(
        {"numeric": {"columnName": f"emb{position}"}}
        for position in range(1, 21)
    ),
]

### Pipeline 

Uses custom components, also uses reusable vertex components for creating the training dataset and training the forecast models

Notice the output for testing in BQ is set by `target_table`, assigned to `export_evaluated_data_items_bigquery_destination_uri`

In [216]:
PIPELINE_TAG = 'trendspotting-pipeline' # <--- TODO; optionally name pipeline
@kfp.v2.dsl.pipeline(
    pipeline_root=PIPELINES_FILEPATH,
    name=f'{VERSION}-{PIPELINE_TAG}'.replace('_', '-'),
)
def pipeline(
    vertex_project: str,
    location: str,
    version: str,
    ds_display_name_terms: str,
    train_st: str,
    train_end: str,
    valid_st: str,
    valid_end: str,
    predict_on_dt: str,
    context_window: int,
    forecast_horizon: int,
    override: str,
    k_means_name: str,
    n_clusters: int,
    top_n_results: int,
    six_month_dt: str,
    source_table: str = 'cpg-cdp.trendspotting.futurama_weekly',
    target_term_forecast_table: str = 'cpg-cdp.trendspotting.predict_c52_p52_embed_pl',
    budget_milli_node_hours: int = 1000,
):
    """Embed terms, train a forecasting model and build the top-mover table.

    Steps, in order: build the embedded/split table, flatten the embedding
    struct, register the result as a time-series dataset, train the forecasting
    model (exporting evaluated rows to `target_term_forecast_table`), then
    derive the top-mover table from that export.

    `version`, `override`, `k_means_name` and `n_clusters` are part of the
    signature but are not referenced by any step below.
    """
    # Step 1: embed each term and label rows TRAIN / VALIDATE / TEST.
    embedded_terms_task = create_prediction_dataset_term_level(
        source_table_uri=source_table,
        target_table='cpg-cdp.trendspotting.futurama_weekly_embed',
        train_st=train_st,
        train_end=train_end,
        valid_st=valid_st,
        valid_end=valid_end,
    )

    # Step 2: flatten the embed struct into emb1 .. emb20; outputs a bq:// URI.
    flattened_terms_task = prep_forecast_term_level(
        source_table=embedded_terms_task.outputs['training_data_table_uri'],
        target_table='cpg-cdp.trendspotting.futurama_weekly_embed_aml_pl',
    )

    # Step 3: register the flattened table as a time-series dataset.
    dataset_task = gcc_aip.TimeSeriesDatasetCreateOp(
        project=vertex_project,
        location=location,
        display_name=ds_display_name_terms,
        bq_source=flattened_terms_task.outputs['term_train_table'],
    )

    # Same order as before: four descriptive columns, emb1 .. emb20, then 'sentences'.
    # NOTE(review): 'geo_name', 'category_id' and 'term' are not among the columns
    # selected by the two upstream queries - confirm they are valid here.
    embedding_columns = [f'emb{position}' for position in range(1, 21)]
    attribute_columns = ['geo_name', 'geo_id', 'category_id', 'term',
                         *embedding_columns, 'sentences']

    # Step 4: train the forecasting model and export the evaluated rows to BigQuery.
    forecast_training_task = gcc_aip_forecasting.ForecastingTrainingWithExperimentsOp(
        project=vertex_project,
        location=location,
        display_name=f'train-{rmse_model_version}',
        model_display_name=rmse_model_version,
        dataset=dataset_task.outputs['dataset'],
        # Series definition.
        target_column='category_rank',
        time_column='date',
        time_series_identifier_column='series_id',
        time_series_attribute_columns=attribute_columns,
        unavailable_at_forecast_columns=['category_rank'],
        available_at_forecast_columns=['date'],
        column_transformations=COLUMN_TRANSFORMATIONS,
        # Granularity, windows and split.
        data_granularity_unit='week',
        data_granularity_count=1,
        context_window=context_window,
        forecast_horizon=forecast_horizon,
        predefined_split_column_name='split_col',
        # Objective and budget.
        optimization_objective='minimize-rmse',
        budget_milli_node_hours=budget_milli_node_hours,
        # Export of evaluated rows.
        export_evaluated_data_items=True,
        export_evaluated_data_items_override_destination=True,
        export_evaluated_data_items_bigquery_destination_uri=target_term_forecast_table, # must be format:``bq://<project_id>:<dataset_id>:<table>``
    )

    # Step 5: rank the biggest movers from the exported forecasts. The model
    # output is passed in, which ties this step to the training step.
    top_movers_task = create_top_mover_table(
        source_table=target_term_forecast_table,
        target_table='cpg-cdp.trendspotting.top_movers_pl',
        predict_on_dt=predict_on_dt,
        six_month_dt=six_month_dt,
        trained_model=forecast_training_task.outputs['model'],
        top_n_results=top_n_results,
    )

    
    

In [217]:
# Compile the `pipeline` function and write the result to trendspotting.json,
# the file the PipelineJob below loads through `template_path`.
kfp.v2.compiler.Compiler().compile(
  pipeline_func=pipeline, 
  package_path='trendspotting.json',
)

### Set parameters for pipeline here

In [218]:
PROJECT_ID = 'cpg-cdp' # <--- TODO: If not set
LOCATION = 'us-central1' # <--- TODO: If not set
# Must be a plain string: a trailing comma after the literal turns it into a 1-tuple.
SERVICE_ACCOUNT = 'vertex-pipelines@cpg-cdp.iam.gserviceaccount.com' # <--- TODO: Change This if needed
N_CLUSTERS = 100
K_MEANS_MODEL_NAME = f"trendspotting_{N_CLUSTERS}_rmse"


# BQ dataset for source data source
SOURCE_DATA = 'futurama_weekly'
TOP_N_RESULTS = 500
# TODO: Forecasting Configuration:
HISTORY_WINDOW_n = 52 #  {type: 'integer'} # context_window
FORECAST_HORIZON = 52 #  {type: 'integer'} 
BUDGET_MILLI_NODE_HOURS = 1000

In [219]:
! gcloud config set project $PROJECT_ID

Updated property [core/project].


### Run the pipeline
Follow the link to see the execution

In [220]:
from google.cloud import aiplatform
# vertex_project: str,
#     location: str,
#     version: str,
#     ds_display_name: str,
#     train_st: str,
#     train_end: str,
#     valid_st: str,
#     valid_end: str,
#     predict_on_dt: str,
#     context_window: int,
#     forecast_horizon: int,
#     override: str,
#     k_means_name: str,
#     n_clusters: int,
#     top_n_results: int,
#     six_month_dt: str,
#     source_table: str = 'cpg-cdp.trendspotting.futurama_weekly',
#     target_term_forecast_table: str = 'cpg-cdp.trendspotting.predict_c52_p52_embed_pl',
#     budget_milli_node_hours: int = 1000,
# Runtime values for the compiled pipeline. `source_table` is omitted, so the
# pipeline's own default for it applies.
PIPELINE_PARAMETERS = {
    # Project, region and version tag
    'vertex_project': PROJECT_ID,
    'location': LOCATION,
    'version': VERSION,
    # Split boundaries; dates outside both ranges are labelled TEST
    'train_st': '2019-01-01',
    'train_end': '2020-12-31',
    'valid_st': '2021-01-01',
    'valid_end': '2021-05-31',
    # Dates compared by the top-mover step
    'predict_on_dt': '2021-06-06',
    'six_month_dt': '2021-12-26',
    # Forecast training configuration
    'context_window': HISTORY_WINDOW_n,
    'forecast_horizon': FORECAST_HORIZON,
    'budget_milli_node_hours': BUDGET_MILLI_NODE_HOURS,
    'ds_display_name_terms': 'futurama-term-forecasts',
    # Remaining pipeline inputs
    'k_means_name': K_MEANS_MODEL_NAME,
    'n_clusters': N_CLUSTERS,
    'top_n_results': TOP_N_RESULTS,
    'override' : 'false',
    # Where the training step exports its evaluated rows
    'target_term_forecast_table' : 'bq://cpg-cdp.trendspotting.predict_c52_p52_embed_pl'
}

# Build the job from the compiled spec and submit it.
job = aiplatform.PipelineJob(
    project=PROJECT_ID,
    location=LOCATION,
    display_name='trendspotting_test',
    template_path='trendspotting.json',
    pipeline_root=PIPELINES_FILEPATH,
    parameter_values=PIPELINE_PARAMETERS,
)

job.submit()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/939655404703/locations/us-central1/pipelineJobs/poc-rmse-trendspotting-pipeline-20220304211452
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/939655404703/locations/us-central1/pipelineJobs/poc-rmse-trendspotting-pipeline-20220304211452')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/poc-rmse-trendspotting-pipeline-20220304211452?project=939655404703


### Link to downstream report
[here](https://datastudio.google.com/c/u/0/reporting/7f55644b-679b-4123-b13d-ce6f90fbd436/page/uhSlC/edit)