# Trendspotting POC

The goals of this notebook are to:

1. Load signals data into a managed Vertex AI dataset for time series forecasting
2. Create a forecast prediction model for each term, geo, and category combination
3. Project the forecasts onto a holdout set of data to assess performance and trends

When run, the pipeline will look something like this:

![pipeline example](img/pipeline_example.png)

Todo: Integration of ingredients, flagging of trends

## Install packages, create bucket (only run once)

In [1]:
# # New
# ! pip3 install -U google-cloud-storage --user
# # ! pip3 install $USER kfp google-cloud-pipeline-components --upgrade
# !git clone https://github.com/kubeflow/pipelines.git
# !pip install pipelines/components/google-cloud/.
# !pip install google-cloud-aiplatform

In [2]:
# ! gsutil mb -l us-central1 gs://trendspotting-pipeline

### Import libs and types for KFP pipeline

In [23]:
from datetime import datetime
import json
import os
import time
from typing import Any, Callable, Dict, NamedTuple, Optional
from IPython.display import clear_output

from google import auth
from google.api_core import exceptions as google_exceptions
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.experimental import forecasting as gcc_aip_forecasting
import google.cloud.aiplatform
from google.cloud import bigquery
from google.cloud import storage

import kfp
import kfp.v2.dsl
from kfp.v2.google import client as pipelines_client

from matplotlib import dates as mdates
from matplotlib import pyplot as plt

import pandas as pd
import seaborn as sns

from IPython.display import Image
from IPython.core.display import HTML 

from typing import Dict, List, Optional, Sequence, Tuple, Union
from kfp.v2.dsl import Artifact
from kfp.v2.dsl import Input, Model

In [24]:
# Default GCP project and region used by the cells below.
# The project ID has to match the project that hosts the source table
# `cpg-cdp.trendspotting.futurama_weekly`.
PROJECT_ID = 'cpg-cdp'
LOCATION = 'us-central1'

In [25]:
# GCS location used as the Vertex AI pipeline root (passed as `pipeline_root`
# when the pipeline is defined and when the job is created).
PIPELINES_FILEPATH = 'gs://trendspotting-pipeline' # <--- TODO: CHANGE THIS

# Local JSON file that persists the PIPELINES registry between sessions; it may
# be missing or blank. It is kept separate from PIPELINES_FILEPATH because the
# built-in open() / os.path functions cannot read or write gs:// URIs.
PIPELINES_REGISTRY_FILEPATH = 'pipelines.json'

PIPELINES = {}
if os.path.isfile(PIPELINES_REGISTRY_FILEPATH):
  with open(PIPELINES_REGISTRY_FILEPATH) as f:
    _registry_text = f.read()
  # A blank file is treated as an empty registry instead of a JSON error.
  if _registry_text.strip():
    PIPELINES = json.loads(_registry_text)


def save_pipelines():
  """Write the in-memory PIPELINES registry to PIPELINES_REGISTRY_FILEPATH as JSON."""
  with open(PIPELINES_REGISTRY_FILEPATH, 'w') as f:
    json.dump(PIPELINES, f)

### KFP Custom Component - training data query

Details: Pull data from `futurama_weekly` for 7/20–12/21 (AutoML Tables has a 100 GB limit). The train/validate/test split is assigned automatically through the `split_col` column as follows:
    
* Train: 7/20–4/21 (the split expression allows dates back to 2/20, but the query only pulls rows from 7/20 onward)
* Validate: 5/21
* Test: 6/21–12/21
    
Also set `series_id` to be a concat: `concat(category_id, geo_id, term) as series_id`

In [26]:
@kfp.v2.dsl.component(
  base_image='python:3.9',
  packages_to_install=['google-cloud-bigquery==2.18.0'],
)
def create_prediction_dataset(
  project: str,
  dataset: str,
  source_table_uri: str,
  override: str = 'False',
) -> NamedTuple('Outputs', [('training_data_table_uri', str)]):
  """Materialise the forecasting training table in BigQuery.

  Builds `{project}.{dataset}.y5y6_forecast_volume_term` from the source table,
  adding a `series_id` column (category_id, geo_id and term concatenated) and a
  `split_col` column with the values TRAIN / VALIDATE / TEST, and dropping the
  `volume`, `score` and `geo_type` columns.

  Args:
    project: Project used for the BigQuery client and the destination table.
    dataset: BigQuery dataset that receives the destination table.
    source_table_uri: Source table. Accepts a bare table name (resolved inside
      `project.dataset`), a dotted table path, or a `bq://` URI. An empty value
      falls back to `cpg-cdp.trendspotting.futurama_weekly`.
    override: String flag ('true' / 'false', case-insensitive). When true the
      destination table is replaced; otherwise it is only created if missing.

  Returns:
    A one-element tuple holding the `bq://` URI of the destination table.
  """
  from google.cloud import bigquery

  # The flag arrives as a string; bool('False') is True, so parse it explicitly.
  replace_existing = str(override).strip().lower() in ('true', '1', 'yes')

  # Normalise the source reference into a form usable inside backticks.
  source_table = (source_table_uri or '').strip()
  if source_table.startswith('bq://'):
    source_table = source_table[len('bq://'):]
  source_table = source_table.replace(':', '.')
  if not source_table:
    source_table = 'cpg-cdp.trendspotting.futurama_weekly'
  elif '.' not in source_table:
    source_table = f'{project}.{dataset}.{source_table}'

  bq_client = bigquery.Client(project=project)
  combined_preds_forecast_table_name = f'{project}.{dataset}.y5y6_forecast_volume_term'
  create_clause = (
    'create or replace table' if replace_existing else 'create table if not exists'
  )

  # The split boundaries are contiguous: everything before May 2021 is TRAIN,
  # May 2021 is VALIDATE, and the remainder is TEST. Half-open comparisons avoid
  # leaving a gap (previously 2021-04-02..2021-04-30 fell through to TEST).
  query = f"""
    {create_clause} `{combined_preds_forecast_table_name}` as (
      with raw_data as (
        select
          *,
          concat(category_id, geo_id, term) as series_id,
          case
            when date < '2021-05-01' then 'TRAIN'
            when date < '2021-06-01' then 'VALIDATE'
            else 'TEST'
          end as split_col
        from `{source_table}`
        where date between '2020-07-01' and '2021-12-31'
      )
      select * except (volume, score, geo_type) from raw_data
    )
  """
  # Block until the table has been written so downstream steps can read it.
  bq_client.query(query).result()

  return (
    f'bq://{combined_preds_forecast_table_name}',
  )

In [27]:
from google_cloud_pipeline_components.types import artifact_types

# VERSION prefixes the pipeline name; rmse_model_version names the training job
# and the resulting model.
VERSION = 'poc'
rmse_model_version = 'poc'

# Column transformations handed to the forecasting training step. Each
# (transformation type, column name) pair below expands to an entry of the form
# {"<type>": {"columnName": "<column>"}}.
COLUMN_TRANSFORMATIONS = [
  {transformation: {"columnName": column}}
  for transformation, column in (
    ("timestamp", "date"),
    ("categorical", "geo_id"),
    ("text", "geo_name"),
    ("categorical", "category_id"),
    ("text", "term"),
    ("numeric", "category_rank"),
  )
]

### Pipeline 

Uses custom components, also uses reusable vertex components for creating the training dataset and training the forecast models

Notice the output for testing in BQ is set by `target_table`, assigned to `export_evaluated_data_items_bigquery_destination_uri`

In [28]:
PIPELINE_TAG = 'trendspotting-pipeline' # <--- TODO; optionally name pipeline
@kfp.v2.dsl.pipeline(
  name=f'{VERSION}-{PIPELINE_TAG}'.replace('_', '-'),
  pipeline_root=PIPELINES_FILEPATH,
)
def pipeline(
  vertex_project: str,
  location: str,
  version: str,
  data_source_dataset: str,
  ds_display_name: str,
  # activities_expected_historical_last_date: str,
  context_window: int,
  forecast_horizon: int,
  override: str,
  target_table: str,
  budget_milli_node_hours: int = 16000,
):
  """Build the training table, register it as a dataset and train a forecast model.

  Args:
    vertex_project: Project used by every step.
    location: Region used by the dataset and training steps.
    version: Currently unused inside the pipeline body.
    data_source_dataset: Source table reference handed to the query component.
    ds_display_name: Display name of the time series dataset.
    context_window: Context window passed to the training step.
    forecast_horizon: Forecast horizon passed to the training step.
    override: String flag forwarded to the query component.
    target_table: BigQuery destination for the exported evaluated data items.
    budget_milli_node_hours: Training budget passed to the training step.
  """
  # Step 1: custom component that writes the training table to BigQuery.
  create_prediction_dataset_op = create_prediction_dataset(
    project=vertex_project,
    dataset='trendspotting',
    source_table_uri=data_source_dataset,
    # Forward the pipeline parameter instead of a hard-coded 'False'.
    override=override,
  )

  # Step 2: register that table as a time series dataset.
  time_series_dataset_create_op = gcc_aip.TimeSeriesDatasetCreateOp(
    display_name=ds_display_name,
    bq_source=create_prediction_dataset_op.outputs['training_data_table_uri'],
    project=vertex_project,
    location=location,
  )

  # Step 3: train the forecasting model and export the evaluated data items.
  rmse_model_op = gcc_aip_forecasting.ForecastingTrainingWithExperimentsOp(
    display_name=f'train-{rmse_model_version}',
    model_display_name=rmse_model_version,
    dataset=time_series_dataset_create_op.outputs['dataset'],
    context_window=context_window,
    forecast_horizon=forecast_horizon,
    budget_milli_node_hours=budget_milli_node_hours,
    project=vertex_project,
    location=location,
    export_evaluated_data_items=True,
    export_evaluated_data_items_override_destination=True,
    target_column='category_rank',
    time_column='date',
    time_series_identifier_column='series_id',
    time_series_attribute_columns=['geo_name', 'geo_id', 'category_id', 'term'],
    unavailable_at_forecast_columns=['category_rank'],
    available_at_forecast_columns=['date'],
    data_granularity_unit='week',
    data_granularity_count=1,
    predefined_split_column_name='split_col',
    optimization_objective='minimize-rmse',
    column_transformations=COLUMN_TRANSFORMATIONS,
    export_evaluated_data_items_bigquery_destination_uri=target_table, # must be format:``bq://<project_id>:<dataset_id>:<table>``
  )

In [29]:
# Compile the `pipeline` function into the JSON job spec that the PipelineJob
# below loads through `template_path`.
pipeline_compiler = kfp.v2.compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path='trendspotting.json')



### Set parameters for pipeline here

In [39]:
PROJECT_ID = 'cpg-cdp' # <--- TODO: If not set
LOCATION = 'us-central1' # <--- TODO: If not set
# Plain string on purpose: a trailing comma after the literal would turn this
# into a 1-tuple.
SERVICE_ACCOUNT = 'vertex-pipelines@cpg-cdp.iam.gserviceaccount.com' # <--- TODO: Change This if needed

# Name of the BigQuery source table (passed to the pipeline as `data_source_dataset`)
DATA_SOURCE_DATASET = 'futurama_weekly'

# TODO: Forecasting Configuration:
HISTORY_WINDOW_n = 52 #  {type: 'integer'} # context_window
FORECAST_HORIZON = 52 #  {type: 'integer'}
BUDGET_MILLI_NODE_HOURS = 16000



In [41]:
# Point the gcloud CLI at the project configured in PROJECT_ID.
! gcloud config set project $PROJECT_ID

Updated property [core/project].


In [None]:
### Run the pipeline
Follow the link to see the execution

In [42]:
from google.cloud import aiplatform

# Runtime arguments for the compiled pipeline; the keys mirror the parameters
# of the `pipeline` function. `target_table` encodes the context window and the
# forecast horizon in the table name.
PIPELINE_PARAMETERS = dict(
  vertex_project=PROJECT_ID,
  location=LOCATION,
  version=VERSION,
  data_source_dataset=DATA_SOURCE_DATASET,
  context_window=HISTORY_WINDOW_n,
  forecast_horizon=FORECAST_HORIZON,
  budget_milli_node_hours=BUDGET_MILLI_NODE_HOURS,
  ds_display_name='20-21-clean',
  override='false',
  target_table=f'bq://{PROJECT_ID}:trendspotting.predict_c{HISTORY_WINDOW_n}_h{FORECAST_HORIZON}',
)

# Create the job from the compiled spec, then submit it without waiting for
# the run to finish.
job = aiplatform.PipelineJob(
  display_name='trendspotting_test',
  template_path='trendspotting.json',
  pipeline_root=PIPELINES_FILEPATH,
  parameter_values=PIPELINE_PARAMETERS,
  project=PROJECT_ID,
  location=LOCATION,
)

job.submit()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/939655404703/locations/us-central1/pipelineJobs/poc-trendspotting-pipeline-20220208203238
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/939655404703/locations/us-central1/pipelineJobs/poc-trendspotting-pipeline-20220208203238')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/poc-trendspotting-pipeline-20220208203238?project=939655404703


In [None]:
# Note: the exported predictions need a small cleanup step before use:
# `category_rank` has to be cast to an integer, e.g.
    
#     create table `cpg-cdp.trendspotting.predict_c52_h52_fixed` as
#  SELECT *, cast(category_rank as int) as category_rank_int  FROM `cpg-cdp.trendspotting.predict_c52_h52`    