# 05 - BigQuery ML - Contribution Analysis 


Status: WIP 

refs:

* https://cloud.google.com/bigquery/docs/contribution-analysis
* https://cloud.google.com/bigquery/docs/get-contribution-analysis-insights

## Install packages 

In [1]:
## ('import name', 'install name') 
packages = [
    ('numpy', 'numpy'),
    ('matplotlib.pyplot', 'matplotlib'),
    ('google.cloud.aiplatform', 'google-cloud-aiplatform'),
    ('gcsfs', 'gcsfs'),
    ('google.cloud.bigquery', 'google-cloud-bigquery'),
    ('google.cloud-bigquery.storage', 'google-cloud-bigquery-storage'),

]

import importlib
install = False
for package in packages:
    try:
        importlib.import_module(package[0])
    except ImportError:
        print(f'installing package {package[1]}')
        install = True
        !pip install {package[1]} -U -q --user

if install:
    print("Installation of missing packages complete. Please run the next cell to restart the kernel before proceeding.")

installing package google-cloud-bigquery-storage
Installation of missing packages complete. Please run the next cell to restart the kernel before proceeding.


In [2]:
if install:
    import IPython
    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)
    IPython.display.display(IPython.display.Markdown("""<div class=\"alert alert-block alert-warning\">
        <b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. The previous cells do not need to be run again⚠️</b>
        </div>"""))

<div class="alert alert-block alert-warning">
        <b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. The previous cells do not need to be run again⚠️</b>
        </div>

## Setup

In [2]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID

'demos-vertex-ai'

In [4]:
REGION = 'us-central1'
EXPERIMENT = 'contribution-analysis'
SERIES = 'bqml'

# source data
BQ_PROJECT = PROJECT_ID
BQ_DATASET = 'bqml_tutorial' # TODO -  need different  dataset
BQ_TABLE = 'taxi_control_and_test' # TODO - need different  dataset

# Model Training
TARGET_METRIC = 'avg_total_fare'

In [5]:
from google.cloud import bigquery
from google.cloud import aiplatform
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np

from google.api import httpbody_pb2
import json

### clients

In [6]:
bq = bigquery.Client(project = PROJECT_ID)
aiplatform.init(project = PROJECT_ID, location = REGION)

### parameters

In [7]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
BUCKET = PROJECT_ID
URI = f"gs://{BUCKET}/{SERIES}/{EXPERIMENT}"
RUN_NAME = f'run-{TIMESTAMP}'

BQ_MODEL = f'{SERIES}_{EXPERIMENT}_{TIMESTAMP}'

### env

In [8]:
DIR = f"temp/{EXPERIMENT}"

!rm -rf {DIR}
!mkdir -p {DIR}

In [12]:
query = f"""CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
AS (
  SELECT
    vendor_id,
    passenger_count,
    payment_type,
    pickup_location_id,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    AVG(total_amount) AS avg_total_fare,
    FALSE AS is_test
  FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2011`
  WHERE total_amount > 0
  GROUP BY vendor_id, passenger_count, payment_type, pickup_location_id, month, is_test
)
UNION ALL
(
  SELECT
    vendor_id,
    passenger_count,
    payment_type,
    pickup_location_id,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    AVG(total_amount) AS avg_total_fare,
    TRUE AS is_test
  FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2012`
  WHERE total_amount > 0
  GROUP BY vendor_id, passenger_count, payment_type, pickup_location_id, month, is_test
);
"""
print(query)

CREATE OR REPLACE TABLE `demos-vertex-ai.bqml_tutorial.taxi_control_and_test`
AS (
  SELECT
    vendor_id,
    passenger_count,
    payment_type,
    pickup_location_id,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    AVG(total_amount) AS avg_total_fare,
    FALSE AS is_test
  FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2011`
  WHERE total_amount > 0
  GROUP BY vendor_id, passenger_count, payment_type, pickup_location_id, month, is_test
)
UNION ALL
(
  SELECT
    vendor_id,
    passenger_count,
    payment_type,
    pickup_location_id,
    EXTRACT(MONTH FROM pickup_datetime) AS month,
    AVG(total_amount) AS avg_total_fare,
    TRUE AS is_test
  FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2012`
  WHERE total_amount > 0
  GROUP BY vendor_id, passenger_count, payment_type, pickup_location_id, month, is_test
);



In [13]:
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7efc96080f10>

## Review Data


In [32]:
query = f"""
    SELECT is_test, count(*) as n
    FROM `{BQ_PROJECT}.{BQ_DATASET}.{BQ_TABLE}`
    GROUP BY is_test
"""
print(query)


    SELECT is_test, count(*) as n
    FROM `demos-vertex-ai.bqml_tutorial.taxi_control_and_test`
    GROUP BY is_test



In [33]:
review = bq.query(query = query).to_dataframe()
review

Unnamed: 0,is_test,n
0,True,72339
1,False,59649


## Create model  

* https://cloud.google.com/bigquery/docs/contribution-analysis
* https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-contribution-analysis#examples - examples here 

In [16]:
print(f'This run will create BQML model: {BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}')
print(f'This run will create Vertex AI model: {BQ_PROJECT}.{BQ_DATASET}.bqml_{SERIES}_{EXPERIMENT}')
print(f'The runs timestamp Is: {TIMESTAMP}')

This run will create BQML model: demos-vertex-ai.bqml_tutorial.bqml_contribution-analysis_20240923204124
This run will create Vertex AI model: demos-vertex-ai.bqml_tutorial.bqml_bqml_contribution-analysis
The runs timestamp Is: 20240923204124


## Train Model

In [19]:
query = f"""CREATE OR REPLACE MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`
  OPTIONS (
    MODEL_TYPE = 'CONTRIBUTION_ANALYSIS',
    CONTRIBUTION_METRIC = 'SUM({TARGET_METRIC})',
    DIMENSION_ID_COLS =
      ['vendor_id', 'passenger_count', 'pickup_location_id', 'payment_type', 'month'],
    IS_TEST_COL = 'is_test',
    MIN_APRIORI_SUPPORT = 0.05
    )
AS
SELECT * FROM bqml_tutorial.taxi_control_and_test;
"""
print(query)


CREATE OR REPLACE MODEL `demos-vertex-ai.bqml_tutorial.bqml_contribution-analysis_20240923204124`
  OPTIONS (
    MODEL_TYPE = 'CONTRIBUTION_ANALYSIS',
    CONTRIBUTION_METRIC = 'SUM(avg_total_fare)',
    DIMENSION_ID_COLS =
      ['vendor_id', 'passenger_count', 'pickup_location_id', 'payment_type', 'month'],
    IS_TEST_COL = 'is_test',
    MIN_APRIORI_SUPPORT = 0.05
    )
AS
SELECT * FROM bqml_tutorial.taxi_control_and_test;



In [20]:
job = bq.query(query = query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7efc774c26e0>

In [21]:
(job.ended-job.started).total_seconds()

30.069

In [22]:
job.total_bytes_processed

5756241

### add labels to model object

In [23]:
model = bq.get_model(f'{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}')
model.labels = {'series' : f'{SERIES}', 'experiment' : f'{EXPERIMENT}'}
model = bq.update_model(model, ['labels'])

### view model in GUI

In [None]:
print(f'BigQuery Console for Project:\nhttps://console.cloud.google.com/bigquery?project={PROJECT_ID}')

## Get Insights 

https://cloud.google.com/bigquery/docs/get-contribution-analysis-insights

In [27]:
query = f"""SELECT
  contributors,
  metric_test,
  metric_control,
  difference,
  relative_difference,
  unexpected_difference,
  relative_unexpected_difference,
  apriori_support
FROM
  ML.GET_INSIGHTS(
    MODEL `{BQ_PROJECT}.{BQ_DATASET}.{BQ_MODEL}`)
WHERE relative_difference IS NOT NULL
ORDER BY unexpected_difference DESC;
"""
print(query)

SELECT
  contributors,
  metric_test,
  metric_control,
  difference,
  relative_difference,
  unexpected_difference,
  relative_unexpected_difference,
  apriori_support
FROM
  ML.GET_INSIGHTS(
    MODEL `demos-vertex-ai.bqml_tutorial.bqml_contribution-analysis_20240923204124`)
WHERE relative_difference IS NOT NULL
ORDER BY unexpected_difference DESC;



In [30]:
insights = bq.query(query = query).to_dataframe()

In [31]:
insights.head(10)

Unnamed: 0,contributors,metric_test,metric_control,difference,relative_difference,unexpected_difference,relative_unexpected_difference,apriori_support
0,[all],1305121.630946658,983978.603443601,321143.027503057,0.326371962,321143.027503057,0.326371962,1.0
1,[payment_type=5],82996.99307095,138.26,82858.73307095,599.296492629,82825.246757081,482.253417818,0.063593301
2,[vendor_id=1],651674.026105812,475749.798118647,175924.227987165,0.369783085,39985.82041025,0.065369611,0.499320531
3,[passenger_count=1],304615.252142054,214839.058249037,89776.193893017,0.417876501,25149.907437652,0.08999294,0.233399895
4,[passenger_count=6],106980.236314059,64751.247894565,42228.988419494,0.65217258,22582.018639759,0.267565108,0.081969553
5,"[passenger_count=1, vendor_id=1]",169184.64374,114583.997774386,54600.645965614,0.476511965,19471.037967023,0.130055237,0.129631323
6,[month=10],125499.822773923,84169.515779731,41330.306994192,0.49103653,15156.197935889,0.137354539,0.096159484
7,"[passenger_count=2, vendor_id=1]",156456.340431637,108337.644898706,48118.695532931,0.444154897,14339.082097738,0.100896135,0.119878743
8,"[passenger_count=6, vendor_id=2]",97561.899343064,63464.886089009,34097.013254055,0.537257929,14306.604620684,0.171840172,0.074753109
9,[month=11],123306.319251865,83189.102370695,40117.21688117,0.482241252,14164.111406941,0.129776665,0.094478795
