# 1 - Setup Environment 

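This notebook guides you through setting up a Google Cloud Platform environment: it creates a Cloud Storage bucket (if one does not already exist), builds a training table in BigQuery, and exports that table to the bucket as a CSV file.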

In [7]:
# Notebook-wide inputs: edit these to point the notebook at another project.
REGION = "us-central1"          # location used when the storage bucket is created
PROJECT_ID = "demos-vertex-ai"  # GCP project used by the BigQuery and Storage clients
DATANAME = "propensity"         # BigQuery dataset name and GCS folder prefix

# Fully qualified `project.dataset.table` ID of the training table.
BQ_SOURCE = f"{PROJECT_ID}.{DATANAME}.training_data"

## Setup
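packages and clients (the inputs `REGION`, `PROJECT_ID`, and `DATANAME` are defined in the first cell of this notebook):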

In [10]:
from google.cloud import storage
from google.cloud import bigquery

import pandas as pd
from sklearn import datasets

In [11]:
# BigQuery client bound to the project configured in PROJECT_ID.
bq = bigquery.Client(project=PROJECT_ID)

parameters:

In [12]:
# The Cloud Storage bucket is named after the project ID.
BUCKET = PROJECT_ID

## Create Storage Bucket

In [13]:
# Cloud Storage client bound to the project configured in PROJECT_ID.
gcs = storage.Client(project=PROJECT_ID)

In [14]:
# Create the bucket only when it does not exist yet, so this cell is safe to
# re-run. `lookup_bucket` returns None for a missing bucket; the lookup is done
# once and its result reused, which avoids a second API round trip and leaves
# `bucket` bound on both the "created" and "already exists" paths.
bucket = gcs.lookup_bucket(BUCKET)
if bucket is None:
    bucket = gcs.create_bucket(gcs.bucket(BUCKET), project=PROJECT_ID, location=REGION)
print(bucket)

<Bucket: demos-vertex-ai>


## Store data in Storage Bucket

* Export the BigQuery table to the GCS bucket as a CSV file (the table is named by the `BQ_SOURCE` variable defined at the top of this notebook)
* More details here: [Exporting table data  |  BigQuery  |  Google Cloud](https://cloud.google.com/bigquery/docs/exporting-data#python)

### Save training data as BigQuery table for export to GCS 

In [15]:
# Build the training table with a query and save the result as the BigQuery
# table named by BQ_SOURCE, so it can be exported to GCS afterwards.
table_id = BQ_SOURCE

# A query job cannot write into a dataset that does not exist, so make sure the
# destination dataset is there first. `exists_ok=True` makes this a no-op on
# re-runs. The dataset location is left at the client default on purpose:
# the destination must share a location with the tables being queried.
# NOTE(review): presumably both source tables live in the US multi-region - confirm.
bq.create_dataset(f"{PROJECT_ID}.{DATANAME}", exists_ok=True)

# WRITE_TRUNCATE overwrites any previous contents of the destination table,
# which keeps this cell idempotent.
job_config = bigquery.QueryJobConfig(
    destination=table_id,
    write_disposition='WRITE_TRUNCATE',
)

# Features: bounces and time on site from first-visit sessions
# (totals.newVisits = 1) within the date window.
# Label: 1 when the visitor has at least one non-first session with a
# transaction, otherwise 0.
# NOTE(review): the feature subquery reads `data-to-insights.ecommerce.web_analytics`
# while the label subquery reads `bigquery-public-data.google_analytics_sample.*`;
# confirm that mixing the two sources is intended.
sql = """
  SELECT
    fullVisitorId,
    bounces,
    time_on_site,
    will_buy_on_return_visit
  FROM (
        # select features
        SELECT
          fullVisitorId,
          IFNULL(totals.bounces, 0) AS bounces,
          IFNULL(totals.timeOnSite, 0) AS time_on_site
        FROM
          `data-to-insights.ecommerce.web_analytics`
        WHERE
          totals.newVisits = 1
        AND date BETWEEN '20160801' # train on first 9 months of data
        AND '20170430'
       )
  JOIN (
        SELECT
          fullvisitorid,
          IF (
              COUNTIF (
                       totals.transactions > 0
                       AND totals.newVisits IS NULL
                      ) > 0,
              1,
              0
             ) AS will_buy_on_return_visit
        FROM
          `bigquery-public-data.google_analytics_sample.*`
        GROUP BY
          fullvisitorid
       )
  USING (fullVisitorId)
  ORDER BY time_on_site DESC
"""

# Start the query, passing in the extra configuration.
query_job = bq.query(sql, job_config=job_config)  # Make an API request.
query_job.result()  # Wait for the job to complete; raises if the query failed.

print(f"Query results loaded to the table {table_id}")

Query results loaded to the table demos-vertex-ai.propensity.training_data


In [17]:
# Export the training table from BigQuery to a CSV file in the bucket.
# The `bq` client created earlier in the notebook is reused here.
# NOTE: a single-file destination URI only works for small tables; per the
# BigQuery export docs, larger tables need a wildcard URI
# (e.g. ".../{DATANAME}-*.csv").
destination = f"gs://{BUCKET}/{DATANAME}/data/{DATANAME}.csv"
source = bigquery.TableReference.from_string(BQ_SOURCE)
extract = bq.extract_table(source, destination)  # Starts an asynchronous extract job.

# Block until the job finishes. Without this the cell returns while the export
# is still running, and a failed export would go unnoticed.
extract.result()

print(f"Exported {BQ_SOURCE} to {destination}")