# Preparing the environment


## Import the required packages

In [11]:
import os
import pprint
import pandas as pd
import time

import matplotlib.pyplot as plt
from google.cloud import bigquery
from google.cloud import exceptions

## Configure GCP settings

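**Change the prefix to a globally unique value: it is used to name your GCS bucket (`<prefix>-bucket`), BigQuery dataset and Tensorboard, and GCS bucket names are shared across all Google Cloud projects. Use lowercase letters and digits only.**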

In [12]:
PREFIX = 'jk1' #Change it to your prefix

shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT = shell_output[0]
print("Project ID: ", PROJECT)

Project ID:  jk-vertex-demos


In [13]:
# Single region shared by the GCS bucket, the Tensorboard instance, the
# BigQuery dataset and the extract job created in this notebook.
REGION = "us-central1"

### Create a GCS bucket

In [14]:
!gsutil mb -l {REGION} gs://{PREFIX}-bucket

Creating gs://jk1-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'jk1-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


### Create Tensorboard

In [6]:
DISPLAY_NAME = f'{PREFIX}-tensorboard'

!gcloud beta ai tensorboards create --display-name $DISPLAY_NAME \
  --project $PROJECT --region $REGION

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Waiting for operation [2267470328282218496]...done.                            
Created Vertex AI Tensorboard: projects/910094146258/locations/us-central1/tensorboards/1647226748082847744.


## Preparing training data in BigQuery

### Create data splits

In [15]:
# BigQuery dataset that hosts the data splits; derived from the user prefix.
BQ_DATASET_NAME = PREFIX + '_dataset'

#### Create a BQ dataset to host the splits

In [16]:
client = bigquery.Client()

# Describe the dataset locally first: fully-qualified ID plus its location.
dataset_id = f'{PROJECT}.{BQ_DATASET_NAME}'
dataset = bigquery.Dataset(dataset_id)
dataset.location = REGION

# A Conflict from the API is reported as "already exists" rather than treated
# as a failure; `dataset` then keeps the locally built description.
try:
    dataset = client.create_dataset(dataset, timeout=30)
except exceptions.Conflict:
    print('Dataset {} already exists'.format(dataset_id))
else:
    print('Created dataset: ', dataset_id)

Created dataset:  jk-vertex-demos.jk1_dataset


#### Load data splits

In [17]:
GCS_PATH_TO_DATA = 'gs://workshop-datasets/datasets/taxi'
!gsutil ls {GCS_PATH_TO_DATA}

gs://workshop-datasets/datasets/taxi/features
gs://workshop-datasets/datasets/taxi/testing
gs://workshop-datasets/datasets/taxi/training
gs://workshop-datasets/datasets/taxi/validation


In [18]:
# Source objects are read as Avro; WRITE_TRUNCATE overwrites whatever data the
# destination table already holds.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.AVRO
job_config.write_disposition = 'WRITE_TRUNCATE'

# Each source object is loaded into a table of the same name in the dataset.
for split in ('features', 'training', 'validation', 'testing'):
    source_uri = '/'.join((GCS_PATH_TO_DATA, split))
    destination_table = '.'.join((PROJECT, BQ_DATASET_NAME, split))
    pending_load = client.load_table_from_uri(
        source_uri, destination_table, job_config=job_config
    )
    print('Loading: {} to {}'.format(source_uri, destination_table))
    # Block until this load finishes before submitting the next one.
    pending_load.result()

Loading: gs://workshop-datasets/datasets/taxi/features to jk-vertex-demos.jk1_dataset.features
Loading: gs://workshop-datasets/datasets/taxi/training to jk-vertex-demos.jk1_dataset.training
Loading: gs://workshop-datasets/datasets/taxi/validation to jk-vertex-demos.jk1_dataset.validation
Loading: gs://workshop-datasets/datasets/taxi/testing to jk-vertex-demos.jk1_dataset.testing


#### Review the created tables

In [19]:
# Sample up to 100 rows of the `features` table, leaving out the
# trip_start_timestamp column.
sql_script = f'''
SELECT * EXCEPT (trip_start_timestamp)
FROM {PROJECT}.{BQ_DATASET_NAME}.features 
LIMIT 100
'''
features_query = client.query(sql_script)
df = features_query.result().to_dataframe()
# Show the first five rows transposed, so every column is displayed as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
trip_month,1,1,1,1,1
trip_day,1,1,1,1,1
trip_day_of_week,4,4,4,4,4
trip_hour,17,16,19,0,0
trip_seconds,600,180,1335,300,720
trip_miles,1.3,0.9,7.44,0.9,3.2
payment_type,Cash,Credit Card,Prcard,Cash,Cash
pickup_grid,POINT(-87.7 42),POINT(-87.6 41.9),POINT(-87.7 41.9),POINT(-87.6 41.9),POINT(-87.6 41.9)
dropoff_grid,POINT(-87.6 41.9),POINT(-87.6 41.9),POINT(-87.6 41.9),POINT(-87.6 41.9),POINT(-87.7 41.9)
euclidean,9524.3274,0.0,8190.655447,1230.61674,3567.255865


In [20]:
# Sample up to 100 rows of the `training` table with all of its columns.
sql_script = f'''
SELECT * 
FROM {PROJECT}.{BQ_DATASET_NAME}.training 
LIMIT 100
'''
training_query = client.query(sql_script)
df = training_query.result().to_dataframe()
# Show the first five rows transposed, so every column is displayed as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
trip_month,2,2,2,2,2
trip_day,5,5,5,5,10
trip_day_of_week,4,4,4,4,2
trip_hour,16,16,16,18,17
trip_seconds,120,226,204,261,1093
trip_miles,0.8,0.46,0.6,0.85,0.75
payment_type,Cash,Cash,Cash,Cash,Cash
pickup_grid,POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42)
dropoff_grid,POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42),POINT(-87.6 42)
euclidean,0.0,0.0,0.0,0.0,0.0


## Export data for batch prediction

In [24]:
# Table in the BigQuery dataset that is exported for batch prediction.
BQ_TEST_SPLIT_NAME = 'testing'
# Destination object for the exported instances, inside the prefix-named bucket.
gcs_path_to_instances = f'gs://{PREFIX}-bucket/batch_data/batch_instances.jsonl'

In [25]:
# Reference the test-split table inside the notebook's dataset.
dataset_ref = bigquery.DatasetReference(PROJECT, BQ_DATASET_NAME)
table_ref = dataset_ref.table(BQ_TEST_SPLIT_NAME)

# The destination object is named *.jsonl, so request newline-delimited JSON
# explicitly: without a job config, BigQuery extract jobs write CSV.
# A separate variable name keeps the load-job `job_config` defined earlier intact.
extract_job_config = bigquery.ExtractJobConfig(
    destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
)

extract_job = client.extract_table(
    table_ref,
    gcs_path_to_instances,
    location=REGION,  # run the job in the region the dataset was created in
    job_config=extract_job_config,
)
extract_job.result()  # Waits for job to complete.


Forbidden: 403 Access Denied: BigQuery BigQuery: Permission denied while writing data.