In [1]:
# !pip install google-cloud-aiplatform[prediction]@git+https://github.com/googleapis/python-aiplatform.git@custom-prediction-routine

# Sklearn with Pandas - Custom Prediction Routine to get `.predict_proba()`

This is similar to [the other notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage6/get_started_with_cpr.ipynb) except we will be using pandas and bigquery

Topics covered
* Training sklearn locally, deploying to endpoint
* Saving data as CSV and doing batch predict from GCS
* Loading data to BQ, using BQ magics
* Running a batch prediction from BQ to BQ

In [26]:
PROJECT_ID = 'wortz-project' #SET THIS TO YOUR PROJECT ID
BUCKET = "gs://xxx-model-artifacts" #BE SURE TO gsutil mb -l <REGION> <LOG_BUCKET> to create the bucket on GCP
REGION = "us-central1"

In [27]:
# generate synthetic data
import pandas as pd
import numpy as np #for the random integer example

# set seed

np.random.seed(1234)

x = np.random.randint(0.0,100.0,size=(10,3))
y = np.random.binomial(1, .25, size=(10,1))
df = pd.DataFrame(np.append(x, y, axis=1),
              index=range(10,20),
              columns=['col1','col2','col3','label'],
              dtype='float64')

In [28]:
df

Unnamed: 0,col1,col2,col3,label
10,47.0,83.0,38.0,0.0
11,53.0,76.0,24.0,1.0
12,15.0,49.0,23.0,0.0
13,26.0,30.0,43.0,0.0
14,30.0,26.0,58.0,1.0
15,92.0,69.0,80.0,0.0
16,73.0,47.0,50.0,0.0
17,76.0,37.0,34.0,1.0
18,38.0,67.0,11.0,0.0
19,0.0,75.0,80.0,1.0


In [29]:
from sklearn.ensemble import RandomForestClassifier

# Set the model parameters. 
n_estimators = 100
max_depth = 6
max_features = 3

rf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)
rf.fit(df[['col1', 'col2', 'col3']], df['label'])

RandomForestClassifier(max_depth=6, max_features=3)

In [30]:
import os
import pickle
import joblib

artifact_filename = 'model.joblib' #has to be joblib to work with CPR

# Save model artifact to local filesystem (doesn't persist)

joblib.dump(rf, artifact_filename)

['model.joblib']

#### Upload the model pipeline to gcs

In [6]:
! gsutil cp $artifact_filename $BUCKET/model/

Copying file://model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 78.7 KiB/ 78.7 KiB]                                                
Operation completed over 1 objects/78.7 KiB.                                     


## Create a generic sklearn container that returns `predict_proba`

https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage6/get_started_with_cpr.ipynb

**highly recommend reviewing this notebook first as it breaks down the custom predictor interface**

In [7]:
! mkdir src

mkdir: cannot create directory ‘src’: File exists


In [8]:
%%writefile src/requirements.txt
fastapi
uvicorn
joblib~=1.0
numpy~=1.20
scikit-learn~=1.0
google-cloud-storage>=1.26.0,<2.0.0dev
google-cloud-aiplatform[prediction] @ git+https://github.com/googleapis/python-aiplatform.git@custom-prediction-routine

Overwriting src/requirements.txt


In [9]:
%%writefile src/predictor.py
import joblib
import numpy as np
import pickle

from google.cloud import storage
from google.cloud.aiplatform.prediction.sklearn.predictor import SklearnPredictor
import json

class CprPredictor(SklearnPredictor):
    
    def __init__(self):
        return
    
    def load(self, gcs_artifacts_uri: str):
        """Loads the preprocessor artifacts."""
        gcs_client = storage.Client()
        with open("model.joblib", 'wb') as gcs_model:
            gcs_client.download_blob_to_file(
                gcs_artifacts_uri + "/model.joblib", gcs_model
            )

        with open("model.joblib", "rb") as f:
            self._model = joblib.load("model.joblib")

    
    def predict(self, instances):
        outputs = self._model.predict_proba(instances) 
        return outputs

Overwriting src/predictor.py


### Build and push container to Artifact Registry
#### Build your container
To build a custom container, we also need to write an entrypoint of the image that starts the model server. However, with the Custom Prediction Routine feature, you don't need to write the entrypoint anymore. Vertex AI SDK will populate the entrypoint with the custom predictor you provide.

In [10]:
import os
REGION = 'us-central1'

from google.cloud.aiplatform.prediction import LocalModel
from src.predictor import CprPredictor

REPOSITORY = "custom-preprocess-container-prediction"  # @param {type:"string"}
SERVER_IMAGE = "sklearn-cpr-preprocess-server"  # @param {type:"string"}

local_model = LocalModel.create_cpr_model(
    "src",
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{SERVER_IMAGE}",
    predictor=CprPredictor,
    requirements_path="src/requirements.txt",
)

INFO:google.cloud.aiplatform.utils.prediction_utils:"src/entrypoint.py" already exists, skip generating "entrypoint.py" in "src".
INFO:google.cloud.aiplatform.docker_utils.build:Running command: docker build -t us-central1-docker.pkg.dev/wortz-project/custom-preprocess-container-prediction/sklearn-cpr-preprocess-server --rm -f- src
INFO:google.cloud.aiplatform.docker_utils.local_util:Sending build context to Docker daemon  8.751kB
INFO:google.cloud.aiplatform.docker_utils.local_util:

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 1/11 : FROM python:3.7

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> 7c891de3e220

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 2/11 : ENV PYTHONDONTWRITEBYTECODE=1

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> Using cache

INFO:google.cloud.aiplatform.docker_utils.local_util: ---> df138691af1e

INFO:google.cloud.aiplatform.docker_utils.local_util:Step 3/11 : EXPOSE 8080

INFO:google.cloud.aiplatform.docker_

### Test it out with a locally deployed endpoint
Need to generate credentials to test

In [11]:
local_model.get_serving_container_spec()

image_uri: "us-central1-docker.pkg.dev/wortz-project/custom-preprocess-container-prediction/sklearn-cpr-preprocess-server"
predict_route: "/predict"
health_route: "/health"

In [26]:
! gcloud services enable iam.googleapis.com

#### Only run once to generate creds

In [17]:
# ! gcloud iam service-accounts keys create credentials.json --iam-account=633325234048-compute@developer.gserviceaccount.com
CREDENTIALS_FILE = "credentials.json"

## Create example instances

In [64]:
INPUT_FILE = "instances.json"

In [65]:
%%writefile $INPUT_FILE
{
    "instances": [
        [61.7, 11.1, 41.7],
        [41.6, 31.1, 11.5]
    ]
}

Overwriting instances.json


In [18]:
with local_model.deploy_to_local_endpoint(
    artifact_uri=f"{BUCKET}/model",
    credential_path=CREDENTIALS_FILE,  # Update this to the path to your credentials.
) as local_endpoint:
    predict_response = local_endpoint.predict(
        request_file=INPUT_FILE,
        headers={"Content-Type": "application/json"},
    )

    health_check_response = local_endpoint.run_health_check()

INFO:google.cloud.aiplatform.prediction.local_endpoint:Got the project id from the global config: wortz-project.


## Local results should show a n x 2 shaped return for binomial classification

In [19]:
predict_response.content

b'{"predictions": [[0.37, 0.63], [0.67, 0.33]]}'

### Create a repository to house your artifacts / images

In [20]:
! gcloud beta artifacts repositories create {REPOSITORY} \
    --repository-format=docker \
    --location=$REGION

[1;31mERROR:[0m (gcloud.beta.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists


In [21]:
! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


## Upload the model to Vertex using new Prediction Route Serving Container

In [22]:
local_model.push_image() #push to container registry

INFO:google.cloud.aiplatform.docker_utils.local_util:Using default tag: latest

INFO:google.cloud.aiplatform.docker_utils.local_util:The push refers to repository [us-central1-docker.pkg.dev/wortz-project/custom-preprocess-container-prediction/sklearn-cpr-preprocess-server]

INFO:google.cloud.aiplatform.docker_utils.local_util:f85dc2438b81: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:ff0aeb915ea5: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:958be198471c: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:7a49e899b9ac: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:1ab435391482: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:ea02e4889d36: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:c4418e789e70: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:7c12a541abbf: Preparing

INFO:google.cloud.aiplatform.docker_utils.local_util:6d95196cbe50: Preparing

INFO:google.cloud.aipl

In [23]:
from google.cloud import aiplatform

model = local_model.upload(
        display_name='pandas test CLASSIFICATION',
        artifact_uri="gs://ulta-model-artifacts/model",
        description='pandas test for deploying models to vertex CLASSIFICATION',
        sync=True, #this will not bind up your notebook instance with the creation operation
    ) #note this will automatcially designate the latest sklearn serving container

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/633325234048/locations/us-central1/models/9111965120682000384/operations/7544196298367303680
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/633325234048/locations/us-central1/models/9111965120682000384
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/633325234048/locations/us-central1/models/9111965120682000384')


## Batch predictions with GCS / CSV
### Now we will create a different dataframe to make predictions on for batch predictions

In [71]:
df2 = pd.DataFrame(np.random.randint(0.0,100.0,size=(10,3)), # we will do batch predictions based on this
              index=range(10,20),
              columns=['col1','col2','col3'],
              dtype='float64')
rf.predict_proba(df2[['col1','col2','col3']])

array([[0.47, 0.53],
       [0.29, 0.71],
       [0.63, 0.37],
       [0.68, 0.32],
       [0.67, 0.33],
       [0.35, 0.65],
       [0.44, 0.56],
       [0.36, 0.64],
       [0.89, 0.11],
       [0.45, 0.55]])

### Expected output
From documentation:
```
array([[0.8 , 0.2 ],
       [0.38, 0.62],
       [0.61, 0.39],
       [0.65, 0.35],
       [0.56, 0.44],
       [0.63, 0.37],
       [0.55, 0.45],
       [0.43, 0.57],
       [0.43, 0.57],
       [0.38, 0.62]])
```

In [99]:
from google.cloud import storage
import csv

# save the csv with the header, no index
df2.to_csv('df2.csv', index=False)

data_directory = BUCKET + "/data"
storage_path = os.path.join(data_directory, 'df2.csv')
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
blob.upload_from_filename("df2.csv")

In [100]:
batch_prediction_job = model.batch_predict(
        job_display_name='pandas batch predict job sklearn - VALUES JSON',
        gcs_source=storage_path,
        gcs_destination_prefix=BUCKET+"/predictions",
        machine_type='n1-standard-2',
        instances_format='csv', #This is key to parsing CSV input
        # accelerator_count=accelerator_count,
        # accelerator_type=accelerator_type, #if you want gpus
        starting_replica_count=1,
        max_replica_count=2,
        sync=False,
    )

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8388483720826322944?project=633325234048
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944 current state:
JobState.JOB_STAT

### When successful you should see this
```
{"instance": [16.0, 64.0, 61.0], "prediction": [0.63, 0.37]}
{"instance": [83.0, 27.0, 87.0], "prediction": [0.35, 0.65]}
{"instance": [96.0, 83.0, 57.0], "prediction": [0.68, 0.32]}
{"instance": [11.0, 62.0, 17.0], "prediction": [0.89, 0.11]}
{"instance": [61.0, 28.0, 1.0], "prediction": [0.36, 0.64]}
```

## Batch Prediction with BQ

In [None]:
#!pip install pandas_gbq --user

## Create an empty dataset to house the tables

In [43]:
!bq --location=location mk \
--dataset \
--description "test dataset" \
--location "US" \
$PROJECT_ID:TEST

Dataset 'wortz-project:TEST' successfully created.


In [None]:
# Load the table to BQ and make Batch predictions
from pandas_gbq import to_gbq

df2.to_gbq(destination_table=f"{PROJECT_ID}.TEST.df2", project_id=PROJECT_ID)

## Bigquery magic comes available by default

In [131]:
%%bigquery
select * from TEST.df2

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 865.70query/s] 
Downloading: 100%|██████████| 10/10 [00:01<00:00,  7.21rows/s]


Unnamed: 0,col1,col2,col3
0,20.0,12.0,62.0
1,14.0,18.0,79.0
2,56.0,19.0,81.0
3,58.0,91.0,74.0
4,77.0,49.0,99.0
5,50.0,26.0,38.0
6,49.0,23.0,69.0
7,78.0,23.0,96.0
8,98.0,36.0,15.0
9,79.0,82.0,33.0


## Now run batch predicitons on this bq table

Note you have to have write permissions on the dataset - you may see a error if you don't

In [24]:
batch_prediction_job = model.batch_predict(
        job_display_name='bigquery batch predict job sklearn',
        bigquery_source=f"bq://{PROJECT_ID}.TEST.df2",
        bigquery_destination_prefix=f'bq://{PROJECT_ID}', #this will create a seperate dataset with predictions
        machine_type='n1-standard-2',
        # accelerator_count=accelerator_count,
        # accelerator_type=accelerator_type, #if you want gpus
        starting_replica_count=1,
        max_replica_count=2,
        sync=False,
    ) 

# Output table will look something like this:  wortz-project.prediction_pandas_test_2022_04_22T11_32_14_834Z.predictions_2022_04_22T11_32_14_834Z 

INFO:google.cloud.aiplatform.jobs:Creating BatchPredictionJob
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob created. Resource name: projects/633325234048/locations/us-central1/batchPredictionJobs/8652859092701806592
INFO:google.cloud.aiplatform.jobs:To use this BatchPredictionJob in another session:
INFO:google.cloud.aiplatform.jobs:bpj = aiplatform.BatchPredictionJob('projects/633325234048/locations/us-central1/batchPredictionJobs/8652859092701806592')
INFO:google.cloud.aiplatform.jobs:View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/8652859092701806592?project=633325234048
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/8652859092701806592 current state:
JobState.JOB_STATE_RUNNING
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/8652859092701806592 current state:
JobState.JOB_STAT

### Final section - deploy to endpoint

In [101]:
endpoint = model.deploy(machine_type="n1-standard-4")

INFO:google.cloud.aiplatform.models:Creating Endpoint
INFO:google.cloud.aiplatform.models:Create Endpoint backing LRO: projects/633325234048/locations/us-central1/endpoints/7875643460085088256/operations/7350682251878727680
INFO:google.cloud.aiplatform.models:Endpoint created. Resource name: projects/633325234048/locations/us-central1/endpoints/7875643460085088256
INFO:google.cloud.aiplatform.models:To use this Endpoint in another session:
INFO:google.cloud.aiplatform.models:endpoint = aiplatform.Endpoint('projects/633325234048/locations/us-central1/endpoints/7875643460085088256')
INFO:google.cloud.aiplatform.models:Deploying model to Endpoint : projects/633325234048/locations/us-central1/endpoints/7875643460085088256
INFO:google.cloud.aiplatform.models:Deploy Endpoint model backing LRO: projects/633325234048/locations/us-central1/endpoints/7875643460085088256/operations/4151437666585411584
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1

In [102]:
endpoint.predict(instances=[[6.7, 3.1, 4.7], [4.6, 3.1, 1.5]])

Prediction(predictions=[[0.65, 0.35], [0.65, 0.35]], deployed_model_id='6711621286084214784', explanations=None)

INFO:google.cloud.aiplatform.jobs:BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944 current state:
JobState.JOB_STATE_SUCCEEDED
INFO:google.cloud.aiplatform.jobs:BatchPredictionJob run completed. Resource name: projects/633325234048/locations/us-central1/batchPredictionJobs/8388483720826322944
