In [1]:
# !pip install google-cloud-aiplatform[prediction]@git+https://github.com/googleapis/python-aiplatform.git@custom-prediction-routine

# Sklearn with Pandas - Custom Prediction Routine to get `.predict_proba()`

This is similar to [the other notebook](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage6/get_started_with_cpr.ipynb) except we will be using pandas and bigquery

Topics covered
* Training sklearn locally, deploying to endpoint
* Saving data as CSV and doing batch predict from GCS
* Loading data to BQ, using BQ magics
* Running a batch prediction from BQ to BQ

In [2]:
# !gsutil mb -l us-central1 gs://wortz-project-bucket

In [3]:
from datetime import datetime

PROJECT_ID = 'wortz-project-352116' #SET THIS TO YOUR PROJECT ID
PREFIX = 'pre-processor-example'
BUCKET = f"gs://wortz-project-bucket/model-artifacts/{PREFIX}" #BE SURE TO gsutil mb -l <REGION> <LOG_BUCKET> to create the bucket on GCP
REGION = "us-central1"

In [4]:
# generate synthetic data
import pandas as pd
import numpy as np #for the random integer example

# set seed

np.random.seed(1234)

x = np.random.randint(0.0,100.0,size=(10,3))
y = np.random.binomial(1, .25, size=(10,1))
df = pd.DataFrame(np.append(x, y, axis=1),
              index=range(10,20),
              columns=['col1','col2','col3','label'],
              dtype='float64')

In [5]:
df

Unnamed: 0,col1,col2,col3,label
10,47.0,83.0,38.0,0.0
11,53.0,76.0,24.0,1.0
12,15.0,49.0,23.0,0.0
13,26.0,30.0,43.0,0.0
14,30.0,26.0,58.0,1.0
15,92.0,69.0,80.0,0.0
16,73.0,47.0,50.0,0.0
17,76.0,37.0,34.0,1.0
18,38.0,67.0,11.0,0.0
19,0.0,75.0,80.0,1.0


# New section - preprocessor creation.

In this section we will create a pipeline object that stores a standard scaler 
using the `PipeLine` class is important as it provides a lot of flexibility and conforms to sklearn's framework

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Set the model parameters. 
n_estimators = 100
max_depth = 6
max_features = 3

rf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)


pipe = Pipeline([
        ('scale', StandardScaler()),
        ('clf', rf)])

fitted_pipe = pipe.fit(df[['col1', 'col2', 'col3']], df['label'])

## At this point you have a single artifact. We implement preprocessing in a couple of ways:
1) Store the entire pipeline - may work with prior example (worth testing/considering)
2) Break out preprocessing, store artifacts and leverage in preprocessor for cpr

In [7]:
import os
import pickle
import joblib

model_artifact_filename = 'model.joblib' #has to be joblib to work with CPR

# Save model artifact to local filesystem (doesn't persist)

joblib.dump(fitted_pipe.named_steps['clf'], model_artifact_filename)

scaler_artifact_filename = 'scaler_preproc.joblib' #has to be joblib to work with CPR

# Save model artifact to local filesystem (doesn't persist)

joblib.dump(fitted_pipe.named_steps['scale'], scaler_artifact_filename)

['scaler_preproc.joblib']

#### Upload the model pipeline to gcs

In [8]:
! gsutil cp $model_artifact_filename $BUCKET/model/
! gsutil cp $scaler_artifact_filename $BUCKET/scaler/

Copying file://model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 78.3 KiB/ 78.3 KiB]                                                
Operation completed over 1 objects/78.3 KiB.                                     
Copying file://scaler_preproc.joblib [Content-Type=application/octet-stream]...
/ [1 files][  997.0 B/  997.0 B]                                                
Operation completed over 1 objects/997.0 B.                                      


## Create a generic sklearn container that returns `predict_proba`

https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/ml_ops/stage6/get_started_with_cpr.ipynb

**highly recommend reviewing this notebook first as it breaks down the custom predictor interface**

In [9]:
! rm -rf container_code
! mkdir container_code

In [10]:
%%writefile container_code/requirements.txt
# fastapi
# uvicorn
joblib~=1.0
numpy>=1.20
scikit-learn~=1.0
google-cloud-storage>=1.5.0,<2.0.0dev
google-cloud-aiplatform[prediction] @ git+https://github.com/googleapis/python-aiplatform.git@custom-prediction-routine

Writing container_code/requirements.txt


In [11]:
%%writefile container_code/predictor.py
import joblib
import numpy as np
import pickle

from google.cloud import storage
from google.cloud.aiplatform.prediction.sklearn.predictor import SklearnPredictor
import json

class CprPredictor(SklearnPredictor):
    
    def __init__(self):
        return
    
    def load(self, gcs_artifacts_uri: str):
        """Loads the preprocessor and model artifacts."""
        # super().load(gcs_artifacts_uri)   
        gcs_client = storage.Client()
        with open("model.joblib", 'wb') as gcs_model:
            gcs_client.download_blob_to_file(
                gcs_artifacts_uri + "/model/model.joblib", gcs_model
            )

        with open("model.joblib", "rb") as f:
            self._model = joblib.load("model.joblib")
        
        #load the scaler
        with open("scaler_preproc.joblib", 'wb') as gcs_scaler:
            gcs_client.download_blob_to_file(
                gcs_artifacts_uri + "/scaler/scaler_preproc.joblib", gcs_scaler
            )

        with open("scaler_preproc.joblib", "rb") as f:
            scaler_obj = joblib.load("scaler_preproc.joblib") #load the scaler object
        
        self._preprocessor = scaler_obj #call transform as it's already fitted
            
    def preprocess(self, prediction_input: dict):
        """Perform scaling preprocessing"""
        inputs = super().preprocess(prediction_input) #we are using instances format here as we haven't changed the prediction handler (ie data looks the same here as inputs for predict
        return self._preprocessor.transform(inputs)

    
    def predict(self, instances):
        outputs = self._model.predict_proba(instances) 
        return outputs

Writing container_code/predictor.py


### Build and push container to Artifact Registry
#### Build your container
To build a custom container, we also need to write an entrypoint of the image that starts the model server. However, with the Custom Prediction Routine feature, you don't need to write the entrypoint anymore. Vertex AI SDK will populate the entrypoint with the custom predictor you provide.

In [12]:
import os

from google.cloud.aiplatform.prediction import LocalModel
from container_code.predictor import CprPredictor

REPOSITORY = "custom-preprocess-container-prediction"  # @param {type:"string"}
SERVER_IMAGE = "sklearn-cpr-preprocess-server"  # @param {type:"string"} 

local_model = LocalModel.create_cpr_model(
    "container_code",
    f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{SERVER_IMAGE}",
    predictor=CprPredictor,
    requirements_path="container_code/requirements.txt"
)

### Test it out with a locally deployed endpoint
Need to generate credentials to test

In [13]:
local_model.get_serving_container_spec()

image_uri: "us-central1-docker.pkg.dev/wortz-project-352116/custom-preprocess-container-prediction/sklearn-cpr-preprocess-server"
predict_route: "/predict"
health_route: "/health"

#### Only run once to generate creds

### Create a repository to house your artifacts / images

In [14]:
!gcloud beta artifacts repositories create $REPOSITORY \
    --repository-format=docker \
    --location=$REGION

[1;31mERROR:[0m (gcloud.beta.artifacts.repositories.create) ALREADY_EXISTS: the repository already exists


In [15]:
! gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet


{
  "credHelpers": {
    "gcr.io": "gcloud",
    "us.gcr.io": "gcloud",
    "eu.gcr.io": "gcloud",
    "asia.gcr.io": "gcloud",
    "staging-k8s.gcr.io": "gcloud",
    "marketplace.gcr.io": "gcloud",
    "us-central1-docker.pkg.dev": "gcloud"
  }
}
Adding credentials for: us-central1-docker.pkg.dev
gcloud credential helpers already registered correctly.


## Upload the model to Vertex using new Prediction Route Serving Container

In [16]:
local_model.push_image() #push to container registry

In [17]:
from google.cloud import aiplatform

model = local_model.upload(
        display_name='pandas test CLASSIFICATION',
        artifact_uri=BUCKET,
        description='preprocessor example - pandas test for deploying models to vertex CLASSIFICATION',
        labels= {'version': 'v1_1_'}, 
              
        sync=True, #false will not bind up your notebook instance with the creation operation
    ) 
# model = aiplatform.Model('projects/679926387543/locations/us-central1/models/5966834099661307904')

Creating Model
Create Model backing LRO: projects/679926387543/locations/us-central1/models/8299698706639224832/operations/6891626251677597696
Model created. Resource name: projects/679926387543/locations/us-central1/models/8299698706639224832
To use this Model in another session:
model = aiplatform.Model('projects/679926387543/locations/us-central1/models/8299698706639224832')


In [18]:
endpoint = model.deploy(machine_type="n1-standard-4")
# endpoint = aiplatform.Endpoint('projects/679926387543/locations/us-central1/endpoints/8555880517864521728')

Creating Endpoint
Create Endpoint backing LRO: projects/679926387543/locations/us-central1/endpoints/7051678242322776064/operations/1548668243755925504
Endpoint created. Resource name: projects/679926387543/locations/us-central1/endpoints/7051678242322776064
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/679926387543/locations/us-central1/endpoints/7051678242322776064')
Deploying model to Endpoint : projects/679926387543/locations/us-central1/endpoints/7051678242322776064
Deploy Endpoint model backing LRO: projects/679926387543/locations/us-central1/endpoints/7051678242322776064/operations/5585582359740153856
Endpoint model deployed. Resource name: projects/679926387543/locations/us-central1/endpoints/7051678242322776064


In [19]:
endpoint.predict(instances=[[47.7, 83.1, 38.7], [53.6, 76.1, 24.]])

Prediction(predictions=[[0.79, 0.21], [0.24, 0.76]], deployed_model_id='2882294965424095232', explanations=None)

# You should be able to see the logging ops by searching for `aiplatform.googleapis.com`
+ Make sure you click `show query` slider in case there are other limitations
![](images/log_example.png)

In [20]:
df2 = pd.DataFrame(np.random.randint(0.0,100.0,size=(10,3)), # we will do batch predictions based on this
              index=range(10,20),
              columns=['col1','col2','col3'],
              dtype='float64')

instances_formatted_data = df2.to_numpy().tolist()

predict_response = model.predict(
        request_file=instances_formatted_data,
        headers={"Content-Type": "application/json"},
    )

AttributeError: 'Model' object has no attribute 'predict'

### Expected output
From documentation:
```
array([[0.8 , 0.2 ],
       [0.38, 0.62],
       [0.61, 0.39],
       [0.65, 0.35],
       [0.56, 0.44],
       [0.63, 0.37],
       [0.55, 0.45],
       [0.43, 0.57],
       [0.43, 0.57],
       [0.38, 0.62]])
```

In [None]:
from google.cloud import storage
import csv

# save the csv with the header, no index
df2.to_csv('df2.csv', index=False)

data_directory = BUCKET + "/data"
storage_path = os.path.join(data_directory, 'df2.csv')
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
blob.upload_from_filename("df2.csv")

In [None]:
batch_prediction_job = model.batch_predict(
        job_display_name='pandas batch predict job sklearn - VALUES JSON',
        gcs_source=storage_path,
        gcs_destination_prefix=BUCKET+"/predictions",
        machine_type='n1-standard-2',
        instances_format='csv', #This is key to parsing CSV input
        # accelerator_count=accelerator_count,
        # accelerator_type=accelerator_type, #if you want gpus
        starting_replica_count=1,
        max_replica_count=2,
        sync=False,
    )

### When successful you should see this
```
{"instance": [16.0, 64.0, 61.0], "prediction": [0.63, 0.37]}
{"instance": [83.0, 27.0, 87.0], "prediction": [0.35, 0.65]}
{"instance": [96.0, 83.0, 57.0], "prediction": [0.68, 0.32]}
{"instance": [11.0, 62.0, 17.0], "prediction": [0.89, 0.11]}
{"instance": [61.0, 28.0, 1.0], "prediction": [0.36, 0.64]}
```