In [75]:
!pwd

/home/jupyter/vertex-ai-samples/notebooks


E0422 19:03:55.498913995   26965 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


# Sklearn with Pandas

This is similar to the other notebook except we will be using pandas and bigquery
Topics covered
* Training sklearn locally, deploying to endpoint
* Saving data as CSV and doing batch predict from GCS
* Loading data to BQ, using BQ magics
* Running a batch prediction from BQ to BQ

In [16]:
PROJECT_ID = 'YOUR-PROJECT' #SET THIS TO YOUR PROJECT ID
BUCKET = "gs://YOUR-BUCKET" #BE SURE TO gsutil mb -l <REGION> <LOG_BUCKET> to create the bucket on GCP

In [3]:
# generate synthetic data
import pandas as pd
import numpy as np #for the random integer example
df = pd.DataFrame(np.random.randint(0.0,100.0,size=(10,4)),
              index=range(10,20),
              columns=['col1','col2','col3','label'],
              dtype='float64')

In [4]:
df

Unnamed: 0,col1,col2,col3,label
10,62.0,35.0,3.0,79.0
11,9.0,14.0,56.0,52.0
12,46.0,80.0,68.0,95.0
13,15.0,92.0,9.0,7.0
14,67.0,64.0,38.0,17.0
15,79.0,68.0,73.0,99.0
16,47.0,61.0,50.0,37.0
17,59.0,35.0,57.0,38.0
18,4.0,35.0,63.0,29.0
19,56.0,96.0,62.0,46.0


In [6]:
from sklearn.ensemble import RandomForestRegressor

# Set the model parameters. 
n_estimators = 100
max_depth = 6
max_features = 3

rf = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth, max_features = max_features)
rf.fit(df[['col1', 'col2', 'col3']], df['label'])

RandomForestRegressor(max_depth=6, max_features=3)

In [13]:
import os
import pickle

artifact_filename = 'pandas_model_test.pkl'

# Save model artifact to local filesystem (doesn't persist)
local_path = artifact_filename
with open(local_path, 'wb') as model_file:
    pickle.dump(rf, model_file)

## Upload the model to Vertex

In [15]:
from google.cloud import aiplatform

model = aiplatform.Model.upload_scikit_learn_model_file(
        display_name='pandas test',
        model_file_path=local_path,
        description='pandas test for deploying models to vertex',
        sync=False, #this will not bind up your notebook instance with the creation operation
    ) #note this will automatcially designate the latest sklearn serving container

Creating Model
Create Model backing LRO: projects/633325234048/locations/us-central1/models/2404029397574090752/operations/769929237778923520
Model created. Resource name: projects/633325234048/locations/us-central1/models/2404029397574090752
To use this Model in another session:
model = aiplatform.Model('projects/633325234048/locations/us-central1/models/2404029397574090752')


### Now we will create a different dataframe to make predictions on for batch predictions

In [25]:
df2 = pd.DataFrame(np.random.randint(0.0,100.0,size=(10,3)), # we will do batch predictions based on this
              index=range(10,20),
              columns=['col1','col2','col3'],
              dtype='float64')
rf.predict(df2[['col1','col2','col3']])

array([44.75      , 80.7       , 82.95      , 82.03      , 88.81      ,
       35.02833333, 82.36      , 89.93      , 50.6       , 44.        ])

### Exepected output
From documentation:
```
    "input1","input2","input3"
    0.1,1.2,3.0
    4.0,5.0,6.0
```

In [38]:
from google.cloud import storage
import csv

# save the csv with the header, no index
df2.to_csv('df2.csv', index=False)#, quoting=csv.QUOTE_ALL) #quotenonumeric to get header quotes

data_directory = BUCKET + "/data"
storage_path = os.path.join(data_directory, 'df2.csv')
blob = storage.blob.Blob.from_string(storage_path, client=storage.Client())
blob.upload_from_filename("df2.csv")

In [39]:
batch_prediction_job = model.batch_predict(
        job_display_name='pandas batch predict job sklearn',
        gcs_source=storage_path,
        gcs_destination_prefix=BUCKET+"/predictions",
        machine_type='n1-standard-2',
        instances_format='csv', #This is key to parsing CSV input
        # accelerator_count=accelerator_count,
        # accelerator_type=accelerator_type, #if you want gpus
        starting_replica_count=1,
        max_replica_count=2,
        sync=False,
    )

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/1196121866217979904?project=633325234048
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/

In [40]:
!pip install pandas_gbq --user

Collecting pandas_gbq
  Downloading pandas_gbq-0.17.4-py2.py3-none-any.whl (25 kB)
Collecting pydata-google-auth
  Downloading pydata_google_auth-1.4.0-py2.py3-none-any.whl (14 kB)
Collecting db-dtypes<2.0.0,>=0.3.1
  Downloading db_dtypes-1.0.0-py2.py3-none-any.whl (14 kB)
Installing collected packages: db-dtypes, pydata-google-auth, pandas_gbq
Successfully installed db-dtypes-1.0.0 pandas_gbq-0.17.4 pydata-google-auth-1.4.0


## Create an empty dataset to house the tables

In [43]:
!bq --location=location mk \
--dataset \
--description "test dataset" \
--location "US" \
$PROJECT_ID:TEST

Dataset 'wortz-project:TEST' successfully created.


In [48]:
# Load the table to BQ and make Batch predictions
from pandas_gbq import to_gbq

df2.to_gbq(destination_table=f"{PROJECT_ID}.TEST.df2", project_id=PROJECT_ID)


100%|██████████| 1/1 [00:00<00:00, 1704.31it/s]


## Bigquery magic comes available by default

In [49]:
%%bigquery
select * from TEST.df2

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 579.56query/s]                          
Downloading: 100%|██████████| 10/10 [00:01<00:00,  7.76rows/s]


Unnamed: 0,col1,col2,col3
0,20.0,12.0,62.0
1,14.0,18.0,79.0
2,56.0,19.0,81.0
3,58.0,91.0,74.0
4,77.0,49.0,99.0
5,50.0,26.0,38.0
6,49.0,23.0,69.0
7,78.0,23.0,96.0
8,98.0,36.0,15.0
9,79.0,82.0,33.0


BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/1196121866217979904 current state:
JobState.JOB_STATE_RUNNING


## Now run batch predicitons on this bq table

Note you have to have write permissions on the dataset - you may see a error if you don't

In [63]:
?model.batch_predict # Use in-notebook help for help with fields for these

[0;31mSignature:[0m
[0mmodel[0m[0;34m.[0m[0mbatch_predict[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mjob_display_name[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgcs_source[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbigquery_source[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstances_format[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'jsonl'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgcs_destination_prefix[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0

In [73]:
batch_prediction_job = model.batch_predict(
        job_display_name='bigquery batch predict job sklearn',
        bigquery_source=f"bq://{PROJECT_ID}.TEST.df2",
        bigquery_destination_prefix=f'bq://{PROJECT_ID}', #this will create a seperate dataset with predictions
        machine_type='n1-standard-2',
        # accelerator_count=accelerator_count,
        # accelerator_type=accelerator_type, #if you want gpus
        starting_replica_count=1,
        max_replica_count=2,
        sync=True,
    ) 

# Output table will look something like this:  wortz-project.prediction_pandas_test_2022_04_22T11_32_14_834Z.predictions_2022_04_22T11_32_14_834Z 

Creating BatchPredictionJob
BatchPredictionJob created. Resource name: projects/633325234048/locations/us-central1/batchPredictionJobs/5876487778962767872
To use this BatchPredictionJob in another session:
bpj = aiplatform.BatchPredictionJob('projects/633325234048/locations/us-central1/batchPredictionJobs/5876487778962767872')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/5876487778962767872?project=633325234048
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/5876487778962767872 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/5876487778962767872 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/5876487778962767872 current state:
JobState.JOB_STATE_RUNNING
BatchPredictionJob projects/633325234048/locations/us-central1/batchPredictionJobs/

# Other topics to consider
* Batch training
* Pipeline orchastration