![](../../images/ml_flow_1.png)

## Pipelines Components

### Create Code/Folder Structure and Set Environment

In [1]:
PROJECT_ID = 'jchavezar-demo'
TRAIN_IMAGE = 'gcr.io/jchavezar-demo/pytorch-custom-random-t:v2'
PREDICTION_IMAGE = 'gcr.io/jchavezar-demo/pytorch-custom-random-p:v2'
STAGING_BUCKET = 'gs://vtx-staging'

#### Training Code

In [2]:
!rm -fr training
!mkdir training

In [3]:
%%writefile training/train.py
#%%
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

train = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv')
test = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/test.csv')
val = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/val.csv')

cat_col_names = [col for col in train.columns if 'cat' in col]
num_col_names = [col for col in train.columns if 'num' in col]

data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    accelerator="auto", # can be 'cpu','gpu', 'tpu', or 'ipu' 
)
optimizer_config = OptimizerConfig()


head_config = LinearHeadConfig(
    layers="", # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming"
).__dict__ # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="32-16", # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    dropout=0.1,
    initialization="kaiming",
    head = "LinearHead", #Linear Head
    head_config = head_config, # Linear Head Config
    learning_rate = 1e-3
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

tabular_model.fit(train=train, validation=val)
tabular_model.save_model('/gcs/vtx-models/pytorch/tabular_random')

Writing training/train.py


In [4]:
%%writefile training/Dockerfile
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

COPY . .
RUN pip install pytorch_tabular[extra]
RUN pip install gcsfs

ENTRYPOINT ["python", "train.py"]

Writing training/Dockerfile


In [None]:
!gcloud builds submit -t $TRAIN_IMAGE training/.

In [57]:
model.upload_model.component.UnmanagedContainerModel.

Help on package google_cloud_pipeline_components.v1.model.upload_model in google_cloud_pipeline_components.v1.model:

NAME
    google_cloud_pipeline_components.v1.model.upload_model - Google Cloud Pipeline V2 Model Upload Component.

PACKAGE CONTENTS
    component

FILE
    /opt/conda/lib/python3.7/site-packages/google_cloud_pipeline_components/v1/model/upload_model/__init__.py




#### Prediction Code

In [59]:
!rm -fr prediction
!mkdir prediction
!mkdir prediction/app

In [60]:
%%writefile prediction/app/main.py

#%%
import json
import os
import pandas as pd
from starlette.responses import JSONResponse
from fastapi import Request, FastAPI
from pytorch_tabular import TabularModel

app = FastAPI()
#columns = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv', nrows=0).iloc[:,:-1].columns.to_list()
loaded_model = TabularModel.load_from_checkpoint("tabular_random")
#%%
@app.get('/health_check')
def health():
    return 200
if os.environ.get('AIP_PREDICT_ROUTE') is not None:
    method = os.environ['AIP_PREDICT_ROUTE']
else:
    method = '/predict'

@app.post(method)
async def predict(request: Request):
    print("----------------- PREDICTING -----------------")
    body = await request.json()
    instances = body["instances"]
    data_pred = pd.DataFrame.from_dict(instances)
    outputs = loaded_model.predict(data_pred)
    response = outputs['prediction'].tolist()
    print("----------------- OUTPUTS -----------------")
    return JSONResponse({
        "predictions": {"probability": response}
        })

Writing prediction/app/main.py


In [61]:
%%writefile prediction/Dockerfile

FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

COPY app /app
WORKDIR /app

RUN pip install pytorch_tabular[extra]
RUN pip install uvicorn fastapi
RUN pip install gcsfs

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

Writing prediction/Dockerfile


In [None]:
!gsutil cp -r gs://vtx-models/pytorch/tabular_random prediction/app/

In [None]:
!gcloud builds submit -t $PREDICTION_IMAGE prediction/.

In [65]:
!pip list | grep kfp

kfp                                   2.0.0b13
kfp-pipeline-spec                     0.2.0
kfp-server-api                        2.0.0a6


In [None]:
!pip list | grep component

google-cloud-pipeline-components      2.0.0b1


### Data Extraction and Training Pipeline

In [64]:
from google_cloud_pipeline_components.v1 import custom_job, model
from google_cloud_pipeline_components.types import artifact_types
google_cloud_pipeline_components.v1.model
from google_cloud_pipeline_components.aiplatform import ModelUploadOp
from kfp.dsl import pipeline, importer

## Worker pool spec for training
worker_pool_specs = [
        {
            "machine_spec": {
                "machine_type": "n1-standard-4",
                "accelerator_type": "NVIDIA_TESLA_T4",
                "accelerator_count": 1,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": TRAIN_IMAGE,
            },
        }
    ]

## Prediction image definition
importer_spec = importer(
    artifact_uri= "gs://vtx-models/pytorch/tabular_random",
    artifact_class=model.upload_model.component.UnmanagedContainerModel, 
    metadata={
        "container_spec": {
            "image_uri": PREDICTION_IMAGE,
            "health_route": "/health_check",
            "ports": [8080]
        }
    }
)

@pipeline(name="pytorch-tabular-gpu")
def pipeline(
    project_id: str,
    display_name: str,
):
    train_task = custom_job.CustomTrainingJobOp(
        display_name=f"{display_name}-train",
        project=project_id,
        worker_pool_specs=worker_pool_specs
    )
    upload_task = model.ModelUploadOp(
        display_name=f"{display_name}-model",
        project=project_id,
        unmanaged_container_model=importer_spec,
        #explanation_parameters=parameters,
        #explanation_metadata=EXPLANATION_METADATA,
    ).after(train_task)

ValueError: Constant argument inputs must be one of type ['String', 'Integer', 'Float', 'Boolean', 'List', 'Dict'] Got: <kfp.components.pipeline_task.PipelineTask object at 0x7f63091730d0> of type <class 'kfp.components.pipeline_task.PipelineTask'>.

In [24]:
# Compile File
from kfp.v2 import compiler

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='pytorch-tabular-pipe.yaml')



NameError: name 'model' is not defined

## [OPTIONAL]

In [None]:
## Create Template:
from kfp.registry import RegistryClient

client = RegistryClient(host=f"https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo")

## Upload Template

templateName, versionName = client.upload_pipeline(
  file_name="pytorch-tabular-pipe.yaml",
  tags=["v1", "latest"],
  extra_headers={"description":"This is an example pipeline template."})

## Creating Pipelines from Templates

In [8]:
from google.cloud import aiplatform

# Initialize the aiplatform package
aiplatform.init(
    project="jchavezar-demo",
    location='us-central1',
    staging_bucket="gs://vtx-staging")

In [9]:
# Create a job via version id.
job = aiplatform.PipelineJob(
    display_name="pytorch-tabular-latest",
    template_path="pytorch-tabular-pipe.json",
    parameter_values={
        "project_id": PROJECT_ID
    },
)
job.submit(experiment='pytorch-tabular-pipe')

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/pytorch-tabular-gpu-20230327140940
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/pytorch-tabular-gpu-20230327140940')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pytorch-tabular-gpu-20230327140940?project=569083142710


In [52]:
# Create a job via tag and with different 
job = aiplatform.PipelineJob(
    display_name="simple-sample-latest",
    template_path="https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo/simple-testing/v1",
    parameter_values={"dataset": "gs://vtx-datasets-public/pytorch_tabular/synthetic/test.csv"}
)
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230322170958
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230322170958')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/simple-testing-20230322170958?project=569083142710


In [41]:
from sklearn.datasets import fetch_openml

data = fetch_openml("credit-g")  # get the credit data from OpenML
X_raw = data.data  # features (pandas DataFrame)
y_raw = data.target  # labels (pandas Series)

  " {version}.".format(name=name, version=res[0]["version"])


In [42]:
X_raw['target']=y_raw

In [45]:
X_raw.to_csv('dataset.csv', index=False)