![](../../images/ml_flow_1.png)

## Pipeline Components

### Create Code/Folder Structure and Set Environment

In [1]:
# Shared notebook settings.
PROJECT_ID = 'jchavezar-demo'
# Both images live in this project's Container Registry, so their URIs are
# derived from PROJECT_ID and stay in sync if the project changes.
TRAIN_IMAGE = f'gcr.io/{PROJECT_ID}/pytorch-custom-random-t:v2'
PREDICTION_IMAGE = f'gcr.io/{PROJECT_ID}/pytorch-custom-random-p:v2'
STAGING_BUCKET = 'gs://vtx-staging'

#### Training Code

In [None]:
!rm -fr training
!mkdir training

In [None]:
%%writefile training/train.py
#%%
import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

train = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv')
test = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/test.csv')
val = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/val.csv')

cat_col_names = [col for col in train.columns if 'cat' in col]
num_col_names = [col for col in train.columns if 'num' in col]

data_config = DataConfig(
    target=['target'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
)
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=100,
    accelerator="auto", # can be 'cpu','gpu', 'tpu', or 'ipu' 
)
optimizer_config = OptimizerConfig()


head_config = LinearHeadConfig(
    layers="", # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming"
).__dict__ # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

model_config = CategoryEmbeddingModelConfig(
    task="classification",
    layers="32-16", # Number of nodes in each layer
    activation="LeakyReLU", # Activation between each layers
    dropout=0.1,
    initialization="kaiming",
    head = "LinearHead", #Linear Head
    head_config = head_config, # Linear Head Config
    learning_rate = 1e-3
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

tabular_model.fit(train=train, validation=val)
tabular_model.save_model('/gcs/vtx-models/pytorch/tabular_random')

In [None]:
%%writefile training/Dockerfile
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

# Install dependencies before copying the source so that editing train.py does
# not invalidate the slow pip layer. --no-cache-dir keeps pip's download cache
# out of the image; the quotes stop the shell from globbing the extras spec.
RUN pip install --no-cache-dir "pytorch_tabular[extra]" gcsfs

# Copy the build context into the image's working directory.
COPY . .

ENTRYPOINT ["python", "train.py"]

In [None]:
!gcloud builds submit -t $TRAIN_IMAGE training/.

In [None]:
# NOTE: leftover attribute-lookup fragment. The trailing dot made this cell a
# SyntaxError, so it is kept only as a comment to let "Run All" proceed.
# model.upload_model.component.UnmanagedContainerModel.

#### Prediction Code

In [None]:
!rm -fr prediction
!mkdir prediction
!mkdir prediction/app

In [None]:
%%writefile prediction/app/main.py

#%%
import json
import os
import pandas as pd
from starlette.responses import JSONResponse
from fastapi import Request, FastAPI
from pytorch_tabular import TabularModel

app = FastAPI()
#columns = pd.read_csv('gs://vtx-datasets-public/pytorch_tabular/synthetic/train.csv', nrows=0).iloc[:,:-1].columns.to_list()
loaded_model = TabularModel.load_from_checkpoint("tabular_random")
#%%
@app.get('/health_check')
def health():
    return 200
if os.environ.get('AIP_PREDICT_ROUTE') is not None:
    method = os.environ['AIP_PREDICT_ROUTE']
else:
    method = '/predict'

@app.post(method)
async def predict(request: Request):
    print("----------------- PREDICTING -----------------")
    body = await request.json()
    instances = body["instances"]
    data_pred = pd.DataFrame.from_dict(instances)
    outputs = loaded_model.predict(data_pred)
    response = outputs['prediction'].tolist()
    print("----------------- OUTPUTS -----------------")
    return JSONResponse({
        "predictions": {"probability": response}
        })

In [None]:
%%writefile prediction/Dockerfile

FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

WORKDIR /app

# Install dependencies before copying the app so that code or model changes
# reuse the cached pip layer. --no-cache-dir keeps pip's download cache out of
# the image; the quotes stop the shell from globbing the extras spec.
RUN pip install --no-cache-dir "pytorch_tabular[extra]" uvicorn fastapi gcsfs

COPY app /app

# Documents the port uvicorn listens on.
EXPOSE 8080

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

In [None]:
# Copy the saved model next to main.py so it is baked into the serving image
# (main.py loads it from the relative path "tabular_random").
# NOTE(review): this needs the model to already exist in the bucket, i.e. the
# training job must have run before this cell - confirm the intended run order.
!gsutil cp -r gs://vtx-models/pytorch/tabular_random prediction/app/

In [None]:
!gcloud builds submit -t $PREDICTION_IMAGE prediction/.

### Data Extraction and Training Pipeline

In [65]:
from google_cloud_pipeline_components.v1 import custom_job, model
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.aiplatform import ModelUploadOp
from kfp.dsl import pipeline, importer
from kfp import compiler

## Worker pool spec for training: a single n1-standard-4 replica with one T4,
## running the custom training image.
worker_pool_specs = [
    dict(
        machine_spec=dict(
            machine_type="n1-standard-4",
            accelerator_type="NVIDIA_TESLA_T4",
            accelerator_count=1,
        ),
        replica_count=1,
        container_spec=dict(image_uri=TRAIN_IMAGE),
    )
]

@pipeline(name="pytorch-tabular-gpu")
def pipeline(
    project_id: str,
    display_name: str,
):
    # Documented with comments rather than a docstring so the compiled pipeline
    # spec is not affected (KFP may use a docstring as the pipeline description).
    #
    # Steps: run the custom training job, import the saved model location as an
    # unmanaged container model, then upload it as a model.

    # 1) Custom training job using the module-level worker pool spec.
    training_step = custom_job.CustomTrainingJobOp(
        project=project_id,
        display_name=f"{display_name}-train",
        worker_pool_specs=worker_pool_specs,
    )

    # 2) Point an UnmanagedContainerModel artifact at the saved model and
    #    describe the serving container that will host it.
    serving_container = {
        "imageUri": PREDICTION_IMAGE,
        "healthRoute": "/health_check",
        "ports": [{"containerPort": 8080}],
    }
    unmanaged_model = importer(
        artifact_uri="gs://vtx-models/pytorch/tabular_random",
        artifact_class=artifact_types.UnmanagedContainerModel,
        metadata={"containerSpec": serving_container},
    )
    # The importer has no data dependency on training, so order it explicitly.
    unmanaged_model.after(training_step)

    # 3) Upload the imported artifact as a model.
    model.ModelUploadOp(
        project=project_id,
        display_name=f"{display_name}-model",
        unmanaged_container_model=unmanaged_model.outputs["artifact"],
        #explanation_parameters=parameters,
        #explanation_metadata=EXPLANATION_METADATA,
    )

In [66]:
# Compile the pipeline function into a YAML file in the working directory.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="pytorch-tabular-gpu.yaml")

## [OPTIONAL] Upload the Compiled Pipeline as a Template

In [67]:
## Create Template:
from kfp.registry import RegistryClient

# The registry host is built from PROJECT_ID instead of repeating the project literal.
client = RegistryClient(host=f"https://us-central1-kfp.pkg.dev/{PROJECT_ID}/pipe-repo")

## Upload Template
# Uploads the compiled YAML and tags this version as both "v5" and "latest".
templateName, versionName = client.upload_pipeline(
  file_name="pytorch-tabular-gpu.yaml",
  tags=["v5", "latest"],
  extra_headers={"description":"This is an example pipeline template."})

## Creating Pipelines from Templates

In [68]:
from google.cloud import aiplatform

# Initialize the aiplatform package from the shared settings defined in the
# first cell, rather than repeating the project and bucket literals.
aiplatform.init(
    project=PROJECT_ID,
    location='us-central1',
    staging_bucket=STAGING_BUCKET)

In [69]:
# Create a job from the template version tagged "v5".
# NOTE(review): the FileNotFoundError recorded for this cell suggests the
# installed SDK treated the URL as a local file path; presumably a newer
# google-cloud-aiplatform release is needed for registry templates - confirm.
job = aiplatform.PipelineJob(
    display_name="pytorch-tabular-run",
    template_path=f"https://us-central1-kfp.pkg.dev/{PROJECT_ID}/pipe-repo/pytorch-tabular-gpu/v5",
    parameter_values={
        "project_id": PROJECT_ID,
        "display_name": "pytorch-tab-pipe"}
)
job.submit()

FileNotFoundError: [Errno 2] No such file or directory: 'https://us-central1-kfp.pkg.dev/jchavezar-demo/pipe-repo/pytorch-tabular-gpu/v5'

In [38]:
# Create a job from a template version pinned by its sha256 digest, and record
# the run under an experiment.
job = aiplatform.PipelineJob(
    display_name="pytorch-tabular-latest",
    template_path="https://us-central1-kfp.pkg.dev/jchavezar-demo/pipe-repo/pytorch-tabular/sha256:9782bda73edd2ec2c4a6e471d4405cab0ac7a768bdd2cf320b66d00aecfa7591",
    parameter_values=dict(project_id=PROJECT_ID, display_name="pytorch-tabular"),
)
job.submit(experiment="pytorch-tabular-pipe")

FileNotFoundError: [Errno 2] No such file or directory: 'https://us-central1-kfp.pkg.dev/jchavezar-demo/pipe-repo/pytorch-tabular/sha256:9782bda73edd2ec2c4a6e471d4405cab0ac7a768bdd2cf320b66d00aecfa7591'

In [None]:
# Create a job from the "v1" tag of a template in a different repository,
# passing the dataset location as its only parameter.
job = aiplatform.PipelineJob(
    display_name="simple-sample-latest",
    template_path="https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo/simple-testing/v1",
    parameter_values=dict(dataset="gs://vtx-datasets-public/pytorch_tabular/synthetic/test.csv"),
)
job.submit()

In [None]:
from sklearn.datasets import fetch_openml

# Request pandas objects explicitly: the following cells add a column and call
# .to_csv(), which only work when the features come back as a DataFrame.
data = fetch_openml("credit-g", as_frame=True)  # get the credit data from OpenML
X_raw = data.data  # features (pandas DataFrame)
y_raw = data.target  # labels (pandas Series)

In [None]:
# Attach the labels to the feature frame as a "target" column (modifies X_raw in place).
X_raw["target"] = y_raw

In [None]:
# Write the frame to a local CSV without the index column.
X_raw.to_csv("dataset.csv", index=False)