![](images/ml_flow_1.png)

## Pipeline Components

### Data Extraction and Processing

In [67]:
from kfp.dsl import (component, pipeline, Artifact, Input, Output, ClassificationMetrics)
from google_cloud_pipeline_components.v1.model import ModelUploadOp

#Data Processing
@component(packages_to_install=['google-cloud-bigquery[bqstorage,pandas]', 'scikit-learn'])
def preprocess(
    bq_dataset: str,
    x_train_out: Output[Artifact],
    y_train_out: Output[Artifact],
    x_test_out: Output[Artifact],
    y_test_out: Output[Artifact]
):
    """Load a table from BigQuery, engineer features and write train/test splits as CSV.

    The table must contain a ``target`` column holding the labels ``"bad"`` /
    ``"good"``; every other column is treated as a feature.

    Args:
        bq_dataset: Table identifier interpolated into the ``SELECT *`` query
            (e.g. ``project.dataset.table``).
        x_train_out: Output artifact that receives the training features (CSV).
        y_train_out: Output artifact that receives the training labels (CSV).
        x_test_out: Output artifact that receives the test features (CSV).
        y_test_out: Output artifact that receives the test labels (CSV).

    Raises:
        ValueError: If ``bq_dataset`` contains a backtick, or if the ``target``
            column holds a value other than ``"bad"`` or ``"good"``.
    """
    # Imports stay inside the function body so the component is self-contained.
    import random
    import numpy as np
    import pandas as pd
    from google.cloud import bigquery
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split

    ########################################################################
    # Loading DS from BigQuery

    # The table name is spliced into the SQL text (identifiers cannot be bound
    # as query parameters), so refuse anything that could close the quoting.
    if "`" in bq_dataset:
        raise ValueError(f"Invalid BigQuery table identifier: {bq_dataset!r}")

    client = bigquery.Client(project='jchavezar-demo')
    sql = f"""
        SELECT * 
        FROM `{bq_dataset}`
    """
    df = client.query(sql).to_dataframe()

    # Select the label by name and drop it by name, so features and labels
    # stay consistent regardless of where `target` sits in the table schema.
    y_raw = df["target"]  # labels (pandas Series)
    X_raw = df.drop(columns=["target"])  # features (pandas DataFrame)

    ########################################################################

    ########################################################################
    # Feature Engineering

    SEED = 123456
    np.random.seed(SEED)
    random.seed(SEED)

    cat_features = X_raw.select_dtypes(["object", "bool"]).columns
    num_features = X_raw.select_dtypes("float64").columns

    X_encoded = pd.get_dummies(X_raw, columns=cat_features, drop_first=True)
    print(f"Encoded feature matrix shape: {X_encoded.shape}")

    # Encode labels as integers and fail loudly on unexpected values instead
    # of letting NaN labels flow into the training step.
    label_map = {"bad": 0, "good": 1}
    y = y_raw.map(label_map)
    if y.isna().any():
        unexpected = sorted(y_raw[y.isna()].astype(str).unique().tolist())
        raise ValueError(
            f"Unexpected values in 'target' column: {unexpected}; "
            f"expected one of {sorted(label_map)}"
        )
    y = y.astype("int64")

    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded,
        y,
        test_size=0.25,
        random_state=SEED,
    )

    # Work on explicit copies so the column assignment below never writes into
    # a view of X_encoded (avoids pandas' SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # StandardScaler rejects an empty column selection, so only scale when
    # there is at least one float64 feature. The scaler is fitted on the
    # training split only and then applied to the test split.
    if len(num_features) > 0:
        scaler = StandardScaler()
        X_train[num_features] = scaler.fit_transform(X_train[num_features])
        X_test[num_features] = scaler.transform(X_test[num_features])
    ########################################################################

    ########################################################################
    # Storing DS' GCS

    X_train.to_csv(x_train_out.path, index=False)
    y_train.to_csv(y_train_out.path, index=False)
    X_test.to_csv(x_test_out.path, index=False)
    y_test.to_csv(y_test_out.path, index=False)

### Training

In [68]:
@component(
    base_image="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
    packages_to_install=['pandas', 'gcsfs', 'scikit-learn'])
def train(
    x_train_in: Input[Artifact],
    y_train_in: Input[Artifact],
    x_test_in: Input[Artifact],
    y_test_in: Input[Artifact],
    metrics: Output[ClassificationMetrics],
    model: Output[Artifact]
) -> str:
    """Fit a logistic regression on the CSV splits, export the model and a confusion matrix.

    Args:
        x_train_in: Artifact pointing at the training-features CSV.
        y_train_in: Artifact pointing at the single-column training-labels CSV.
        x_test_in: Artifact pointing at the test-features CSV.
        y_test_in: Artifact pointing at the single-column test-labels CSV.
        metrics: Output artifact that receives the test-set confusion matrix.
        model: Output artifact; the pickled classifier is written to
            ``<model.path>/model.pkl`` and the framework / container image are
            recorded in its metadata.

    Returns:
        The test accuracy, formatted as a string.
    """
    import pickle
    import pathlib
    import pandas as pd
    from sklearn.metrics import confusion_matrix
    from sklearn.linear_model import LogisticRegression

    x_train = pd.read_csv(x_train_in.path)
    X_test = pd.read_csv(x_test_in.path)
    # The label files hold a single column; take it as a 1-D Series so that
    # scikit-learn does not have to coerce a column vector.
    y_train = pd.read_csv(y_train_in.path).iloc[:, 0]
    y_test = pd.read_csv(y_test_in.path).iloc[:, 0]

    clf = LogisticRegression()
    clf.fit(x_train, y_train)
    acc_og = clf.score(X_test, y_test)
    print(f"Test accuracy of original logistic regression: {acc_og}")

    # Saving Model
    model.metadata["framework"] = "scikit-learn"
    # Same image URI as the component's base_image; it is repeated here because
    # the function body cannot reference names defined outside of it.
    model.metadata["containerSpec"] = {"imageUri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"}

    model_dir = pathlib.Path(model.path)
    # Tolerate a missing parent directory as well as a directory left over
    # from a retried execution.
    model_dir.mkdir(parents=True, exist_ok=True)
    with open(model_dir / "model.pkl", "wb") as file:
        pickle.dump(clf, file)

    # Metrics export (Confusion Matrix)
    y_test_pred = clf.predict(X_test)

    # Labels arrive encoded as 0 / 1. Pin the matrix ordering with `labels`
    # and list the category names in that same order (0 -> 'bad', 1 -> 'good')
    # so rows and columns are labelled correctly.
    metrics.log_confusion_matrix(
        ['bad', 'good'],
        confusion_matrix(y_test, y_test_pred, labels=[0, 1]).tolist()
    )

    return str(acc_og)

### Pipeline

In [69]:
# Decorate through the `dsl` module rather than the bare `pipeline` name: the
# function below is itself called `pipeline`, so after the first run the bare
# name no longer refers to the decorator and re-running this cell would fail.
from kfp import dsl


@dsl.pipeline(name='simple-testing')
def pipeline(bq_dataset: str):
    """Preprocess a BigQuery table, train a classifier and upload the model.

    Args:
        bq_dataset: Table identifier forwarded to the ``preprocess`` component.
    """
    preprocess_task = preprocess(bq_dataset=bq_dataset)
    train_task = train(
        x_train_in=preprocess_task.outputs['x_train_out'],
        y_train_in=preprocess_task.outputs['y_train_out'],
        x_test_in=preprocess_task.outputs['x_test_out'],
        y_test_in=preprocess_task.outputs['y_test_out'],
    )
    # Register the trained model artifact produced by the training step.
    ModelUploadOp(
        display_name='sklearn-pipe',
        project='jchavezar-demo',
        location='us-central1',
        unmanaged_container_model=train_task.outputs['model'],
    )

In [70]:
# Compile the pipeline function into a YAML template file
from kfp import compiler

compiler.Compiler().compile(package_path='simple_testing.yaml', pipeline_func=pipeline)

In [71]:
# Connect to the repository that stores the pipeline templates
from kfp.registry import RegistryClient

client = RegistryClient(host="https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo")

# Upload the compiled template; the returned version name is used later to
# launch a run from this exact version.
templateName, versionName = client.upload_pipeline(
    file_name="simple_testing.yaml",
    tags=["v1", "latest"],
    extra_headers={"description": "This is an example pipeline template."},
)

## Creating Pipelines from Templates

In [72]:
# Set up the Vertex AI SDK before creating pipeline runs from the template
from google.cloud import aiplatform

# Project, region and staging bucket used by the SDK calls that follow
aiplatform.init(project="jchavezar-demo", location='us-central1', staging_bucket="gs://vtx-staging")

In [73]:
# Launch a run from the template version returned by the upload step
job = aiplatform.PipelineJob(
    display_name="simple-sample-latest",
    template_path=f"https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo/simple-testing/{versionName}",
    parameter_values={"bq_dataset": "jchavezar-demo.vertex_datasets_public.credit-openml"},
)
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230323004721
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230323004721')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/simple-testing-20230323004721?project=569083142710


In [52]:
# Create a job via tag ("v1") instead of a version id.
job = aiplatform.PipelineJob(
    display_name="simple-sample-latest",
    template_path="https://us-central1-kfp.pkg.dev/jchavezar-demo/simple-samples-repo/simple-testing/v1",
    # The pipeline defined in this notebook takes a single parameter named
    # `bq_dataset` (a BigQuery table), so the run must pass it under that name;
    # swap in another table id to run against different data.
    parameter_values={"bq_dataset": "jchavezar-demo.vertex_datasets_public.credit-openml"},
)
job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230322170958
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/569083142710/locations/us-central1/pipelineJobs/simple-testing-20230322170958')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/simple-testing-20230322170958?project=569083142710


In [41]:
from sklearn.datasets import fetch_openml

# Get the credit data from OpenML. The version is pinned so the download is
# reproducible and does not depend on (or warn about) which versions of
# "credit-g" are currently active; as_frame=True guarantees pandas objects.
data = fetch_openml("credit-g", version=1, as_frame=True)
X_raw = data.data  # features (pandas DataFrame)
y_raw = data.target  # labels (pandas Series)

  " {version}.".format(name=name, version=res[0]["version"])


In [42]:
# Append the labels as a trailing `target` column. `assign` builds a new frame
# instead of writing into the one returned by fetch_openml, so re-running this
# cell (or reusing `data.data`) never sees a half-modified feature frame.
X_raw = X_raw.assign(target=y_raw)

In [45]:
# Write the frame to a local CSV without the index column
X_raw.to_csv(path_or_buf='dataset.csv', index=False)