In [3]:
from kfp.dsl import component, Input, Output, Dataset, Metrics, ClassificationMetrics

In [35]:
@component(
    base_image="python:3.10",
    packages_to_install=["pandas", "google-cloud-bigquery", "google-cloud-storage", "db-dtypes", "scikit-learn", "catboost"]
)
def training(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        bigquery_table_id: str,
        metrics: Output[Metrics],
        confussion_metrics: Output[ClassificationMetrics],
):
    """

    :param confussion_metrics:
    :param metrics:
    :param bucket_id:
    :param project_id:
    :param bucket_folder:
    :param bigquery_table_id:
    """
    import pickle
    import logging
    import pandas as pd
    from datetime import datetime
    from catboost import CatBoostClassifier
    from google.cloud import bigquery, storage
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

    logging.warning("----------")
    logging.warning("MODEL CREATION STAGE")

    logging.warning("Reading Final Dataset...")

    bigquery_client = bigquery.Client(project=project_id)
    bucket = storage.Client(project=project_id).bucket(bucket_id)

    sql = f"""
    SELECT * FROM `{bigquery_table_id}`
    """

    dataMat = bigquery_client.query_and_wait(sql).to_dataframe()
    data = dataMat.to_numpy()

    logging.warning("Read Final Dataset")

    logging.warning("Checking Categorical Features...")

    cat_feat = [i for i in dataMat.columns if dataMat[i].dtypes == 'O']

    logging.warning("Checking Missing Values...")

    a = dict(dataMat.isnull().sum())
    b = [[i, a[i]] for i in a.keys()]
    missing = pd.DataFrame(b, columns=['features', 'null_values_count'])

    logging.warning("Storing Missing Values...")

    missing.to_csv("missing_values.csv", index=False)

    logging.warning("Storing Missing Values Done")

    logging.warning("Encoding Categorical Features...")

    label_encoders = {}
    label_mappings = {}

    encoder = LabelEncoder()
    for i in cat_feat:
        encoder.fit(dataMat[i])
        dataMat[i] = encoder.transform(dataMat[i])

        label_encoders[i] = encoder
        label_mappings[i] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

    blob = bucket.blob(f"{bucket_folder}/label_encoder.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(label_mappings, f)

    logging.warning("Features Encoding Done")

    logging.warning("Creating X and y variables ...")

    X = dataMat.iloc[:, :-1]
    y = dataMat['isFraud']

    logging.warning(f"Shape of X: {X.shape} and Shape of y: {y.shape}")

    logging.warning("Splitting Dataset...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logging.warning("Instantiating Model...")

    model = CatBoostClassifier(random_state=42, class_weights={0:1, 1:12}, silent=True)

    logging.warning("Fitting Model...")

    model.fit(X_train, y_train)
    y_pred_cat = model.predict(X_test)

    logging.warning("Saving Model...")

    #model_path = "model.pkl"
    blob = bucket.blob(f"{bucket_folder}/model.pkl")
    # pickle.dump(model, open(blob, 'wb'))

    with blob.open("wb") as f:
        pickle.dump(model, f)

    f.close()

    logging.warning("Saving Model Metrics...")

    model_metric = {
        "time_stamp": datetime.now().strftime("%d-%m-%Y_%H:%M:%S"),
        "confusion_matrix": confusion_matrix(y_test, y_pred_cat).tolist(),
        "precision": precision_score(y_test, y_pred_cat),
        "recall": recall_score(y_test, y_pred_cat),
        "f1_score": f1_score(y_test, y_pred_cat)
    }

    confussion_metrics.log_confusion_matrix(
        ["laundering", "no_laundering"],
        confusion_matrix(y_test, y_pred_cat).tolist())

    metrics.log_metric("f1_score", f1_score(y_test, y_pred_cat))

    # data['model_metric'].append(model_metric)
    # with open(metric_file_path, "w") as f:
    #     json.dump(data, f, indent=4)

    logging.warning("Model Metrics Stored")

## Prediction Container

In [10]:
%%writefile Dockerfile_inference
FROM python:3.10-slim

WORKDIR /app
COPY inference/ .

RUN pip install --upgrade pip && \
    pip install catboost && \
    pip install google-cloud-storage && \
    pip install pandas && \
    pip install fastapi==0.75.0 uvicorn

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

Overwriting Dockerfile_inference


In [11]:
%%writefile inference/main.py

import os
import pickle
import pandas as pd
from google.cloud import storage
from fastapi import Request, FastAPI

app = FastAPI()

AIP_PROJECT_NUMBER = os.getenv("AIP_PROJECT_NUMBER", "254356041555")
AIP_STORAGE_URI = os.getenv("AIP_STORAGE_URI")

buck = AIP_STORAGE_URI.split("/")[2]
blb = "/".join(AIP_STORAGE_URI.split("/")[3 :])
model_file_name = "model.json"
encoded_file_name = "label_encoder.pkl"

blob = storage.Client(AIP_PROJECT_NUMBER).bucket(buck).blob(blb + "/" + model_file_name)
blob.download_to_filename(model_file_name)

blob = storage.Client(AIP_PROJECT_NUMBER).bucket(buck).blob(blb + "/" + encoded_file_name)
blob.download_to_filename(encoded_file_name)

with open(model_file_name, "rb") as f:
    laundering_model = pickle.load(f)
with open(encoded_file_name, "rb") as f:
    label_mappings = pickle.load(f)


@app.get(os.getenv("AIP_HEALTH_ROUTE", "/healthcheck"), status_code=200)
def read_root() :
    """

    :return:
    """
    return {"Hello" : "World"}

@app.post(os.getenv("AIP_PREDICT_ROUTE", "/predict"), status_code=200)
async def predict(request: Request) :
    """

    :param request:
    :return:
    """
    body = await request.json()
    to_predict_df = pd.DataFrame(body["instances"])
    cat_feat = [i for i in to_predict_df.columns if to_predict_df[i].dtypes == 'O']

    for i in cat_feat:
        to_predict_df[i] = to_predict_df[i].map(label_mappings[i])

    re = laundering_model.predict(to_predict_df)[0]

    return {"predictions" : str(re)}

Overwriting inference/main.py


In [8]:
!docker build -t us-central1-docker.pkg.dev/jesusarguelles-sandbox/custom-predictions/fraud-pipe:v1 -f Dockerfile_inference .

Sending build context to Docker daemon  649.7MB
Step 1/5 : FROM python:3.10-slim
 ---> 152de85cbe2a
Step 2/5 : WORKDIR /app
 ---> Using cache
 ---> 66b2323b99aa
Step 3/5 : COPY inference/ .
 ---> 199fcffae68d
Step 4/5 : RUN pip install --upgrade pip &&     pip install catboost &&     pip install google-cloud-storage &&     pip install pandas &&     pip install fastapi==0.75.0 uvicorn
 ---> Running in d82a68cc45fa
Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 8.7 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-24.0
[0mCollecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)


In [12]:
!docker push us-central1-docker.pkg.dev/jesusarguelles-sandbox/custom-predictions/fraud-pipe:v1

The push refers to repository [us-central1-docker.pkg.dev/jesusarguelles-sandbox/custom-predictions/fraud-pipe]

[1Bd2637e9e: Preparing 
[1B8e7558ed: Preparing 
[1Be94817bb: Preparing 
[1B3f6747a1: Preparing 
[1B04f811db: Preparing 
[1B675b4718: Preparing 
[2B675b4718: Waiting g 
[8Bd2637e9e: Pushed    1.04GB/1.008GB[6A[2K[3A[2K[8A[2K[2A[2K[1A[2K[8A[2K[8A[2K[8A[2K[7A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K[8A[2K

## Building Pipeline Vertex AI Pipeline

In [50]:
from kfp.dsl import pipeline, importer
from google_cloud_pipeline_components.types import artifact_types
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import ModelDeployOp
from google_cloud_pipeline_components.v1.endpoint import EndpointCreateOp

@pipeline(name="e2e-kfp-pipeline")
def pipeline(
        project_id: str,
        bucket_id: str,
        bucket_folder: str,
        bigquery_table_id: str,
        artifact_uri: str,
):
    training_job = training(
        project_id = project_id,
        bucket_id = bucket_id,
        bucket_folder = bucket_folder,
        bigquery_table_id = bigquery_table_id,
    )

    unmanaged_model_importer = importer(
        artifact_uri = artifact_uri,
        artifact_class = artifact_types.UnmanagedContainerModel,
        metadata = {
            "containerSpec": { "imageUri": "us-central1-docker.pkg.dev/jesusarguelles-sandbox/custom-predictions/fraud-pipe:v1"}
        }
    ).after(training_job)

    model_upload_op = ModelUploadOp(
        display_name = "kube-pipe-model",
        unmanaged_container_model = unmanaged_model_importer.outputs["artifact"],
    )

    endpoint_create_op = EndpointCreateOp(
        display_name="kube-pipe-endpoint",
    )

    endpoint = ModelDeployOp(
        model = model_upload_op.outputs["model"],
        endpoint = endpoint_create_op.outputs["endpoint"],
        dedicated_resources_machine_type = "n1-standard-4",
        dedicated_resources_min_replica_count = 1,
        dedicated_resources_max_replica_count = 1
    )

In [51]:
from kfp import compiler

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='money_laundering_detection_kfp.yaml'
)

In [52]:
from google.cloud import aiplatform as aip

project_id = "jesusarguelles-sandbox"
bucket_id = "jesusarguelles-datasets-public"
bucket_folder_name = "money_laundering_detection_kfp"
pipeline_root_path = "gs://jesusarguelles-staging/"
bigquery_table_id = "jesusarguelles-sandbox.demos.money_laun_final"
artifact_uri = "gs://{}/{}".format(bucket_id, bucket_folder_name)

aip.init(
    project=project_id,
    location="us-central1",
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="money_laundering_detection_kfp",
    template_path="money_laundering_detection_kfp.yaml",
    pipeline_root=pipeline_root_path,
    parameter_values={
        'project_id': project_id,
        'bucket_id': bucket_id,
        'bucket_folder': bucket_folder_name,
        'bigquery_table_id': bigquery_table_id,
        'artifact_uri': artifact_uri,
    }
)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/390227712642/locations/us-central1/pipelineJobs/e2e-kfp-pipeline-20240508173934
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/390227712642/locations/us-central1/pipelineJobs/e2e-kfp-pipeline-20240508173934')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/e2e-kfp-pipeline-20240508173934?project=390227712642
