# AML Use Case

In [None]:
!pip install alibi

In [6]:
## Data source: https://github.com/IBM/AML-Data
## To download the raw dataset, uncomment and run the following command:
## !wget https://ibm.box.com/shared/static/d65gqvyxgm5w670up3dtz97l52xes4ko.txt

In [2]:
import json
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import numpy as np
import os
from pydoc import importfile
import requests

%load_ext lab_black

In [3]:
# Container image used for every lightweight Python component defined in this notebook.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Local checkout location, remote and revision of the reusable component catalog.
# expanduser("~") honours $HOME when it is set and falls back to the user database
# otherwise, instead of silently producing the literal path "None/components".
COMPONENT_CATALOG_FOLDER = os.path.join(os.path.expanduser("~"), "components")
COMPONENT_CATALOG_GIT = "https://github.com/lehrig/kubeflow-ppc64le-components.git"
COMPONENT_CATALOG_RELEASE = "main"

# Kept as a string because it is read from the environment; converted with int()
# only where a numeric comparison is required.
NUMBER_OF_WORKER = os.getenv("NUMBER_OF_WORKERS", default="1")

# Run-time arguments handed to the pipeline when the run is created.
ARGUMENTS = {
    "blackboard": "artefacts",
    "model_name": "aml-classification",
    "cluster_configuration_secret": os.getenv(
        "CLUSTER_CONFIGURATION_SECRET", default=""
    ),
    "training_gpus": os.getenv("TRAINING_GPUS", default="1"),
    "number_of_workers": NUMBER_OF_WORKER,
    # A single worker runs as a plain Job; more than one worker selects MPI.
    "distribution_type": "Job" if int(NUMBER_OF_WORKER) <= 1 else "MPI",
    "training_node_selector": os.getenv("TRAINING_NODE_SELECTOR", default=""),
}
MODEL_NAME = ARGUMENTS["model_name"]

# The namespace of the pod this notebook runs in, as mounted by Kubernetes.
# Strip surrounding whitespace so a trailing newline can never leak into the
# run namespace or the Host header built from this value.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()

ARGUMENTS

{'blackboard': 'artefacts',
 'model_name': 'aml-classification',
 'cluster_configuration_secret': '',
 'training_gpus': '1',
 'number_of_workers': '1',
 'distribution_type': 'Job',
 'training_node_selector': ''}

In [4]:
!git clone --branch $COMPONENT_CATALOG_RELEASE $COMPONENT_CATALOG_GIT $COMPONENT_CATALOG_FOLDER

Cloning into '/home/jovyan/components'...
remote: Enumerating objects: 724, done.[K
remote: Counting objects: 100% (204/204), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 724 (delta 154), reused 171 (delta 136), pack-reused 520[K
Receiving objects: 100% (724/724), 258.57 KiB | 4.54 MiB/s, done.
Resolving deltas: 100% (371/371), done.


In [5]:
# Load the catalog module straight from the cloned checkout; its components are
# accessed below as attributes of CATALOG.
_catalog_path = f"{COMPONENT_CATALOG_FOLDER}/catalog.py"
CATALOG = importfile(_catalog_path)

In [9]:
def preprocess_dataset(
    data_dir: InputPath(str),
    train_dataset_dir: OutputPath(str),
    test_dataset_dir: OutputPath(str),
):
    """Normalize, split and rebalance the transactions and store them as CSV files.

    The feather file is indexed by its "date" column. The first 700,000 rows form
    the training split and the remaining rows the test split; the first column is
    used as the label and all other columns as features. Only the training split
    is oversampled and shuffled.

    Args:
        data_dir: Path of the feather file holding the raw data frame.
        train_dataset_dir: Directory that receives x_train.csv and y_train.csv.
        test_dataset_dir: Directory that receives x_test.csv and y_test.csv.
    """
    import os
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.preprocessing import Normalizer

    # Number of leading rows used for training; the rest is held out for testing.
    train_rows = 700_000
    # Seed shared by the oversampler and the shuffle so that reruns produce
    # identical training files (and therefore identical cross-validation folds).
    seed = 0

    # data = pd.read_csv("/tmp/jovyan/AMLdata_features_graph.csv", index_col=0)
    data = pd.read_feather(data_dir)
    data = data.set_index("date")
    print(f"Loaded {len(data)} records:\n", data.head(2))

    # Normalizer rescales every row independently, so applying it before the
    # split does not leak information between training and test rows.
    nmlz = Normalizer()
    feat_list = [4, 6, 10, 11, 12, 13, 14, 15]
    data.iloc[:, feat_list] = nmlz.fit_transform(data.iloc[:, feat_list])

    print(
        "Original % of Laundering:",
        1 - data["is laundering"].value_counts(normalize=True).max(),
    )

    x = data.iloc[:train_rows, 1:]
    y = data.iloc[:train_rows, 0]

    class_counts = y.value_counts()
    normal = int(class_counts[0])
    # Aim for a 10:1 ratio of normal to laundering samples. An over-sampler cannot
    # shrink a class, so never request fewer samples than are already present.
    laundering_target = max(int(class_counts.get(1, 0)), normal // 10)

    ros = RandomOverSampler(
        random_state=seed, sampling_strategy={0: normal, 1: laundering_target}
    )
    x, y = ros.fit_resample(x, y)
    # Shuffle reproducibly and keep the labels aligned with the shuffled features.
    x = x.sample(frac=1, random_state=seed)
    y = y.loc[x.index]

    print("Train % of Laundering:", y.value_counts()[1] / len(y))

    x_test = data.iloc[train_rows:, 1:]
    y_test = data.iloc[train_rows:, 0]

    print("Test % of Laundering:", y_test.value_counts()[1] / len(y_test))

    os.makedirs(train_dataset_dir, exist_ok=True)
    x.to_csv(train_dataset_dir + "/x_train.csv")
    y.to_csv(train_dataset_dir + "/y_train.csv")

    os.makedirs(test_dataset_dir, exist_ok=True)
    x_test.to_csv(test_dataset_dir + "/x_test.csv")
    y_test.to_csv(test_dataset_dir + "/y_test.csv")

    print(
        f"Pre-processed dataset saved. Contents of '{test_dataset_dir}' and '{train_dataset_dir}':"
        f"\n {os.listdir(test_dataset_dir)}\n{os.listdir(train_dataset_dir)}"
    )


# Turn the pre-processing function into a pipeline component; imbalanced-learn is
# installed into the base image when the component starts.
preprocess_dataset_comp = kfp.components.create_component_from_func(
    preprocess_dataset,
    packages_to_install=["imbalanced-learn"],
    base_image=BASE_IMAGE,
)

In [11]:
def train_classifier(
    train_dataset_dir: InputPath(str),
    test_dataset_dir: InputPath(str),
    model_dir: OutputPath(str),
):
    """Grid-search a LightGBM classifier, export it to ONNX and report test metrics.

    Args:
        train_dataset_dir: Directory containing x_train.csv and y_train.csv.
        test_dataset_dir: Directory containing x_test.csv and y_test.csv.
        model_dir: File path the serialized ONNX model is written to.
    """
    import numpy as np
    import pandas as pd
    from lightgbm import LGBMClassifier
    from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
        convert_lightgbm,
    )
    from skl2onnx.common.data_types import FloatTensorType
    from skl2onnx.common.shape_calculator import (
        calculate_linear_classifier_output_shapes,
    )
    from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
    from sklearn.model_selection import GridSearchCV
    from skl2onnx import convert_sklearn, update_registered_converter

    x_train = pd.read_csv(train_dataset_dir + "/x_train.csv", index_col=0)
    y_train = pd.read_csv(train_dataset_dir + "/y_train.csv", index_col=0)
    x_test = pd.read_csv(test_dataset_dir + "/x_test.csv", index_col=0)
    y_test = pd.read_csv(test_dataset_dir + "/y_test.csv", index_col=0)

    # The label files hold a single column; flatten them once to 1-D arrays.
    y_train_values = y_train.to_numpy().ravel()
    y_test_values = y_test.to_numpy().ravel()

    params = {
        "num_leaves": np.arange(18, 34, 5),
        "objective": ["binary"],
        "metric": ["binary_logloss"],
        "max_depth": np.arange(12, 22, 3),
        # "learning_rate": np.linspace(0.01, 0.3, 5),
        # "n_estimators": np.arange(70, 111, 10),
    }
    print("GridSearch starts with parameters:", params)

    # Several metrics are recorded, but the final model is refit on the best F1.
    gs = GridSearchCV(
        LGBMClassifier(),
        params,
        cv=10,
        verbose=1,
        n_jobs=-1,
        scoring=["roc_auc", "f1", "neg_log_loss"],
        refit="f1",
    )
    gs.fit(x_train, y_train_values)

    print("Best Estimator:", gs.best_estimator_)

    # Register the LightGBM converter so skl2onnx knows how to export the model.
    update_registered_converter(
        LGBMClassifier,
        "LightGbmLGBMClassifier",
        calculate_linear_classifier_output_shapes,
        convert_lightgbm,
        options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
    )

    print("Converting model to ONNX")
    model_onnx = convert_sklearn(
        gs,
        "lgbm_aml_classifier",
        [("input", FloatTensorType([None, x_train.shape[1]]))],
        target_opset={"": 12, "ai.onnx.ml": 2},
    )

    with open(model_dir, "wb") as f:
        f.write(model_onnx.SerializeToString())

    # With refit enabled, gs.predict delegates to the best estimator, so these
    # predictions are the best estimator's predictions; no second inference pass
    # (and no blanket exception handler around it) is needed to display them.
    preds = gs.predict(x_test)
    print(preds)

    print(
        "PR, RE, F1:",
        precision_recall_fscore_support(y_test_values, preds, average="macro"),
    )
    print("Accuracy:", np.mean(preds == y_test_values))
    print("# TN, FP # FN, TP:", confusion_matrix(y_test_values, preds))


# Turn the training function into a pipeline component; the listed packages are
# installed into the base image when the component starts.
train_classifier_comp = kfp.components.create_component_from_func(
    train_classifier,
    packages_to_install=["lightgbm", "skl2onnx", "onnxmltools"],
    base_image=BASE_IMAGE,
)

In [12]:
# Pipeline definition: create a shared volume, load the transactions via Trino,
# pre-process them, train the classifier, upload the model and deploy it with KServe.
#
# Parameters:
#   blackboard  - resource name of the volume created for passing artefacts.
#   model_name  - name under which the model is uploaded and deployed.
#   cluster_configuration_secret, training_gpus, number_of_workers,
#   distribution_type, training_node_selector - accepted as pipeline arguments
#   but not referenced in the pipeline body below.
@dsl.pipeline(
    name="AML Classification",
    description="An example pipeline that performs anti-money laundering classification",
)
def aml_pipeline(
    blackboard: str,
    model_name: str,
    cluster_configuration_secret: str,
    training_gpus: int,
    number_of_workers: int,
    distribution_type: str,
    training_node_selector: str,
):
    # Volume (read-write-once, 4Gi) that serves as the artefact blackboard.
    create_blackboard = dsl.VolumeOp(
        name="Create Artefacts Blackboard",
        resource_name=blackboard,
        modes=dsl.VOLUME_MODE_RWO,
        size="4Gi",
        set_owner_reference=True,
    )

    # Fetch the full transactions table through the catalog's Trino component.
    load_dataframe_via_trino_task = CATALOG.load_dataframe_via_trino_comp(
        query="SELECT * FROM laundering_transactions",
        columns_query="SHOW COLUMNS FROM laundering_transactions",
        catalog="postgresql",
        schema="public",
    )
    # No data flows from the volume step to the loader, so order them explicitly.
    load_dataframe_via_trino_task.after(create_blackboard)

    preprocess_dataset_task = preprocess_dataset_comp(
        data_dir=load_dataframe_via_trino_task.outputs["dataframe"]
    )
    # preprocess_dataset_task.add_pvolumes(
    #    {"/tmp/jovyan": dsl.PipelineVolume(pvc="aml-volume")}
    # )
    # preprocess_dataset_task.after(create_blackboard)

    train_classifier_task = train_classifier_comp(
        train_dataset_dir=preprocess_dataset_task.outputs["train_dataset_dir"],
        test_dataset_dir=preprocess_dataset_task.outputs["test_dataset_dir"],
    )

    upload_model_task = CATALOG.upload_model_comp(
        train_classifier_task.outputs["model_dir"],
        model_name=model_name,
    )

    deploy_model_with_kserve_task = CATALOG.deploy_model_with_kserve_comp(
        model_name=model_name
    )

    # The deployment receives only the model name, not an output of the upload
    # step, so the dependency on the upload has to be declared explicitly.
    deploy_model_with_kserve_task.after(upload_model_task)

In [13]:
def disable_cache_transformer(op):
    """Set the maximum cache staleness of a pipeline op to "P0D" and return the op.

    Container ops are configured through their caching strategy; every other op
    type receives the equivalent pod annotation instead.
    """
    if not isinstance(op, dsl.ContainerOp):
        op.add_pod_annotation(
            name="pipelines.kubeflow.org/max_cache_staleness", value="P0D"
        )
        return op

    op.execution_options.caching_strategy.max_cache_staleness = "P0D"
    return op


# Artefacts are passed between steps through the blackboard volume; the claim
# name is resolved per workflow run from the workflow name placeholder.
_blackboard_name = ARGUMENTS["blackboard"]
_blackboard_claim = V1PersistentVolumeClaimVolumeSource(
    "{{workflow.name}}-%s" % _blackboard_name
)
_blackboard_volume = V1Volume(
    name=_blackboard_name,
    persistent_volume_claim=_blackboard_claim,
)

pipeline_conf = PipelineConf()
# Apply the cache-disabling transformer to every op of the pipeline.
pipeline_conf.add_op_transformer(disable_cache_transformer)
pipeline_conf.data_passing_method = data_passing_methods.KubernetesVolume(
    volume=_blackboard_volume,
    path_prefix=f"{_blackboard_name}/",
)

# Submit the pipeline as a run in the notebook's namespace.
kfp.Client().create_run_from_pipeline_func(
    aml_pipeline,
    arguments=ARGUMENTS,
    namespace=NAMESPACE,
    pipeline_conf=pipeline_conf,
)

RunPipelineResult(run_id=fef01198-a81e-4c8f-828a-4b5c319ca269)

In [189]:
# Query the metadata of the deployed model over the v2 inference protocol.
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/{MODEL_NAME}"

# A timeout keeps the cell from hanging forever when the predictor is unreachable,
# and raise_for_status() reports an HTTP error directly instead of a confusing
# JSON decoding failure on the error page.
res = requests.get(MODEL_ENDPOINT, headers=HEADERS, timeout=30)
res.raise_for_status()
response = res.json()
response

ConnectionError: HTTPConnectionPool(host='aml-classification-predictor-default', port=80): Max retries exceeded with url: /v2/models/aml-classification (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff734a7dc40>: Failed to establish a new connection: [Errno -2] Name or service not known'))