# Anti Money Laundering

Classification (or, alternatively, anomaly detection) example that showcases the integration of Alibi explainers, using data provided by [IBM](https://github.com/IBM/AML-Data).

## Authors

Natalie Jann [natalie.jann@ibm.com](mailto:natalie.jann@ibm.com)

Sebastian Lehrig [sebastian.lehrig1@ibm.com](mailto:sebastian.lehrig1@ibm.com)

Marvin Giessing [MARVING@de.ibm.com](mailto:MARVING@de.ibm.com)

## License
Apache-2.0 License

## 0.) Imports and Constants

In [89]:
## !wget https://ibm.box.com/shared/static/hki349pzectptwr2hpine38zkgdpbm8j.csv

In [1]:
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import os
from pydoc import importfile
from typing import List

%load_ext lab_black

In [2]:
# Container image used as the base image of every custom component in this notebook.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Location (local folder, git remote, branch) of the reusable component catalog.
COMPONENT_CATALOG_FOLDER = f"{os.getenv('HOME')}/components"
COMPONENT_CATALOG_GIT = "https://github.com/lehrig/kubeflow-ppc64le-components.git"
COMPONENT_CATALOG_RELEASE = "main"

# Kept as a string because environment variables are strings; converted with
# int() only where a numeric comparison is needed.
NUMBER_OF_WORKER = os.getenv("NUMBER_OF_WORKERS", default="1")

# Pipeline arguments; the keys match the parameter names of aml_pipeline.
ARGUMENTS = {
    "blackboard": "artefacts",
    "model_name": "aml-classification",
    "cluster_configuration_secret": os.getenv(
        "CLUSTER_CONFIGURATION_SECRET", default=""
    ),
    "training_gpus": os.getenv("TRAINING_GPUS", default="0"),
    "number_of_workers": NUMBER_OF_WORKER,
    # A single worker runs as a plain job, several workers are distributed via MPI.
    "distribution_type": "Job" if int(NUMBER_OF_WORKER) <= 1 else "MPI",
    "training_node_selector": os.getenv("TRAINING_NODE_SELECTOR", default=""),
}
MODEL_NAME = ARGUMENTS["model_name"]

# The namespace is read from the service-account file of the pod this notebook
# runs in. Strip surrounding whitespace so that a trailing newline in the file
# can never end up in the namespace passed to the pipeline run.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()

# Last expression of the cell: displays the effective arguments.
ARGUMENTS

{'blackboard': 'artefacts',
 'model_name': 'aml-classification',
 'cluster_configuration_secret': '',
 'training_gpus': '1',
 'number_of_workers': '1',
 'distribution_type': 'Job',
 'training_node_selector': ''}

## 1.) Load catalog with reusable Kubeflow components

In [3]:
# Clone branch COMPONENT_CATALOG_RELEASE of the component catalog into
# COMPONENT_CATALOG_FOLDER (IPython shell escape; $NAME expands the Python variable).
# When the folder already exists and is not empty, git aborts with a
# "destination path ... already exists" error, as seen when the cell is re-run.
!git clone --branch $COMPONENT_CATALOG_RELEASE $COMPONENT_CATALOG_GIT $COMPONENT_CATALOG_FOLDER

fatal: destination path '/home/jovyan/components' already exists and is not an empty directory.


In [80]:
# Import catalog.py from the cloned catalog folder by file path; the resulting
# module object exposes the reusable components (CATALOG.<name>_comp) used in the pipeline.
CATALOG = importfile(f"{COMPONENT_CATALOG_FOLDER}/catalog.py")

## 2.) Create custom components
### 2.1) Component: Preprocess data (dataset loading, rebalancing & splitting)

In [None]:
def preprocess_dataset(
    data_dir: InputPath(str),
    train_dataset_dir: OutputPath(str),
    test_dataset_dir: OutputPath(str),
) -> List[str]:
    """Normalize, split and rebalance the transactions and store both splits as CSV.

    Only the first half of the loaded records is used. Of those, the first 80%
    of the rows form the training split and the remaining rows the test split.
    Column 0 (after indexing by ``date``) is used as label and written as ``y``.
    Class 1 of the training split is randomly oversampled to one tenth of the
    class 0 count; the test split keeps its original class distribution.

    Args:
        data_dir: Path of the Feather file holding the raw transactions.
        train_dataset_dir: Directory that receives the training ``data.csv``.
        test_dataset_dir: Directory that receives the test ``data.csv``.

    Returns:
        Names of the feature columns of the training split.
    """
    import os
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler
    from sklearn.preprocessing import Normalizer

    data = pd.read_feather(data_dir)
    # Work on an explicit copy of the first half so that the in-place edits
    # below act on an independent frame instead of a slice of the loaded one.
    data = data.iloc[: int(0.5 * len(data))].copy()
    data.set_index("date", inplace=True)
    train_len = int(0.8 * len(data))
    print(f"Loaded {len(data)} records:\n", data.head(2))

    # Normalizer rescales every row on its own, so applying it before the
    # train/test split does not carry test statistics into training.
    nmlz = Normalizer()
    feat_list = [4, 6, 10, 11, 12, 13, 14, 15]
    data.iloc[:, feat_list] = nmlz.fit_transform(data.iloc[:, feat_list])

    print(
        "Original % of Laundering:",
        1 - data["is laundering"].value_counts(normalize=True).max(),
    )

    x = data.iloc[:train_len, 1:]
    y = data.iloc[:train_len, 0]
    y.name = "y"

    counts = y.value_counts()
    normal = counts[0]
    # An over-sampler cannot shrink a class: never request fewer class-1
    # samples than the training split already contains.
    minority_target = max(int(normal / 10), int(counts.get(1, 0)))

    ros = RandomOverSampler(
        random_state=0, sampling_strategy={0: normal, 1: minority_target}
    )
    x, y = ros.fit_resample(x, y)
    # Seeded shuffle: downstream training may read only a leading part of the
    # file, so the row order has to be reproducible between runs.
    x = x.sample(frac=1, random_state=0)
    y = y.loc[x.index]

    print("Train % of Laundering:", y.value_counts()[1] / len(y))

    x_test = data.iloc[train_len:, 1:]
    y_test = data.iloc[train_len:, 0]

    print("Test % of Laundering:", y_test.value_counts()[1] / len(y_test))
    y_test.name = "y"

    os.makedirs(train_dataset_dir, exist_ok=True)
    train = pd.concat([x, y], axis=1)
    train.to_csv(train_dataset_dir + "/data.csv")

    os.makedirs(test_dataset_dir, exist_ok=True)
    test = pd.concat([x_test, y_test], axis=1)
    test.to_csv(test_dataset_dir + "/data.csv")

    print(
        f"Pre-processed dataset saved. Contents of '{test_dataset_dir}' and '{train_dataset_dir}':"
        f"\n {os.listdir(test_dataset_dir)}\n{os.listdir(train_dataset_dir)}"
    )

    return list(x.columns)


# Wrap preprocess_dataset as a Kubeflow component that runs in BASE_IMAGE.
preprocess_dataset_comp = kfp.components.create_component_from_func(
    preprocess_dataset, base_image=BASE_IMAGE
)

### 2.2) Component: model training, evaluation and conversion to ONNX

In [84]:
def train_classifier(
    train_dataset_dir: InputPath(str),
    test_dataset_dir: InputPath(str),
    onnx_dir: OutputPath(str),
    model_dir: OutputPath(str),
):
    """Grid-search a LightGBM classifier, persist it and print test metrics.

    At most the first 100,000 training rows are used. The search runs a
    10-fold cross-validation over the ``max_depth`` candidates and refits the
    estimator with the best F1 score. That estimator is written with joblib
    and as ONNX, then evaluated on the complete test split.

    Args:
        train_dataset_dir: Directory containing the training ``data.csv``.
        test_dataset_dir: Directory containing the test ``data.csv``.
        onnx_dir: File path that receives the serialized ONNX model.
        model_dir: File path that receives the joblib dump of the estimator.
    """
    import numpy as np
    import pandas as pd
    from joblib import dump
    from lightgbm import LGBMClassifier
    from onnxmltools.convert import convert_lightgbm
    from skl2onnx.common.data_types import FloatTensorType
    from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
    from sklearn.model_selection import GridSearchCV

    def load_split(directory, **read_options):
        # Read one split and separate the label column "y" from the features.
        frame = pd.read_csv(directory + "/data.csv", index_col=0, **read_options)
        labels = frame.pop("y")
        return frame, labels

    x_train, y_train = load_split(train_dataset_dir, nrows=100_000)
    x_test, y_test = load_split(test_dataset_dir)

    # Further candidates that are currently switched off:
    #   "num_leaves": np.arange(18, 34, 5)
    #   "learning_rate": np.linspace(0.01, 0.3, 5)
    #   "n_estimators": np.arange(70, 111, 10)
    params = {
        "objective": ["binary"],
        "metric": ["binary_logloss"],
        "max_depth": np.arange(12, 22, 3),
    }
    print("GridSearch starts with parameters:", params)

    search = GridSearchCV(
        estimator=LGBMClassifier(),
        param_grid=params,
        scoring=["roc_auc", "f1", "neg_log_loss"],
        refit="f1",
        cv=10,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(x_train, y_train.to_numpy().ravel())
    best_model = search.best_estimator_

    with open(model_dir, "wb") as model_file:
        dump(best_model, model_file)

    print("Best Estimator:", best_model, "\nConverting classifier to ONNX.")
    # The ONNX graph takes a float tensor with one column per training feature.
    input_signature = [("input", FloatTensorType([None, x_train.shape[1]]))]
    model_onnx = convert_lightgbm(
        best_model,
        initial_types=input_signature,
        zipmap=False,
    )

    with open(onnx_dir, "wb") as onnx_file:
        onnx_file.write(model_onnx.SerializeToString())

    preds = best_model.predict(x_test)
    matches = preds == y_test.to_numpy().ravel()

    print(
        "PR, RE, F1:", precision_recall_fscore_support(y_test, preds, average="macro")
    )
    print("Accuracy:", sum(matches) / len(preds))
    print("# TN, FP # FN, TP:", confusion_matrix(y_test, preds))


# Wrap train_classifier as a Kubeflow component; the two ONNX converter packages
# are requested as additional installs on top of BASE_IMAGE.
train_classifier_comp = kfp.components.create_component_from_func(
    train_classifier,
    packages_to_install=["skl2onnx", "onnxmltools"],
    base_image=BASE_IMAGE,
)

### 2.3) Component: Create an explainer

In [85]:
def create_explainer(
    train_dataset_dir: InputPath(str),
    model_dir: InputPath(str),
    explainer_dir: OutputPath(str),
):
    """Fit an Alibi ``AnchorTabular`` explainer on the training features and save it.

    Args:
        train_dataset_dir: Directory containing the training ``data.csv``.
        model_dir: Path of the joblib dump of the trained classifier.
        explainer_dir: Directory the fitted explainer is saved to.
    """
    from alibi.explainers import AnchorTabular
    from joblib import load
    import os
    import pandas as pd

    classifier = load(model_dir)
    features = pd.read_csv(train_dataset_dir + "/data.csv", index_col=0)
    # Drop the label column: the explainer is fitted on the features only.
    features.pop("y")

    def predict_fn(x):
        return classifier.predict(x)

    anchor_explainer = AnchorTabular(predict_fn, list(features.columns.values))
    anchor_explainer.fit(features.to_numpy())

    if not os.path.exists(explainer_dir):
        os.makedirs(explainer_dir)
    anchor_explainer.save(explainer_dir)

    print(
        f"Explainer saved. Contents of '{explainer_dir}':\n {os.listdir(explainer_dir)}"
    )


# Wrap create_explainer as a Kubeflow component that runs in BASE_IMAGE.
create_explainer_comp = kfp.components.create_component_from_func(
    create_explainer, base_image=BASE_IMAGE
)

In [86]:
def explain_model(
    train_dataset_dir: InputPath(str),
    test_dataset_dir: InputPath(str),
    model_dir: InputPath(str),
    test_num: int = None,
):
    """Print anchor explanations for the model's predictions on test records.

    An ``AnchorTabular`` explainer is fitted on the training features. Every
    selected test record is then explained with a precision threshold of 0.8,
    and the prediction, anchor, precision, coverage and true label are printed
    for the first records.

    Args:
        train_dataset_dir: Directory containing the training ``data.csv``.
        test_dataset_dir: Directory containing the test ``data.csv``.
        model_dir: Path of the joblib dump of the trained classifier.
        test_num: Number of leading test records to explain; ``None`` uses all.
    """

    import pandas as pd
    from alibi.explainers import AnchorTabular
    from joblib import load
    from tqdm import tqdm
    import numpy as np

    model = load(model_dir)
    x_train = pd.read_csv(train_dataset_dir + "/data.csv", index_col=0)
    x_train.pop("y")
    x_test = pd.read_csv(test_dataset_dir + "/data.csv", index_col=0).iloc[:test_num, :]
    y_test = x_test.pop("y")
    print(f"Loaded {len(x_train)} train and {len(x_test)} test records.")

    def predict_fn(x):
        return model.predict(x)

    explainer = AnchorTabular(predict_fn, list(x_train.columns.values))
    explainer.fit(x_train.to_numpy())

    n_features = x_test.shape[1]
    results = x_test.copy(deep=True)
    results["Prediction"] = predict_fn(x_test.to_numpy())
    print(results.head())

    def first_or_scalar(value):
        # Precision/coverage are handled both as scalar and as list/array;
        # in the latter case only the first entry is reported.
        return value[0] if isinstance(value, (list, np.ndarray)) else value

    def explain(x):
        exp = explainer.explain(x.copy(deep=True).to_numpy(), threshold=0.8)
        return (
            " AND ".join(exp.anchor),
            first_or_scalar(exp.precision),
            first_or_scalar(exp.coverage),
        )

    tqdm.pandas(total=len(x_test), miniters=5)
    # Explain the feature columns only (everything except "Prediction").
    results[["Anchor", "Precision", "Coverage"]] = results.iloc[:, :-1].progress_apply(
        explain, axis=1, result_type="expand"
    )

    results["Y"] = y_test.copy(deep=True)

    # Show the appended columns only; the offset follows the actual number of
    # feature columns instead of a fixed position.
    print(results.iloc[:, n_features:].head())


# Wrap explain_model as a Kubeflow component that runs in BASE_IMAGE.
explain_model_comp = kfp.components.create_component_from_func(
    explain_model, base_image=BASE_IMAGE
)

## 3.) Create the actual pipeline by combining the components

In [87]:
@dsl.pipeline(
    name="AML Classification",
    description="An example pipeline that performs anti-money laundering classification",
)
def aml_pipeline(
    blackboard: str,
    model_name: str,
    cluster_configuration_secret: str,
    training_gpus: int,
    number_of_workers: int,
    distribution_type: str,
    training_node_selector: str,
):
    """Load, preprocess, train, explain, upload and deploy the AML classifier.

    Only ``blackboard`` and ``model_name`` are referenced by the steps below;
    the remaining parameters are accepted but not used in this function.

    Args:
        blackboard: Resource name of the volume created for the artefacts.
        model_name: Project name used when uploading and deploying the model.
        cluster_configuration_secret: Not used in this function.
        training_gpus: Not used in this function.
        number_of_workers: Not used in this function.
        distribution_type: Not used in this function.
        training_node_selector: Not used in this function.
    """
    # Volume for the artefacts; data loading waits until it has been created.
    blackboard_volume = dsl.VolumeOp(
        name="Create Artefacts Blackboard",
        resource_name=blackboard,
        size="4Gi",
        modes=dsl.VOLUME_MODE_RWO,
        set_owner_reference=True,
    )

    load_task = CATALOG.load_dataframe_via_trino_comp(
        query="SELECT * FROM laundering_transactions",
        columns_query="SHOW COLUMNS FROM laundering_transactions",
        catalog="postgresql",
        schema="public",
    )
    load_task.after(blackboard_volume)

    preprocess_task = preprocess_dataset_comp(data_dir=load_task.outputs["dataframe"])
    train_split = preprocess_task.outputs["train_dataset_dir"]
    test_split = preprocess_task.outputs["test_dataset_dir"]

    training_task = train_classifier_comp(
        train_dataset_dir=train_split,
        test_dataset_dir=test_split,
    )
    trained_model = training_task.outputs["model_dir"]

    upload_task = CATALOG.upload_model_comp(
        training_task.outputs["onnx_dir"],
        project_name=model_name,
    )
    uploaded_version = upload_task.outputs["model_version"]

    # Reporting steps: nothing downstream consumes their results.
    CATALOG.plot_confusion_matrix_comp(
        input_columns=preprocess_task.outputs["output"],
        label_columns={"y": [0, 1]},
        test_dataset_dir=test_split,
        model_dir=trained_model,
        num_predictions=50,
    )

    explain_model_comp(
        train_dataset_dir=train_split,
        test_dataset_dir=test_split,
        model_dir=trained_model,
        test_num=2000,
    )

    explainer_task = create_explainer_comp(
        train_dataset_dir=train_split,
        model_dir=trained_model,
    )

    explainer_upload_task = CATALOG.upload_explainer_comp(
        file_dir=explainer_task.outputs["explainer_dir"],
        project_name=model_name,
        model_version=uploaded_version,
    )

    deployment_task = CATALOG.deploy_model_with_kserve_comp(
        project_name=model_name,
        explainer_type="AnchorTabular",
        model_version=uploaded_version,
    )

    # Deploy only after the explainer has been uploaded as well.
    deployment_task.after(explainer_upload_task)

## 4.) Run the pipeline within an experiment

Create a pipeline run, using a pipeline configuration that:

- enables data passing via persistent volumes (faster than the default MinIO-based passing)
- disables caching (which currently is not supported for data passing via volumes)

In [88]:
def disable_cache_transformer(op):
    """Set the maximum cache staleness of ``op`` to "P0D" and return the op.

    ``dsl.ContainerOp`` instances get the value through their execution
    options; every other op gets it as a pod annotation.
    """
    if not isinstance(op, dsl.ContainerOp):
        op.add_pod_annotation(
            name="pipelines.kubeflow.org/max_cache_staleness", value="P0D"
        )
        return op

    op.execution_options.caching_strategy.max_cache_staleness = "P0D"
    return op


# Pipeline configuration: pass data between steps through the blackboard volume
# (claim "<workflow name>-<blackboard>") and apply the cache-disabling transformer.
pipeline_conf = PipelineConf()
pipeline_conf.data_passing_method = data_passing_methods.KubernetesVolume(
    path_prefix=f'{ARGUMENTS["blackboard"]}/',
    volume=V1Volume(
        persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
            "{{workflow.name}}-%s" % ARGUMENTS["blackboard"]
        ),
        name=ARGUMENTS["blackboard"],
    ),
)
pipeline_conf.add_op_transformer(disable_cache_transformer)

# Submit aml_pipeline as a run in NAMESPACE. The call stays the last expression
# of the cell so that the notebook displays its result.
kfp.Client().create_run_from_pipeline_func(
    pipeline_func=aml_pipeline,
    pipeline_conf=pipeline_conf,
    namespace=NAMESPACE,
    arguments=ARGUMENTS,
)

RunPipelineResult(run_id=c828b642-25ab-454f-b0df-84b4772f114a)