# Fraud Detection

Classify transactions as fraud or non-fraud using an LSTM-based neural network. The data comes from the IBM TabFormer repository: https://github.com/IBM/TabFormer/tree/main/data/credit_card

## Authors

Natalie Jann [natalie.jann@ibm.com](mailto:natalie.jann@ibm.com)

Sebastian Lehrig [sebastian.lehrig1@ibm.com](mailto:sebastian.lehrig1@ibm.com)

Marvin Giessing [MARVING@de.ibm.com](mailto:MARVING@de.ibm.com)

## License

Apache-2.0 License

## 0.) Imports & Constants

In [None]:
import json
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import os
import requests
from requests import post
from tensorflow import keras

%load_ext lab_black

In [None]:
# download data from https://ibm.ent.box.com/v/tabformer-data/file/770766751708 and upload here
!tar -xvf transactions.tgz

In [None]:
# Base container image; Task 1.3 asks for it when defining the preprocessing
# component.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Local checkout folder, Git source, and branch of the component catalog.
COMPONENT_CATALOG_FOLDER = f"{os.getenv('HOME')}/components"
COMPONENT_CATALOG_GIT = "https://github.com/lehrig/kubeflow-ppc64le-components.git"
COMPONENT_CATALOG_RELEASE = "main"

# Pipeline arguments; the keys mirror the parameters of fraud_pipeline.
# Environment variables override the training-related defaults.
ARGUMENTS = {
    "blackboard": "artefacts",
    "model_name": "fraud-detection",
    "cluster_configuration_secret": os.getenv(
        "CLUSTER_CONFIGURATION_SECRET", default=""
    ),
    "training_gpus": os.getenv("TRAINING_GPUS", default="1"),
    "training_node_selector": os.getenv("TRAINING_NODE_SELECTOR", default=""),
}
MODEL_NAME = ARGUMENTS["model_name"]

# The namespace ends up in a "Host" header and in the run's namespace
# argument, so strip any surrounding whitespace/newline from the file content.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()

# Bare expression: shows the effective arguments as the cell output.
ARGUMENTS

In [None]:
!git clone --branch $COMPONENT_CATALOG_RELEASE $COMPONENT_CATALOG_GIT $COMPONENT_CATALOG_FOLDER

In [None]:
# importfile is not imported by the cells above; it lives in the standard
# library's pydoc module and loads a Python source file as a module object.
from pydoc import importfile

# Load catalog.py from the cloned component catalog; the pipeline accesses the
# reusable components as attributes of this module (CATALOG.<name>).
CATALOG = importfile(f"{COMPONENT_CATALOG_FOLDER}/catalog.py")

## 1.) Component Definition: Dataset Loading, Rebalancing & Splitting

In [None]:
###############################################################################
# Task 1.1
###############################################################################
# (a) Find out where the load_dataframe_via_trino_comp component is used.
# (b) Update the Trino query to use an OFFSET of 20 (check the Trino
#     documentation in case you're unfamiliar with the OFFSET statement).
# (c) Which other parameters does the load_dataframe_via_trino_comp component
#     provide?
###############################################################################

# Build a pipeline component from the component definition file referenced by
# LOAD_DATAFRAME_VIA_TRINO_COMPONENT.
# NOTE(review): that constant is not defined in the visible cells, and the
# pipeline definition calls CATALOG.load_dataframe_via_trino_comp instead -
# confirm which of the two is intended.
load_dataframe_via_trino_comp = kfp.components.load_component_from_file(
    LOAD_DATAFRAME_VIA_TRINO_COMPONENT
)

In [None]:
###############################################################################
# Task 1.2
###############################################################################
# Define a function wrapping the code below to preprocess a dataframe. The
# function signature should have the following parameters:
# - dataframe: InputPath(str)
# - test_dataset_dir: OutputPath(str)
# - train_dataset_dir: OutputPath(str)
###############################################################################

from imblearn.over_sampling import RandomOverSampler
import math
import numpy as np
import os
import pandas as pd

def save_to_dir(x, y, directory):
    """Persist features and labels as ``data.npz`` inside *directory*.

    The directory (including missing parents) is created when needed. The
    archive stores the two arrays under the keys ``"x"`` and ``"y"``.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which could
    # race with another process creating the same directory.
    os.makedirs(directory, exist_ok=True)
    np.savez(os.path.join(directory, "data.npz"), x=x, y=y)

def split_dataset(n, df):
    """Cut *df* positionally into a leading and a trailing part.

    Returns a ``(test, train)`` tuple: the first *n* rows, followed by all
    remaining rows. All columns are kept in both parts.
    """
    leading, trailing = slice(None, n), slice(n, None)
    return df.iloc[leading, :], df.iloc[trailing, :]

def merge_splits(frauds, non_frauds):
    """Combine fraud and non-fraud rows into time-ordered features and labels.

    Both frames are concatenated and sorted by ``year_month_day_time``. The
    ``is fraud`` column becomes the label series ``y``; all other columns form
    ``x``. Only the last ``floor(len / 128) * 128`` rows are kept, i.e. the
    earliest rows are dropped so the length is a multiple of 128. With fewer
    than 128 rows nothing is dropped, because the slice start ``-0`` is ``0``.

    Returns:
        Tuple ``(x, y)`` of feature frame and label series.
    """
    # The ratio is informational only; an empty non-fraud split must not abort
    # preprocessing with ZeroDivisionError.
    ratio = len(frauds) / len(non_frauds) if len(non_frauds) else float("nan")
    print("Ratio fraud/non-fraud:", ratio)
    df = pd.concat([frauds, non_frauds]).sort_values("year_month_day_time")

    x, y = df.drop(["is fraud"], axis=1), df["is fraud"]
    # presumably 128 mirrors the batch size used when the saved arrays are
    # turned into a timeseries dataset - confirm
    keep = math.floor(len(x) / 128) * 128
    x, y = x.iloc[-keep:], y.iloc[-keep:]
    return x, y
    
# Body of the preprocessing step: load, shuffle, split, oversample, and save.
# NOTE(review): `dataframe`, `train_dataset_dir` and `test_dataset_dir` are not
# defined at cell level; Task 1.2 expects them to become function parameters.
dataset = pd.read_feather(dataframe)
# frac=1 returns every row in random order; no random_state is passed here, so
# the resulting split can differ between runs.
dataset = dataset.sample(frac=1)
frauds = dataset[dataset["is fraud"] == 1]
non_frauds = dataset[dataset["is fraud"] == 0]
print(f"{len(frauds)} Frauds and {len(non_frauds)} Non-Frauds.")

# Test split sizing: frauds amounting to 0.1% of all rows, topped up with
# non-frauds so that both counts together add up to 20% of the dataset.
n_frauds = int(0.001 * len(dataset))
n_non_frauds = int(len(dataset) * 0.2 - n_frauds)

# The first n rows of each (shuffled) class go to test, the rest to train;
# merge_splits then re-sorts each split by time and separates the label.
test_frauds, train_frauds = split_dataset(n_frauds, frauds)
test_non_frauds, train_non_frauds = split_dataset(n_non_frauds, non_frauds)
x_train, y_train = merge_splits(train_frauds, train_non_frauds)
x_test, y_test = merge_splits(test_frauds, test_non_frauds)

# Oversample only the training split; the test split keeps its class balance.
# presumably sampling_strategy=0.1 targets a minority/majority ratio of 0.1
# after resampling - confirm against the imbalanced-learn documentation.
over_sampler = RandomOverSampler(random_state=37, sampling_strategy=0.1)
train_input, train_target = over_sampler.fit_resample(x_train, y_train)
print(
    sum(train_target == 0),
    "negative &",
    sum(train_target == 1),
    "positive training samples (after upsampling)",
)
print(
    sum(y_test == 0),
    "negative &",
    sum(y_test == 1),
    "positive test samples",
)
# Re-attach the label as the first column so the resampled rows can be sorted
# by time again, then split features and label once more.
# NOTE(review): assigning dataset.columns only labels the columns correctly if
# "is fraud" is the first column of the loaded dataset - confirm.
train = pd.concat([pd.DataFrame(train_target), pd.DataFrame(train_input)], axis=1)
train.columns = dataset.columns
train.sort_values("year_month_day_time", inplace=True)
train_input, train_target = train.drop(["is fraud"], axis=1), train["is fraud"]

# Labels are stored as column vectors of shape (n, 1).
train_target = train_target.to_numpy().reshape(len(train_target), 1)
y_test = y_test.to_numpy().reshape(len(y_test), 1)

save_to_dir(train_input.to_numpy(), train_target, train_dataset_dir)
save_to_dir(x_test.to_numpy(), y_test, test_dataset_dir)

# List the output directories as a quick check of what was written.
print(f"Pre-processed train dataset saved. Contents of '{train_dataset_dir}':")
print(os.listdir(train_dataset_dir))
print(f"Pre-processed test dataset saved. Contents of '{test_dataset_dir}':")
print(os.listdir(test_dataset_dir))

###############################################################################
# Task 1.3
###############################################################################
# Define a component from the function above:
# - use the BASE_IMAGE
# - the component should install the package 'imbalanced-learn'
###############################################################################

## 2.) Component Definition: Model Training, Evaluation & Deployment

In [None]:
# Build the training-monitoring component from its component definition file.
# NOTE(review): MONITOR_TRAINING_COMPONENT is not defined in the visible cells,
# and the pipeline definition calls CATALOG.monitor_training_comp instead -
# confirm which of the two is intended.
monitor_training_comp = kfp.components.load_component_from_file(
    MONITOR_TRAINING_COMPONENT
)

In [None]:
###############################################################################
# Task 2.1
###############################################################################
# Define a function wrapping the code below to train the model. The function
# signature should have the following parameters:
# - model_dir: OutputPath(str)
# - test_dataset_dir: InputPath(str)
# - train_dataset_dir: InputPath(str)
# - seq_len: int = 7
###############################################################################

import numpy as np
import os
from tensorflow import keras
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.metrics import (
    TruePositives,
    FalsePositives,
    FalseNegatives,
    TrueNegatives,
)

def load_dataset(path):
    """Load ``data.npz`` from *path* and wrap it as a windowed Keras dataset.

    The archive's ``x`` and ``y`` arrays are turned into sliding windows of
    ``seq_len`` consecutive rows, batched in groups of 128.
    """
    data = np.load(os.path.join(path, "data.npz"))
    x, y = data["x"], data["y"]
    # seq_len comes from the enclosing scope: it is the parameter name that
    # Task 2.1 prescribes for the wrapping training function (the name
    # `seqlen` is not defined anywhere).
    return keras.preprocessing.timeseries_dataset_from_array(
        x, y, sequence_length=seq_len, batch_size=128
    )

# Body of the training step: build a two-layer LSTM classifier, train it on
# the preprocessed train split, evaluate on the test split, and save the model.
# NOTE(review): `model_dir`, `train_dataset_dir` and `test_dataset_dir` are not
# defined at cell level; Task 2.1 expects them to become function parameters.
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

train_dataset = load_dataset(train_dataset_dir)
test_dataset = load_dataset(test_dataset_dir)

# Peek at a single batch to learn the window length and feature count.
for batch in train_dataset.take(1):
    input_d, targets = batch
print("Input shape:", input_d.numpy().shape, "Target shape:", targets.numpy().shape)

# Model input is (window length, number of features); the batch axis is omitted.
input_shape = (input_d.shape[1], input_d.shape[2])
inputs = Input(shape=input_shape)
# NOTE(review): batch_size=7 is passed to the LSTM layers although the datasets
# are batched with 128 - confirm this keyword is intended.
lstm_in = LSTM(200, batch_size=7, return_sequences=True)(inputs)
lstm_out = LSTM(200, batch_size=7)(lstm_in)
# Single sigmoid unit: output is the predicted fraud probability.
outputs = Dense(1, activation="sigmoid")(lstm_out)
model = keras.Model(inputs=inputs, outputs=outputs)

# Accuracy plus the four confusion-matrix counts.
metrics = [
    "accuracy",
    TruePositives(name="tp"),
    FalsePositives(name="fp"),
    FalseNegatives(name="fn"),
    TrueNegatives(name="tn"),
]
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=metrics)
# NOTE(review): summary() presumably prints the table itself, so the outer
# print would add a trailing "None" line - confirm.
print(model.summary())

print("Initializing training callbacks...")
# All callbacks monitor the training loss; fit() below gets no validation data.
callbacks = [
    EarlyStopping(monitor="loss", patience=20, verbose=0, mode="min"),
    # NOTE(review): weights-only checkpoint written to a ".keras" path; newer
    # Keras releases may require a ".weights.h5" suffix - confirm against the
    # installed version.
    ModelCheckpoint(
        f"{model_dir}/best_model.keras",
        monitor="loss",
        save_best_only=True,
        save_weights_only=True,
        mode="min",
    ),
    ReduceLROnPlateau(
        monitor="loss",
        factor=0.1,
        patience=7,
        verbose=1,
        min_delta=0.0001,
        mode="min",
    ),
    # The log location must be provided via the environment; a missing
    # TENSORBOARD_S3_ADDRESS variable raises KeyError here.
    TensorBoard(
        log_dir=os.environ["TENSORBOARD_S3_ADDRESS"],
        histogram_freq=1,
    ),
]

###############################################################################
# Task 2.2
###############################################################################
# Make the number of epochs dynamic by adding it to the function signature.
###############################################################################

model.fit(
    train_dataset,
    epochs=10,
    verbose=3,
    callbacks=callbacks,
)

# Result order follows the compiled loss and metrics list.
results = model.evaluate(test_dataset)
print("Evaluation Loss, Accuracy, TP, FP, FN, TN:", results)

model.save(model_dir)

###############################################################################
# Task 2.3
###############################################################################
# (a) Read the description of the train_specification parameter of the
#     train_model_comp component inside its documentation.
# (b) Initialize a train_specification variable to be used as train
#     specification based on the function above.
###############################################################################

# Build the generic training component from its component definition file.
# NOTE(review): TRAIN_MODEL_COMPONENT is not defined in the visible cells, and
# the pipeline definition calls CATALOG.train_model_comp instead - confirm
# which of the two is intended.
train_model_comp = kfp.components.load_component_from_file(TRAIN_MODEL_COMPONENT)

In [None]:
# Build the ONNX conversion and model upload components from their component
# definition files.
# NOTE(review): neither CONVERT_MODEL_TO_ONNX_COMPONENT nor
# UPLOAD_MODEL_COMPONENT is defined in the visible cells, and the pipeline
# definition uses the CATALOG.* equivalents - confirm which is intended.
convert_model_to_onnx_comp = kfp.components.load_component_from_file(
    CONVERT_MODEL_TO_ONNX_COMPONENT
)

upload_model_comp = kfp.components.load_component_from_file(UPLOAD_MODEL_COMPONENT)

###############################################################################
# Task 3.1
###############################################################################
# Apply the schema of defining components from a file shown right above by
# instantiating a component for the model deployment using the file behind
# DEPLOY_MODEL_WITH_KSERVE_COMPONENT.
###############################################################################

## 3.) Pipeline Definition

In [None]:
###############################################################################
# Task 3.2
###############################################################################
# Add the number of epochs to train as a pipeline parameter. Don't forget to
# update the ARGUMENTS at the beginning of the notebook, too.
###############################################################################
@dsl.pipeline(
    name="Fraud detection",
    description="An example pipeline that tries to predict fraudulent credit card transactions",
)
def fraud_pipeline(
    blackboard: str,
    model_name: str,
    cluster_configuration_secret: str,
    training_gpus: int,
    training_node_selector: str,
):
    """Wire up the fraud-detection workflow from data loading to model upload.

    Steps: create the artefacts volume, load the transactions table via
    Trino, create a dataset quality report, preprocess the data, start
    training monitoring, train the model, plot a confusion matrix, convert
    the model to ONNX, and upload it. The deployment step is added by Task 3.4.

    Args:
        blackboard: Resource name of the volume created for pipeline artefacts.
        model_name: Passed to the training component as ``model_name`` and to
            the upload step as ``project_name``.
        cluster_configuration_secret: Forwarded to the training component.
        training_gpus: Forwarded to the training component as ``gpus``.
        training_node_selector: Forwarded to the training component as
            ``node_selector``.
    """
    create_blackboard = dsl.VolumeOp(
        name="Create Artefacts Blackboard",
        resource_name=blackboard,
        modes=dsl.VOLUME_MODE_RWO,
        size="4Gi",
        set_owner_reference=True,
    )

    load_dataframe_via_trino_task = CATALOG.load_dataframe_via_trino_comp(
        query="SELECT * FROM postgresql.public.transactions OFFSET 20",
        columns_query="SHOW COLUMNS FROM postgresql.public.transactions",
    )

    # No data dependency exists between the two, so order them explicitly:
    # the volume must exist before the first step runs.
    load_dataframe_via_trino_task.after(create_blackboard)

    create_dataset_quality_report_task = CATALOG.create_dataset_quality_report(
        dataset_dir=load_dataframe_via_trino_task.outputs["dataframe"],
        dataset_type="df/feather",
    )

    preprocess_dataset_task = preprocess_dataset_comp(
        dataframe=load_dataframe_via_trino_task.outputs["dataframe"]
    )

    monitor_training_task = CATALOG.monitor_training_comp().after(preprocess_dataset_task)

    ###########################################################################
    # Task 3.3
    ###########################################################################
    # Add the parameter 'epochs' from the train model function here and set it
    # to 3 epochs. Note how both train_specification and train_parameters are
    # used as parameters of the train_model_comp. The other parameters showcase
    # some of its additional features, e.g., cluster_configuration_secret
    # points to a secret holding the credentials to an external cluster for
    # train bursting.
    ###########################################################################
    train_parameters = {
        "train_dataset_dir": "train_dataset_dir",
        "test_dataset_dir": "validation_dataset_dir",
        "model_dir": "model_dir",
    }

    train_model_task = CATALOG.train_model_comp(
        preprocess_dataset_task.outputs["train_dataset_dir"],
        preprocess_dataset_task.outputs["test_dataset_dir"],
        train_specification,
        train_parameters,
        model_name=model_name,
        gpus=training_gpus,
        node_selector=training_node_selector,
        tensorboard_s3_address=monitor_training_task.outputs["tensorboard_s3_address"],
        cluster_configuration_secret=cluster_configuration_secret,
    )

    # train_parameters has no "seqlen" key, so indexing it raised KeyError
    # while the pipeline was being built. Read the name prescribed by Task 2.1
    # ("seq_len") and fall back to that parameter's default of 7.
    seq_len = int(train_parameters.get("seq_len", 7))

    # NOTE(review): the preprocessing function outlined in Task 1.2 declares
    # only train_dataset_dir/test_dataset_dir outputs - confirm that an
    # "output" output actually exists on preprocess_dataset_task.
    plot_confusion_matrix_task = CATALOG.plot_confusion_matrix_comp(
        input_columns=preprocess_dataset_task.outputs["output"],
        label_columns={"is fraud": [0, 1]},
        test_dataset_dir=preprocess_dataset_task.outputs["test_dataset_dir"],
        model_dir=train_model_task.outputs["model_dir"],
        seq_len=seq_len,
    )

    convert_model_to_onnx_task = CATALOG.convert_model_to_onnx_comp(
        train_model_task.outputs["model_dir"]
    )

    upload_model_task = CATALOG.upload_model_comp(
        file_dir=convert_model_to_onnx_task.outputs["onnx_model_dir"],
        project_name=model_name,
    )

    ###########################################################################
    # Task 3.4
    ###########################################################################
    # Add a new task below to run the model deployment component and set 
    # project_name=model_name and model_version to the "model_version" output 
    # of the upload_model_task.
    ###########################################################################

    deploy_model_with_kserve_task.after(upload_model_task)

In [None]:
def disable_cache_transformer(op):
    """Set a max cache staleness of ``"P0D"`` on *op* and return the same op.

    Container ops get the value on their caching strategy; every other op
    type receives it as a pod annotation instead.
    """
    if not isinstance(op, dsl.ContainerOp):
        op.add_pod_annotation(
            name="pipelines.kubeflow.org/max_cache_staleness", value="P0D"
        )
        return op

    op.execution_options.caching_strategy.max_cache_staleness = "P0D"
    return op


# Run configuration: apply the cache-disabling transformer to the pipeline's
# ops and pass data between steps through the blackboard volume.
pipeline_conf = PipelineConf()
pipeline_conf.add_op_transformer(disable_cache_transformer)
# The claim name combines the workflow name placeholder with the blackboard
# name; artefacts are stored below a path prefix named after the blackboard.
pipeline_conf.data_passing_method = data_passing_methods.KubernetesVolume(
    volume=V1Volume(
        name=ARGUMENTS["blackboard"],
        persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
            "{{workflow.name}}-%s" % ARGUMENTS["blackboard"]
        ),
    ),
    path_prefix=f'{ARGUMENTS["blackboard"]}/',
)

# No client object is created in the cells above, so instantiate one here
# before submitting the run.
# NOTE(review): assumes the default kfp.Client() configuration is valid inside
# this notebook environment - confirm.
client = kfp.Client()
client.create_run_from_pipeline_func(
    fraud_pipeline,
    arguments=ARGUMENTS,
    namespace=NAMESPACE,
    pipeline_conf=pipeline_conf,
)

## 4.) Inference

In [None]:
# Query the deployed model's metadata endpoint. The request goes to the
# predictor service name, while the namespace-qualified name is sent as the
# "Host" header.
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/{MODEL_NAME}"

res = requests.get(MODEL_ENDPOINT, headers=HEADERS)
response = json.loads(res.text)
# Bare expression: shows the parsed response as the cell output.
response

In [None]:
def get_data_table():
    """Fetch up to 20 rows of the ``transactions`` table from Trino.

    Returns:
        A DataFrame built from the raw result rows; no column names are
        passed, so pandas assigns its default integer column labels.
    """
    import pandas as pd
    from trino.dbapi import Connection

    connection_settings = {
        "host": "trino.trino",
        "port": "8080",
        "user": "anybody",
        "catalog": "postgresql",
        "schema": "public",
    }
    with Connection(**connection_settings) as connection:
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM transactions LIMIT 20")
        rows = cursor.fetchall()
        return pd.DataFrame(rows)


# Fetch the sample rows once; the inference cell reads them from `vdf`.
vdf = get_data_table()
print(f"Retrieved {len(vdf)} rows")

In [None]:
# Column 0 is used as the label; all remaining columns are the features.
x, y = vdf.drop([0], axis=1).to_numpy(), vdf[0].to_numpy().reshape(len(vdf), 1)
# Build sliding windows of 7 consecutive rows, as done for training data.
dataset = keras.preprocessing.timeseries_dataset_from_array(
    x, y, sequence_length=7, batch_size=128
)

HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
PREDICT_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/{MODEL_NAME}/infer"

# Send every window as its own single-sample inference request and compare the
# prediction with the label that belongs to the window.
for batch in dataset.take(10):
    input_d, output_d = batch[0], batch[1]
    for in_x, out_y in zip(input_d, output_d):
        window = in_x.numpy()
        payload = {
            "inputs": [
                {
                    "name": "input_1",
                    # Derived from the actual window rather than hard-coded as
                    # [1, 7, 103], so the declared shape always matches the
                    # data being sent.
                    "shape": [1, *window.shape],
                    "datatype": "FP32",
                    "data": window.tolist(),
                }
            ]
        }
        res = post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
        # Surface HTTP errors directly instead of failing later with a less
        # helpful JSON or KeyError on the error body.
        res.raise_for_status()
        response = json.loads(res.text)
        pred = response["outputs"][0]["data"][0]
        print(
            f"Actual ({out_y.numpy()[0]}) vs. Prediction ({round(pred, 3)} => {int(round(pred, 0))})"
        )