# Fraud Detection

Classify transactions as fraud or non-fraud using an LSTM-based neural network. The data comes from the IBM TabFormer credit card dataset: https://github.com/IBM/TabFormer/tree/main/data/credit_card

## Authors

Natalie Jann [natalie.jann@ibm.com](mailto:natalie.jann@ibm.com)

Sebastian Lehrig [sebastian.lehrig1@ibm.com](mailto:sebastian.lehrig1@ibm.com)

Marvin Giessing [MARVING@de.ibm.com](mailto:MARVING@de.ibm.com)

## License

Apache-2.0 License

## 0.) Imports & Constants

In [3]:
import os

import kfp
from kfp.components import (
    InputPath,
    OutputPath
)
import kfp.dsl as dsl
from kfp.dsl import (
    PipelineConf,
    data_passing_methods
)
from kubernetes.client.models import (
    V1Volume,
    V1PersistentVolumeClaimVolumeSource
)

In [4]:
# Kubeflow Pipelines client, used further down to submit the pipeline run.
# No host is passed here, so kfp's default endpoint resolution applies.
client = kfp.Client()

# Where the shared component catalog is cloned to, and which repository /
# branch it is cloned from.
COMPONENT_CATALOG_FOLDER = "/".join((str(os.environ.get('HOME')), "components"))
COMPONENT_CATALOG_GIT = "https://github.com/lehrig/kubeflow-ppc64le-components.git"
COMPONENT_CATALOG_RELEASE = "main"

# Component definitions (YAML) that are loaded from the cloned catalog.
CONVERT_MODEL_TO_ONNX_COMPONENT = "/".join(
    (COMPONENT_CATALOG_FOLDER, "model-building", "convert-to-onnx", "component.yaml")
)
UPLOAD_MODEL_COMPONENT = "/".join(
    (COMPONENT_CATALOG_FOLDER, "model-building", "upload-model", "component.yaml")
)
DEPLOY_MODEL_WITH_KSERVE_COMPONENT = "/".join(
    (COMPONENT_CATALOG_FOLDER, "model-deployment", "deploy-model-with-kserve", "component.yaml")
)

# Container image for the components defined in this notebook.
BASE_IMAGE = "quay.io/ibm/kubeflow-notebook-image-ppc64le:latest"

# Arguments handed to the pipeline run; the keys mirror the parameters of
# `fraud_pipeline`.
MODEL_NAME = "fraud-detection"
ARGUMENTS = {
    'blackboard': 'artefacts',
    'model_name': MODEL_NAME,
    'cluster_configuration_secret': os.environ.get('CLUSTER_CONFIGURATION_SECRET', ''),
}

# Read the namespace from the service-account files mounted into the pod.
# Surrounding whitespace is stripped so that a trailing newline in the file
# cannot leak into the run namespace or the inference host name built from it.
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read().strip()

ARGUMENTS

{'blackboard': 'artefacts',
 'model_name': 'fraud-detection',
 'cluster_configuration_secret': ''}

In [5]:
!git clone --branch $COMPONENT_CATALOG_RELEASE $COMPONENT_CATALOG_GIT $COMPONENT_CATALOG_FOLDER

fatal: destination path '/home/jovyan/components' already exists and is not an empty directory.


## 1.) Component Definition: Dataset Loading, Rebalancing & Splitting

In [9]:
## define a function wrapping the code below to load the dataset 
# the function signature should have the following parameters: 
# columns: list, test_dataset_dir: OutputPath(str), train_dataset_dir: OutputPath(str)


import math
import os
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from trino.dbapi import Connection

def save_to_dir(x, y, directory):
    """Store ``x`` and ``y`` as ``<directory>/data.npz``.

    The directory (including missing parents) is created on demand; an
    already existing directory is reused. The arrays are written under the
    keys ``'x'`` and ``'y'``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    # np.savez appends the '.npz' suffix to the file name itself.
    np.savez(os.path.join(directory, 'data'), x=x, y=y)

# Pull the transactions table from Trino into a DataFrame.
with Connection(
    host='trino.trino',
    port='8080',
    user="anybody",
    catalog='postgresql',
    schema='public',
) as conn:
    link = conn.cursor()
    # update the query below to retrieve only 999980 transactions
    link.execute('SELECT * FROM transactions')
    tdf = pd.DataFrame(link.fetchall())
tdf.columns = columns
print(f'Retrieved {len(tdf)} rows')

# Hold out 20% of the rows for evaluation; 'Is Fraud?' is the label column.
X_train, X_test, y_train, y_test = train_test_split(
    tdf.drop(['Is Fraud?'], axis=1),
    tdf['Is Fraud?'],
    test_size=0.2,
    random_state=37
)

# Trim both splits to a multiple of 128 rows by keeping the trailing rows.
# NOTE: with fewer than 128 rows min_ind is 0 and `[-0:]` keeps every row.
min_ind = math.floor(len(X_train)/128)
X_train, y_train = X_train[-min_ind*128:], y_train[-min_ind*128:]
min_ind = math.floor(len(X_test)/128)
X_test, y_test = X_test[-min_ind*128:].to_numpy(), y_test[-min_ind*128:].to_numpy()

# Rebalance the training split only: oversample the minority class until it
# reaches half the size of the majority class.
over_sampler = RandomOverSampler(random_state=37, sampling_strategy=0.5)
train_input, train_target = over_sampler.fit_resample(X_train, y_train)
print(sum(train_target==0), 'negative &', sum(train_target==1), 'positive samples (after upsampling)')

# Convert to numpy and give the targets an explicit (n, 1) shape.
train_input = train_input.to_numpy()
train_target = train_target.to_numpy().reshape(len(train_target), 1)
y_test = y_test.reshape(len(y_test), 1)

# Persist the rebalanced training data (not the raw split), so the
# oversampling above actually reaches the training component.
save_to_dir(train_input, train_target, train_dataset_dir)
save_to_dir(X_test, y_test, test_dataset_dir)

print(f"Pre-processed train dataset saved. Contents of '{train_dataset_dir}':")
print(os.listdir(train_dataset_dir))
print(f"Pre-processed test dataset saved. Contents of '{test_dataset_dir}':")
print(os.listdir(test_dataset_dir))

## define a component from the function above 
# use the BASE_IMAGE
# the component should install the package 'imbalanced-learn'



## 2.) Component Definition: Model Training, Evaluation & Deployment

In [10]:
## define a function wrapping the code below to train the model
# the function signature should have the following parameters:
# model_dir: OutputPath(str), test_dataset_dir: InputPath(str), train_dataset_dir: InputPath(str), seq_len: int = 7


import os
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.metrics import TruePositives, FalsePositives, FalseNegatives, TrueNegatives

# Metrics reported during training and evaluation: accuracy followed by the
# four confusion-matrix counts.
metrics = [
    'accuracy',
    TruePositives(name='tp'),
    FalsePositives(name='fp'),
    FalseNegatives(name='fn'),
    TrueNegatives(name='tn'),
]

def load_dataset(path):
    """Build a batched sliding-window dataset from ``<path>/data.npz``.

    The archive's ``'x'`` and ``'y'`` arrays are turned into windows of
    ``seq_len`` consecutive rows, delivered in batches of 128.
    """
    archive = np.load(os.path.join(path, 'data.npz'))
    return keras.preprocessing.timeseries_dataset_from_array(
        archive['x'],
        archive['y'],
        sequence_length=seq_len,
        batch_size=128,
    )

train_dataset = load_dataset(train_dataset_dir)
test_dataset = load_dataset(test_dataset_dir)

# Peek at a single batch to learn the input shape the model has to accept.
for batch in train_dataset.take(1):
    input_d, targets = batch
print("Input shape:", input_d.numpy().shape, "Target shape:", targets.numpy().shape)

# Two stacked LSTM layers followed by a single sigmoid unit for the binary
# fraud / non-fraud decision. The batch dimension (axis 0) is left out of the
# declared input shape.
input_shape = (input_d.shape[1], input_d.shape[2])
inputs = Input(shape=input_shape)
lstm_in = LSTM(200, batch_size=7, return_sequences=True)(inputs)
lstm_out = LSTM(200, batch_size=7)(lstm_in)
outputs = Dense(1, activation='sigmoid')(lstm_out)
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss="binary_crossentropy", metrics=metrics)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

# make the number of epochs & steps per epoch dynamic by adding it to the function signature
model.fit(train_dataset, epochs=5, steps_per_epoch=500, verbose=3)

results = model.evaluate(test_dataset)
print('Evaluation Loss, Accuracy, TP, FP, FN, TN:', results)

# Create the output directory if needed, then write the trained model there.
os.makedirs(model_dir, exist_ok=True)
model.save(model_dir)


## define a component from the function above 
# use the BASE_IMAGE



In [11]:
# Reusable components are created from their YAML definition in the catalog:
# one load_component_from_file call per component file.
convert_model_to_onnx_comp = kfp.components.load_component_from_file(CONVERT_MODEL_TO_ONNX_COMPONENT)

upload_model_comp = kfp.components.load_component_from_file(UPLOAD_MODEL_COMPONENT)

## apply the schema of defining components from a file shown right above 
# define a component for the model deployment using DEPLOY_MODEL_WITH_KSERVE_COMPONENT



## 3.) Pipeline Definition

In [14]:
# Pipeline wiring: create the blackboard volume, load the dataset, train the
# model, convert it to ONNX, upload it and (once the exercise below is done)
# deploy it.
# NOTE(review): `columns`, `load_dataset_comp`, `train_model_comp` and
# `deploy_model_with_kserve_task` are not defined anywhere in the visible
# notebook; they have to be provided by the exercises before this compiles.
@dsl.pipeline(
  name='Fraud detection pipeline',
  description='An example pipeline that tries to predict fraudulent credit card transactions'
)
def fraud_pipeline(
    blackboard: str,
    model_name: str,
    ## add the number of epochs to train as a pipeline parameter here
    # don't forget to update the ARGUMENTS at the beginning of the notebook, too
    cluster_configuration_secret: str
):
    # NOTE(review): `cluster_configuration_secret` is not referenced by any
    # task in the visible body.

    # Volume named after `blackboard`, created before any other task runs.
    create_blackboard = dsl.VolumeOp(
        name='Create Artefacts Blackboard',
        resource_name=blackboard,
        modes=dsl.VOLUME_MODE_RWO,
        size="4Gi",
        set_owner_reference=True
    )

    load_dataset_task = load_dataset_comp(
        columns=columns
    )
    # Explicit ordering: the dataset task takes no output of the volume op,
    # so the dependency has to be declared by hand.
    load_dataset_task.after(create_blackboard)

    train_model_task = train_model_comp(
        ## set the parameter 'epochs' and 'steps_per_epoch' from the train model function here
        # use other than the default values

        test_dataset_dir=load_dataset_task.outputs['test_dataset_dir'],
        train_dataset_dir=load_dataset_task.outputs['train_dataset_dir']
    )
    # The remaining tasks are chained through their inputs/outputs.
    convert_model_to_onnx_task = convert_model_to_onnx_comp(
        train_model_task.outputs['model_dir']
    )

    upload_model_task = upload_model_comp(
        convert_model_to_onnx_task.outputs['onnx_model_dir'],
        model_name=model_name
    )
    ## add a new task below to run the model deployment component
    # set model_name=model_name

    # Deployment must wait for the upload; the task itself is created by the
    # exercise above.
    deploy_model_with_kserve_task.after(upload_model_task)

In [15]:
def disable_cache_transformer(op):
    """Set the maximum cache staleness of ``op`` to ``"P0D"`` and return it.

    Container ops are configured through their execution options; every
    other kind of op gets the equivalent pod annotation instead.
    """
    if not isinstance(op, dsl.ContainerOp):
        op.add_pod_annotation(name="pipelines.kubeflow.org/max_cache_staleness", value="P0D")
        return op

    op.execution_options.caching_strategy.max_cache_staleness = "P0D"
    return op


# Volume through which the tasks exchange their artefacts. The claim name is
# the workflow name followed by the blackboard name (presumably the claim
# created by the VolumeOp in `fraud_pipeline` -- verify against the cluster).
blackboard_name = ARGUMENTS["blackboard"]
blackboard_volume = V1Volume(
    name=blackboard_name,
    persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
        "{{workflow.name}}-%s" % blackboard_name
    ),
)

# Run configuration: apply the cache transformer to every op and pass data
# between tasks via the blackboard volume.
pipeline_conf = PipelineConf()
pipeline_conf.add_op_transformer(disable_cache_transformer)
pipeline_conf.data_passing_method = data_passing_methods.KubernetesVolume(
    volume=blackboard_volume,
    path_prefix=f'{blackboard_name}/',
)

# Submit the run; as the last expression of the cell its result is displayed.
client.create_run_from_pipeline_func(
    fraud_pipeline,
    arguments=ARGUMENTS,
    namespace=NAMESPACE,
    pipeline_conf=pipeline_conf
)

RunPipelineResult(run_id=61a00ca6-bed5-4090-b6c0-c7bf0fef770f)

## 4.) Inference

In [None]:
import json

# `requests.get` is used below, so the module has to be imported in this cell.
import requests

# Query the deployed model's metadata endpoint. The Host header carries the
# namespace-qualified predictor name.
HOST = f'{MODEL_NAME}-predictor-default.{NAMESPACE}'
HEADERS = {'Host': HOST}
MODEL_ENDPOINT = f'http://{MODEL_NAME}-predictor-default/v2/models/{MODEL_NAME}'

res = requests.get(MODEL_ENDPOINT, headers=HEADERS)
response = json.loads(res.text)
response

In [None]:
import json
import os
import numpy as np
import pandas as pd
from requests import post
from tensorflow import keras
from trino.dbapi import Connection

# Fetch the rows after the first 999980 transactions as validation data.
with Connection(
    host='trino.trino',
    port='8080',
    user="anybody",
    catalog='postgresql',
    schema='public',
) as conn:
    link = conn.cursor()
    link.execute('SELECT * FROM transactions OFFSET 999980')
    vdf = pd.DataFrame(link.fetchall())
vdf.columns = columns
# Report and reshape using the validation frame itself (`vdf`), not the
# training frame `tdf`, whose length differs and which may not exist here.
print(f'Retrieved {len(vdf)} rows')

x, y = vdf.drop(['Is Fraud?'], axis=1).to_numpy(), vdf['Is Fraud?'].to_numpy().reshape(len(vdf), 1)
dataset = keras.preprocessing.timeseries_dataset_from_array(
        x, y, sequence_length=7, batch_size=128)

HOST = f'{MODEL_NAME}-predictor-default.{NAMESPACE}'
HEADERS = {'Host': HOST}
PREDICT_ENDPOINT = f'http://{MODEL_NAME}-predictor-default/v2/models/{MODEL_NAME}/infer'

# Send the windows of the first 10 batches to the model one at a time and
# print each label next to the returned prediction.
for batch in dataset.take(10):
    input_d, output_d = batch[0], batch[1]
    for in_x, out_y in zip(input_d, output_d):
        payload = {
          "inputs": [{
              "name": "input_1",
              "shape": [1, 7, 103],
              "datatype": "FP32",
              "data": in_x.numpy().tolist()
            }
          ]
        }
        res = post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
        response = json.loads(res.text)
        print("Actual vs. Prediction", out_y, round(response['outputs'][0]['data'][0], 3))