Ref: https://fraud-detection-handbook.github.io/fraud-detection-handbook

In [1]:
import h2o
import json
import utils
import transform
import train
import explain
import configs.base_config as C
import evaluate
import pandas as pd

# Start H2O
# Initialises the H2O session for this notebook; the matching
# h2o.cluster().shutdown() call is in the final cell.
# verbose=False is passed to reduce start-up output.
# NOTE(review): a version notice still shows up in this cell's output, so it is
# presumably emitted independently of the verbose flag - confirm.
h2o.init(verbose=False)


Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


In [2]:
# Specify model to train
model_name = "lof"

# Fail fast on an unsupported name. Each training cell in this notebook is guarded by
# `if model_name == ...`, so a typo would skip every one of them and only surface
# later as a NameError - or, with a stale kernel, silently evaluate and save a
# previously trained `model` under the wrong name.
SUPPORTED_MODELS = ("isolation_forest", "kmeans", "lof", "svm", "auto_encoder")
if model_name not in SUPPORTED_MODELS:
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of {SUPPORTED_MODELS}"
    )


In [3]:
# Define model parameters
# Loaded from the per-model JSON config: <CONFIG_DIR><model_name>.json.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open(f"{C.CONFIG_DIR}{model_name}.json", "r") as config_file:
    model_parameters = json.load(config_file)


In [4]:
# Load dataset
# Read the raw simulated data directory with the two date bounds below
raw_data_dir = C.DATA_DIR + 'simulated-data-raw/data/'
load_start, load_end = "2018-04-01", "2018-08-31"
data = utils.read_from_files(raw_data_dir, load_start, load_end)


  df_final = df_final.replace([-1], 0).infer_objects(copy=False)


In [5]:
# Transform data
# NOTE(review): `data` is overwritten with its own transformed version, so re-running
# this cell on its own passes already-transformed data back into extract_features.
# Re-run the load cell first unless extract_features is confirmed to be idempotent.
data = transform.extract_features(data)


In [6]:
# Split Train and Val
# Time-based split with a single cutoff: rows strictly before it go to train,
# rows at or after it go to validation. The cutoff is defined once so the two
# filters cannot drift apart (which would create a gap or an overlap).
VAL_START_DATE = "2018-08-01"
# Timestamp.value is nanoseconds since the epoch; // 1e9 converts it to seconds.
# NOTE(review): assumes TX_DATETIME holds epoch seconds at this point - confirm.
val_start_ts = pd.to_datetime(VAL_START_DATE).value // 1e9
train_data = data[data["TX_DATETIME"] < val_start_ts].copy()
val_data = data[data["TX_DATETIME"] >= val_start_ts].copy()


In [7]:
# Generate background_data
# Built from the training split and the model config. In this notebook it is not
# used for training or scoring; it is only handed to utils.save_model together
# with the model.
# NOTE(review): presumably the reference sample used by the `explain` module - confirm.
# The joblib/loky "found 0 physical cores" message appears in this cell's output;
# the following cells still ran, so it is non-fatal here.
background_data = explain.generate_background_dataset(train_data, model_parameters)


found 0 physical cores < 1
  File "c:\Users\mario\miniconda3\envs\ai-practice-projects\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [8]:
# Isolation Forest (unsupervised): fit on the whole training split, then score both splits
# Ref: https://github.com/h2oai/h2o-tutorials/blob/master/tutorials/isolation-forest/isolation-forest.ipynb

if model_name == "isolation_forest":
    model = train.train_isolation_forest(train_data, model_parameters)
    # Feature list is read once, after training, and shared by both scoring calls
    feature_names = model_parameters["feature_names"]
    train_predictions = evaluate.get_isoforest_predictions(model, train_data, feature_names)
    val_predictions = evaluate.get_isoforest_predictions(model, val_data, feature_names)


In [9]:
# K-Means (unsupervised): fit on the whole training split, then score both splits
# Ref: https://medium.com/@tommaso.romani2000/harnessing-the-power-of-k-means-for-anomaly-detection-24dc71d260a8

if model_name == "kmeans":
    model = train.train_kmeans(train_data, model_parameters)
    # Feature list is read once, after training, and shared by both scoring calls
    feature_names = model_parameters["feature_names"]
    train_predictions = evaluate.get_kmeans_predictions(model, train_data, feature_names)
    val_predictions = evaluate.get_kmeans_predictions(model, val_data, feature_names)


In [10]:
# Local Outlier Factor: an unsupervised algorithm, used here in a semi-supervised way
# for novelty detection - it is fitted on non-fraud training rows only, then scores
# the complete train and validation splits.
# Ref: https://scikit-learn.org/stable/auto_examples/neighbors/plot_lof_outlier_detection.html

if model_name == "lof":
    non_fraud_train = train_data[train_data["TX_FRAUD"] != 1]
    model = train.train_local_outlier_factor(non_fraud_train, model_parameters)
    # Feature list is read once, after training, and shared by both scoring calls
    feature_names = model_parameters["feature_names"]
    train_predictions = evaluate.get_lof_predictions(model, train_data, feature_names)
    val_predictions = evaluate.get_lof_predictions(model, val_data, feature_names)


In [11]:
# One-Class SVM (unsupervised): fit on the whole training split, then score both splits
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html

if model_name == "svm":
    model = train.train_svm(train_data, model_parameters)
    # Feature list is read once, after training, and shared by both scoring calls
    feature_names = model_parameters["feature_names"]
    train_predictions = evaluate.get_svm_predictions(model, train_data, feature_names)
    val_predictions = evaluate.get_svm_predictions(model, val_data, feature_names)


In [12]:
# Autoencoder (semi-supervised): training needs non-anomalous data, so the model is
# fitted on non-fraud training rows only; the complete train and validation splits
# are then scored.
# Ref: https://github.com/h2oai/h2o-tutorials/blob/master/best-practices/anomaly-detection/anomaly_detection.ipynb

if model_name == "auto_encoder":
    non_fraud_train = train_data[train_data["TX_FRAUD"] != 1]
    # Grid search is switched on; the grid object is returned alongside the model
    model, dl_grid = train.train_autoencoder(non_fraud_train, model_parameters, grid_search=True)
    # Feature list is read once, after training, and shared by both scoring calls
    feature_names = model_parameters["feature_names"]
    train_predictions = evaluate.get_autoencoder_predictions(model, train_data, feature_names)
    val_predictions = evaluate.get_autoencoder_predictions(model, val_data, feature_names)


In [13]:
# Calculate thresholds and get train score
# Both thresholds are derived from the training predictions and stored in
# model_parameters, which is then passed on to get_metrics.
output_name = model_parameters["output_name"]
soft_threshold, hard_threshold = evaluate.calculate_thresholds(train_predictions, output_name)
model_parameters["soft_threshold"] = soft_threshold
model_parameters["hard_threshold"] = hard_threshold

print("Train results")
train_labels = train_data["TX_FRAUD"]
train_scores = train_predictions[output_name]
train_metrics = evaluate.get_metrics(train_labels, train_scores, model_parameters, prefix="train")


Train results
Metrics: {'train_average_precision': 0.2764187361582989, 'train_roc_auc': 0.6931575634392415, 'train_soft_accuracy': 0.9481826039156279, 'train_soft_precision': 0.06274898695436593, 'train_soft_recall': 0.38774432118330693, 'train_soft_f1': 0.10801742406404521, 'train_soft_f2': 0.19045740915648646, 'train_hard_accuracy': 0.9869276743297345, 'train_hard_precision': 0.2509830740297487, 'train_hard_recall': 0.3101954569466455, 'train_hard_f1': 0.27746538770495677, 'train_hard_f2': 0.296218572178054}


In [14]:
# Evaluate model
# Validation predictions are scored with the model_parameters already holding
# the thresholds computed on the training split.

print("Validation results")
val_labels = val_data["TX_FRAUD"]
val_scores = val_predictions[model_parameters["output_name"]]
val_metrics = evaluate.get_metrics(val_labels, val_scores, model_parameters, prefix="val")


Validation results
Metrics: {'val_average_precision': 0.2622335434346816, 'val_roc_auc': 0.6793510304364113, 'val_soft_accuracy': 0.9476630282675623, 'val_soft_precision': 0.06610398379473328, 'val_soft_recall': 0.36680404645934805, 'val_soft_f1': 0.11202013845185652, 'val_soft_f2': 0.19206623244134036, 'val_hard_accuracy': 0.9861376656921557, 'val_hard_precision': 0.25853985264567986, 'val_hard_recall': 0.28924690895466465, 'val_hard_f1': 0.27303271441202476, 'val_hard_f2': 0.28253549992680427}


In [15]:
# Save model
# Persists the model together with the background data and parameters;
# the returned name is reused when logging results.
model_filename = utils.save_model(
    model,
    background_data,
    model_parameters,
    C.MODEL_DIR,
    model_name,
    is_h2o=model_parameters["is_h2o"],
)


Saved model and data at models/lof_v1


In [16]:
# Store results and save to a csv
# Logged per run: saved model name, parameters, and both metric sets
results_csv = C.LOG_DIR + "anomaly_detection.csv"
utils.log_results(results_csv, model_filename, model_parameters, train_metrics, val_metrics)


In [17]:
# Shutdown H2O
# Counterpart of the h2o.init() call in the first cell
h2o_cluster = h2o.cluster()
h2o_cluster.shutdown()
