## Energy-based Flow Classifier (EFC) - Pure Python

### Applying EFC to Bitcoin blockchain in the presence of label scarcity

In [None]:
from math import ceil
from pprint import pprint

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

In [None]:
from efc_python.classification_functions import (
    one_class_fit,
    one_class_predict,
)
from efc_python.generic_discretize import discretize, get_intervals

In [None]:
from research_aml_elliptic.src.reaml.model_performance import calculate_model_score

In [None]:
from research_aml_elliptic.src.experiments.general_functions.elliptic_data_preprocessing import run_elliptic_preprocessing_pipeline

In [None]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

In [None]:
# Elliptic data set split configuration. All three values are forwarded
# unchanged to run_elliptic_preprocessing_pipeline by every experiment below.
only_labeled = True
last_train_time_step = 34
last_time_step = 49

In [None]:
# Load the Elliptic train/test split.
# Label encoding as noted by the author (presumably applied by the pipeline):
#   '1'       -> 1  class1 (illicit)
#   '2'       -> 0  class2 (licit)
#   'unknown' -> 2  dropped
X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# Derive the discretization intervals from the training features only.
# NOTE(review): the second argument (10) is presumably the number of bins --
# confirm against get_intervals.
intervals = get_intervals(X_train, 10)

In [None]:
# Apply the train-derived intervals to both splits (train first, then test).
X_train, X_test = [discretize(frame, intervals) for frame in (X_train, X_test)]

In [None]:
# Locate the abnormal (label == 1) samples in the training targets.
# np.where returns positional row offsets, not pandas index labels.
idx_abnormal = np.where(y_train == 1)[0]

In [None]:
# Remove the abnormal samples so EFC trains on benign instances only.
# idx_abnormal holds positional offsets (it comes from np.where), whereas
# drop() matches index labels, so translate positions to labels first. For a
# default 0..n-1 index this selects exactly the same rows as before; for any
# other index it removes the intended rows instead of unrelated ones.
X_train.drop(index=X_train.index[idx_abnormal], inplace=True)

In [None]:
# Remove the training targets of the abnormal samples.
# idx_abnormal holds positional offsets (it comes from np.where), whereas
# drop() matches index labels, so translate positions to labels first. For a
# default 0..n-1 index this selects exactly the same rows as before.
y_train.drop(index=y_train.index[idx_abnormal], inplace=True)

In [None]:
# EFC's hyperparameters
# Q is the largest discretized value found in the test set, cast to an integer
# exactly as the other experiments in this notebook do.
# NOTE(review): Q is taken from the test split only; confirm that the training
# split cannot contain a larger discretized value.
Q = np.int64(X_test.values.max())
LAMBDA = 0.5  # pseudocount parameter

In [None]:
# Train the one-class EFC model on the prepared training matrix. The two
# trailing return values of one_class_fit are not needed and are discarded.
coupling, h_i, cutoff, _, _ = one_class_fit(
    np.array(X_train),
    Q,
    LAMBDA,
)

In [None]:
# Classify the discretized test set with the fitted parameters; keep both the
# predictions and the energies returned by one_class_predict.
y_predicted, energies = one_class_predict(
    np.array(X_test),
    coupling,
    h_i,
    cutoff,
    Q,
)

In [None]:
# Collect results: report the confusion matrix of the test predictions.
print("Single-class results")
print(
    'confusion_matrix',
    confusion_matrix(y_true=y_test, y_pred=y_predicted),
)

In [None]:
# Display the predictions returned by one_class_predict.
y_predicted

In [None]:
# Display the ground-truth test targets as a raw array, for comparison with
# the predictions.
y_test.values

In [None]:
# Score the predictions with the project's helper, requesting the "f1" metric.
model_score = calculate_model_score(
    y_true=y_test.values,
    y_pred=y_predicted,
    metric="f1",
)

In [None]:
# Display the score computed by calculate_model_score.
model_score

In [None]:
# Short aliases used by the metric computations that follow.
y_true, y_pred = y_test.values, y_predicted

In [None]:
# Evaluate the predictions with the usual binary-classification metrics.
# NOTE(review): roc_auc is computed from the predicted labels (y_pred), not
# from the continuous `energies`; confirm that this is intended.
metric_dict = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, pos_label=1),
    f1_micro=f1_score(y_true, y_pred, average="micro"),
    f1_macro=f1_score(y_true, y_pred, average="macro"),
    precision=precision_score(y_true, y_pred),
    recall=recall_score(y_true, y_pred),
    roc_auc=roc_auc_score(y_true, y_pred),
)

In [None]:
# Pretty-print the collected metrics.
pprint(metric_dict)

### Train EFC With 5% Labeled Elliptic Data Set

In [None]:
# Reload a fresh Elliptic train/test split (pipeline from the reaml repo) so
# this experiment starts from unmodified data.
X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# Display the test targets of the freshly loaded split.
y_test

In [None]:
# EFC preprocessing: derive the discretization intervals from the training
# features, then apply those same intervals to both splits (train, then test).
intervals = get_intervals(X_train, 10)
X_train, X_test = [discretize(frame, intervals) for frame in (X_train, X_test)]

In [None]:
# Positions of the illicit (label == 1) samples in the training targets.
# np.where returns positional row offsets, not pandas index labels.
indices_illicit = np.where(y_train == 1)[0]

In [None]:
# Pick 95% of the illicit samples to discard, leaving 5% labeled.
# replace=False is required: np.random.choice samples WITH replacement by
# default, so the draw would contain duplicates and far fewer than 95% distinct
# rows would actually be removed from the training set.
# NOTE(review): the draw is unseeded, so the retained subset differs per run.
drop_indices_illicit = np.random.choice(
    indices_illicit,
    size=ceil(len(indices_illicit) * 0.95),
    replace=False,
)

In [None]:
# Drop the sampled illicit (class1) rows from the training data; the illicit
# rows that were not sampled stay in the training set.
# drop_indices_illicit holds positional offsets (derived from np.where),
# whereas drop() matches index labels, so translate positions to labels first.
# For a default 0..n-1 index this selects exactly the same rows as before.
X_train.drop(index=X_train.index[drop_indices_illicit], inplace=True)
y_train.drop(index=y_train.index[drop_indices_illicit], inplace=True)

In [None]:
# Display the training features that remain after the drop.
X_train

In [None]:
# EFC's hyperparameters
LAMBDA = 0.5  # pseudocount parameter
# Largest discretized value in the test set, as an integer.
Q = np.int64(np.max(X_test.values))

In [None]:
# Train the one-class EFC model on the prepared training matrix. The two
# trailing return values of one_class_fit are not needed and are discarded.
coupling, h_i, cutoff, _, _ = one_class_fit(
    np.array(X_train),
    Q,
    LAMBDA,
)

In [None]:
# Classify the discretized test set with the fitted parameters; keep both the
# predictions and the energies returned by one_class_predict.
y_predicted, energies = one_class_predict(
    np.array(X_test),
    coupling,
    h_i,
    cutoff,
    Q,
)

In [None]:
# Score the predictions with the project's helper ("f1" metric) and display it.
model_score = calculate_model_score(
    y_true=y_test.values,
    y_pred=y_predicted,
    metric="f1",
)
model_score

In [None]:
# Short aliases used by the metric computations that follow.
y_true, y_pred = y_test.values, y_predicted

In [None]:
# Evaluate the predictions with the usual binary-classification metrics.
# NOTE(review): roc_auc is computed from the predicted labels (y_pred), not
# from the continuous `energies`; confirm that this is intended.
metric_dict = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, pos_label=1),
    f1_micro=f1_score(y_true, y_pred, average="micro"),
    f1_macro=f1_score(y_true, y_pred, average="macro"),
    precision=precision_score(y_true, y_pred),
    recall=recall_score(y_true, y_pred),
    roc_auc=roc_auc_score(y_true, y_pred),
)

In [None]:
# Pretty-print the collected metrics.
pprint(metric_dict)

### Train EFC With 10% Labeled Elliptic Data Set

In [None]:
# Reload a fresh Elliptic train/test split (pipeline from the reaml repo) so
# this experiment starts from unmodified data.
X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# EFC preprocessing: derive the discretization intervals from the training
# features, then apply those same intervals to both splits (train, then test).
intervals = get_intervals(X_train, 10)
X_train, X_test = [discretize(frame, intervals) for frame in (X_train, X_test)]

In [None]:
# Positions of the illicit (label == 1) samples in the training targets.
# np.where returns positional row offsets, not pandas index labels.
indices_illicit = np.where(y_train == 1)[0]

In [None]:
# Pick 90% of the illicit samples to discard, leaving 10% labeled.
# replace=False is required: np.random.choice samples WITH replacement by
# default, so the draw would contain duplicates and far fewer than 90% distinct
# rows would actually be removed from the training set.
# NOTE(review): the draw is unseeded, so the retained subset differs per run.
drop_indices_illicit = np.random.choice(
    indices_illicit,
    size=ceil(len(indices_illicit) * 0.90),
    replace=False,
)

In [None]:
# Drop the sampled illicit (class1) rows from the training data; the illicit
# rows that were not sampled stay in the training set.
# drop_indices_illicit holds positional offsets (derived from np.where),
# whereas drop() matches index labels, so translate positions to labels first.
# For a default 0..n-1 index this selects exactly the same rows as before.
X_train.drop(index=X_train.index[drop_indices_illicit], inplace=True)
y_train.drop(index=y_train.index[drop_indices_illicit], inplace=True)

In [None]:
# Display the training features that remain after the drop.
X_train

In [None]:
# EFC's hyperparameters
LAMBDA = 0.5  # pseudocount parameter
# Largest discretized value in the test set, as an integer.
Q = np.int64(np.max(X_test.values))

In [None]:
# Train the one-class EFC model on the prepared training matrix. The two
# trailing return values of one_class_fit are not needed and are discarded.
coupling, h_i, cutoff, _, _ = one_class_fit(
    np.array(X_train),
    Q,
    LAMBDA,
)

In [None]:
# Classify the discretized test set with the fitted parameters; keep both the
# predictions and the energies returned by one_class_predict.
y_predicted, energies = one_class_predict(
    np.array(X_test),
    coupling,
    h_i,
    cutoff,
    Q,
)

In [None]:
# Score the predictions with the project's helper ("f1" metric) and display it.
model_score = calculate_model_score(
    y_true=y_test.values,
    y_pred=y_predicted,
    metric="f1",
)
model_score

In [None]:
# Short aliases used by the metric computations that follow.
y_true, y_pred = y_test.values, y_predicted

In [None]:
# Evaluate the predictions with the usual binary-classification metrics.
# NOTE(review): roc_auc is computed from the predicted labels (y_pred), not
# from the continuous `energies`; confirm that this is intended.
metric_dict = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, pos_label=1),
    f1_micro=f1_score(y_true, y_pred, average="micro"),
    f1_macro=f1_score(y_true, y_pred, average="macro"),
    precision=precision_score(y_true, y_pred),
    recall=recall_score(y_true, y_pred),
    roc_auc=roc_auc_score(y_true, y_pred),
)

In [None]:
# Pretty-print the collected metrics.
pprint(metric_dict)

In [None]:
# Collect results: report the confusion matrix of the test predictions.
print("Single-class results")
print(
    'confusion_matrix',
    confusion_matrix(y_true=y_test, y_pred=y_predicted),
)

### Train EFC With 100% Labeled Elliptic Data Set

In [None]:
# Reload a fresh Elliptic train/test split (pipeline from the reaml repo) so
# this experiment starts from unmodified data.
X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# EFC preprocessing: derive the discretization intervals from the training
# features, then apply those same intervals to both splits (train, then test).
intervals = get_intervals(X_train, 10)
X_train, X_test = [discretize(frame, intervals) for frame in (X_train, X_test)]

In [None]:
# EFC's hyperparameters
LAMBDA = 0.5  # pseudocount parameter
# Largest discretized value in the test set, as an integer.
Q = np.int64(np.max(X_test.values))

In [None]:
# Train the one-class EFC model. No rows were dropped in this experiment, so
# the training matrix still contains every labeled sample from the split.
# The two trailing return values of one_class_fit are discarded.
coupling, h_i, cutoff, _, _ = one_class_fit(
    np.array(X_train),
    Q,
    LAMBDA,
)

In [None]:
# Classify the discretized test set with the fitted parameters; keep both the
# predictions and the energies returned by one_class_predict.
y_predicted, energies = one_class_predict(
    np.array(X_test),
    coupling,
    h_i,
    cutoff,
    Q,
)

In [None]:
# Score the predictions with the project's helper ("f1" metric) and display it.
model_score = calculate_model_score(
    y_true=y_test.values,
    y_pred=y_predicted,
    metric="f1",
)
model_score

In [None]:
# Short aliases used by the metric computations that follow.
y_true, y_pred = y_test.values, y_predicted

In [None]:
# Evaluate the predictions with the usual binary-classification metrics.
# NOTE(review): roc_auc is computed from the predicted labels (y_pred), not
# from the continuous `energies`; confirm that this is intended.
metric_dict = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, pos_label=1),
    f1_micro=f1_score(y_true, y_pred, average="micro"),
    f1_macro=f1_score(y_true, y_pred, average="macro"),
    precision=precision_score(y_true, y_pred),
    recall=recall_score(y_true, y_pred),
    roc_auc=roc_auc_score(y_true, y_pred),
)

In [None]:
# Pretty-print the collected metrics.
pprint(metric_dict)

### Train EFC Without Labeled Data

In [None]:
# Reload a fresh Elliptic train/test split (pipeline from the reaml repo) so
# this experiment starts from unmodified data.
X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# EFC preprocessing: derive the discretization intervals from the training
# features, then apply those same intervals to both splits (train, then test).
intervals = get_intervals(X_train, 10)
X_train, X_test = [discretize(frame, intervals) for frame in (X_train, X_test)]

In [None]:
# Find every abnormal (label == 1) training sample and remove it, so EFC is
# trained on benign instances only.
# np.where yields positional offsets, whereas drop() matches index labels, so
# positions are translated to labels before dropping. For a default 0..n-1
# index this selects exactly the same rows as passing the positions directly.
idx_abnormal = np.where(y_train == 1)[0]
X_train.drop(index=X_train.index[idx_abnormal], inplace=True)
y_train.drop(index=y_train.index[idx_abnormal], inplace=True)

In [None]:
# EFC's hyperparameters
LAMBDA = 0.5  # pseudocount parameter
# Largest discretized value in the test set, as an integer.
Q = np.int64(np.max(X_test.values))

In [None]:
# Train the one-class EFC model on the prepared training matrix. The two
# trailing return values of one_class_fit are not needed and are discarded.
coupling, h_i, cutoff, _, _ = one_class_fit(
    np.array(X_train),
    Q,
    LAMBDA,
)

In [None]:
# Classify the discretized test set with the fitted parameters; keep both the
# predictions and the energies returned by one_class_predict.
y_predicted, energies = one_class_predict(
    np.array(X_test),
    coupling,
    h_i,
    cutoff,
    Q,
)

In [None]:
# Score the predictions with the project's helper ("f1" metric) and display
# the result, matching the 5% / 10% / 100% experiments; previously the score
# was computed here but never shown.
model_score = calculate_model_score(
    y_true=y_test.values,
    y_pred=y_predicted,
    metric="f1",
)
model_score

In [None]:
# Short aliases used by the metric computations that follow.
y_true, y_pred = y_test.values, y_predicted

In [None]:
# Evaluate the predictions with the usual binary-classification metrics.
# NOTE(review): roc_auc is computed from the predicted labels (y_pred), not
# from the continuous `energies`; confirm that this is intended.
metric_dict = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, pos_label=1),
    f1_micro=f1_score(y_true, y_pred, average="micro"),
    f1_macro=f1_score(y_true, y_pred, average="macro"),
    precision=precision_score(y_true, y_pred),
    recall=recall_score(y_true, y_pred),
    roc_auc=roc_auc_score(y_true, y_pred),
)

In [None]:
# Pretty-print the collected metrics.
pprint(metric_dict)