### Dataset

In [None]:
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# The project root is resolved relative to the current working directory; its
# src/ folder is placed first on sys.path so modules in it can be imported.
ROOT_DIR = os.getcwd() + '/research_aml_elliptic'
sys.path.insert(0, os.path.join(ROOT_DIR, "src"))

In [None]:
# Settings for the Elliptic experiments: the last time step to use, the last
# time step that belongs to the training split, and the label filter.
last_time_step = 49
last_train_time_step = 34
only_labeled = True

In [None]:
def train_test_split(X, y, train_test_idx):
    """Slice ``X`` and ``y`` by index label into train and test partitions.

    ``train_test_idx`` maps ``"train"`` and ``"test"`` to index labels; both
    ``X`` and ``y`` are looked up with ``.loc`` using those labels.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    train_labels = train_test_idx["train"]
    test_labels = train_test_idx["test"]
    return (
        X.loc[train_labels],
        X.loc[test_labels],
        y.loc[train_labels],
        y.loc[test_labels],
    )

In [None]:
def setup_train_test_idx(
    X, last_train_time_step, last_time_step, aggregated_timestamp_column="time_step"
):
    """Build the train/test index labels from an integer time-step column.

    Rows whose ``aggregated_timestamp_column`` value is in
    ``0..last_train_time_step`` go to ``"train"``; rows with a value in
    ``last_train_time_step + 1..last_time_step`` go to ``"test"``. The column
    needs to hold integer values, such as year, month or day.

    Returns a dict with the keys ``"train"`` and ``"test"``, each holding the
    index labels of the matching rows of ``X``.
    """
    first_test_step = last_train_time_step + 1
    steps_by_split = {
        "train": list(range(first_test_step)),
        "test": list(range(first_test_step, last_time_step + 1)),
    }

    return {
        split: X[X[aggregated_timestamp_column].isin(steps)].index
        for split, steps in steps_by_split.items()
    }

In [None]:
def combine_dataframes(df_classes, df_features, only_labeled=True):
    """Left-join the class labels onto the feature table.

    Rows are matched on ``df_features["id"] == df_classes["txId"]``. Because
    the join is a left join, feature rows without a class entry are kept with
    a missing ``class`` value. The redundant ``txId`` key column is removed
    from the result.

    Args:
        df_classes: frame with ``txId`` and ``class`` columns.
        df_features: frame with an ``id`` column.
        only_labeled: when truthy, rows whose ``class`` equals 2 are removed
            and the index is renumbered from 0.

    Returns:
        The combined frame (feature columns followed by ``class``).
    """
    df_combined = pd.merge(
        df_features, df_classes, left_on="id", right_on="txId", how="left"
    )
    if only_labeled:
        # 2 is the code for rows that carry no label
        df_combined = df_combined[df_combined["class"] != 2].reset_index(drop=True)
    return df_combined.drop(columns=["txId"])

In [None]:
def rename_classes(df_classes):
    """Recode the string labels in ``df_classes["class"]`` to integers, in place.

    Mapping: ``"1"`` -> 1, ``"2"`` -> 0, ``"unknown"`` -> 2. Any other value is
    left unchanged. The same (mutated) frame is returned.
    """
    label_codes = {"1": 1, "2": 0, "unknown": 2}
    # Map element-wise instead of calling DataFrame.replace: replace() relies
    # on an implicit object -> int downcast that pandas 2.2 deprecated, which
    # would otherwise leave the labels in an object column.
    df_classes["class"] = df_classes["class"].map(
        lambda label: label_codes.get(label, label)
    )
    return df_classes

In [None]:
def rename_features(df_features):
    """Assign the Elliptic column names to ``df_features`` in place.

    The 167 names are ``id``, ``time_step``, ``trans_feat_0..92`` and
    ``agg_feat_0..71``, in that order. The same frame is returned.
    """
    trans_names = [f"trans_feat_{i}" for i in range(93)]
    agg_names = [f"agg_feat_{i}" for i in range(72)]
    df_features.columns = ["id", "time_step", *trans_names, *agg_names]
    return df_features

In [None]:
def import_elliptic_data_from_csvs():
    """Read the three Elliptic CSV files located under ``ROOT_DIR``.

    The features file is read without a header row. Returns the tuple
    ``(df_classes, df_edges, df_features)``.
    """
    classes_path = os.path.join(
        ROOT_DIR, "data/elliptic/dataset/elliptic_txs_classes.csv"
    )
    edges_path = os.path.join(
        ROOT_DIR, "data/elliptic/dataset/elliptic_txs_edgelist.csv"
    )
    features_path = os.path.join(
        ROOT_DIR, "data/elliptic/dataset/elliptic_txs_features.csv"
    )

    df_classes = pd.read_csv(classes_path)
    df_edges = pd.read_csv(edges_path)
    df_features = pd.read_csv(features_path, header=None)
    return df_classes, df_edges, df_features

In [None]:
def load_elliptic_data(only_labeled=True, drop_node_id=True):
    """Load the Elliptic CSVs and return the feature frame and the labels.

    The raw frames are read, renamed and combined by the helper functions in
    this notebook. ``only_labeled`` is forwarded to ``combine_dataframes``.
    With ``drop_node_id`` set to ``True`` the ``id`` column is removed from the
    features in addition to ``class``.

    Returns ``(X, y)`` where ``y`` is the ``class`` column.
    """
    print(f'load_elliptic_data {only_labeled}')
    raw_classes, _raw_edges, raw_features = import_elliptic_data_from_csvs()
    features = rename_features(raw_features)
    classes = rename_classes(raw_classes)
    combined = combine_dataframes(classes, features, only_labeled)

    dropped = ["id", "class"] if drop_node_id == True else "class"
    return combined.drop(columns=dropped), combined["class"]

In [None]:
def run_elliptic_preprocessing_pipeline(
    last_train_time_step, last_time_step, only_labeled=True, drop_node_id=True
):
    """Load the Elliptic data and split it into train and test by time step.

    Time steps up to ``last_train_time_step`` form the training split; the
    following ones up to ``last_time_step`` form the test split.

    Returns ``(X_train_df, X_test_df, y_train, y_test)``.
    """
    print(f'run_elliptic_preprocessing_pipeline {only_labeled}')
    features, labels = load_elliptic_data(only_labeled, drop_node_id)
    split_idx = setup_train_test_idx(features, last_train_time_step, last_time_step)
    return train_test_split(features, labels, split_idx)

In [None]:
# First experiment family: rows with the "unknown" class are kept as well.
only_labeled = False

X_train_df, X_test_df, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# Row counts of every split plus the number of class-1 (malicious) and
# class-0 (benign) labels in each of them.
sizes = {
    "X Size": len(X_train_df) + len(X_test_df),
    "y Size": len(y_train) + len(y_test),
    "X_train Size": len(X_train_df),
    "X_test Size": len(X_test_df),
    "y_train Size": len(y_train),
    "y_test Size": len(y_test),
    "y_train Malicious Size": int(np.count_nonzero(y_train == 1)),
    "y_train Bening Size": int(np.count_nonzero(y_train == 0)),
    "y_test Malicious Size": int(np.count_nonzero(y_test == 1)),
    "y_test Bening Size": int(np.count_nonzero(y_test == 0)),
}

## EFC Including Labeled and Unlabeled Samples

In [None]:
from efc import EnergyBasedFlowClassifier

### Only Benign Samples

In [None]:
# Collect the index *labels* of the abnormal (class 1) training samples.
# DataFrame.drop / Series.drop match against labels, whereas np.where() would
# return positions; the two only coincide while the training rows form an
# unbroken 0..n-1 prefix of the index.
idx_abnormal = y_train.index[y_train == 1]

In [None]:
# NOTE: drop() interprets idx_abnormal as index labels, not row positions.
X_train_df.drop(idx_abnormal, axis=0, inplace=True)  # remove abnormal samples from training (EFC trains with only benign instances)

In [None]:
y_train.drop(idx_abnormal, axis=0, inplace=True)  # remove the corresponding abnormal training targets

In [None]:
# Split sizes after the abnormal rows were removed from the training data,
# plus the class-1 (malicious) and class-0 (benign) label counts.
sizes = {
    "X Size": len(X_train_df) + len(X_test_df),
    "y Size": len(y_train) + len(y_test),
    "X_train Size": len(X_train_df),
    "X_test Size": len(X_test_df),
    "y_train Size": len(y_train),
    "y_test Size": len(y_test),
    "y_train Malicious Size": int(np.count_nonzero(y_train == 1)),
    "y_train Bening Size": int(np.count_nonzero(y_train == 0)),
    "y_test Malicious Size": int(np.count_nonzero(y_test == 1)),
    "y_test Bening Size": int(np.count_nonzero(y_test == 0)),
}

In [None]:
sizes

In [None]:
# Start the size table; row 0 holds the sizes of the benign-only training run.
df_sizes = pd.DataFrame(data=sizes, index=[0])

In [None]:
df_sizes

In [None]:
# Fresh classifier for this run.
clf = EnergyBasedFlowClassifier(n_bins=10, cutoff_quantile=0.95)

In [None]:
# Class 0 (benign) is passed as the base class.
clf.fit(X_train_df, y_train, base_class=0)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test_df, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)

plt.show()

In [None]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

In [None]:
def calculate_model_score(y_true, y_pred):
    """Score predictions against the true labels.

    Returns a dict with accuracy, weighted/micro/macro F1, and weighted
    precision and recall, keyed ``accuracy``, ``f1``, ``f1_micro``,
    ``f1_macro``, ``precision`` and ``recall``.
    """
    scores = {"accuracy": accuracy_score(y_true, y_pred)}
    scores["f1"] = f1_score(y_true, y_pred, average="weighted")
    scores["f1_micro"] = f1_score(y_true, y_pred, average="micro")
    scores["f1_macro"] = f1_score(y_true, y_pred, average="macro")
    scores["precision"] = precision_score(y_true, y_pred, average="weighted")
    scores["recall"] = recall_score(y_true, y_pred, average="weighted")
    return scores

In [None]:
# NOTE(review): this run was loaded with only_labeled=False, so y_test still
# contains class 2 ("unknown"); confirm the scores are meaningful if y_pred
# only ever holds the labelled classes.
model_score = calculate_model_score(y_true=y_test.values, y_pred=y_pred)

In [None]:
model_score

In [None]:
# Start the metrics table; row 0 holds the benign-only training run.
df_efc_metrics = pd.DataFrame(data=model_score, index=[0])

In [None]:
df_efc_metrics

### All Samples

In [None]:
# Re-run the pipeline so the training split contains the abnormal rows again
# (they were dropped in place for the previous run). only_labeled keeps the
# value set earlier in the notebook.
X_train_df, X_test_df, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
X_train_df

In [None]:
# Split sizes of the freshly loaded data, plus the class-1 (malicious) and
# class-0 (benign) label counts.
sizes = {
    "X Size": len(X_train_df) + len(X_test_df),
    "y Size": len(y_train) + len(y_test),
    "X_train Size": len(X_train_df),
    "X_test Size": len(X_test_df),
    "y_train Size": len(y_train),
    "y_test Size": len(y_test),
    "y_train Malicious Size": int(np.count_nonzero(y_train == 1)),
    "y_train Bening Size": int(np.count_nonzero(y_train == 0)),
    "y_test Malicious Size": int(np.count_nonzero(y_test == 1)),
    "y_test Bening Size": int(np.count_nonzero(y_test == 0)),
}

In [None]:
sizes

In [None]:
# Row 1 of the size table: the run trained on all samples.
df_sizes.loc[1] = sizes

In [None]:
df_sizes

In [None]:
# Fresh classifier, this time fitted on the full training split.
clf = EnergyBasedFlowClassifier(n_bins=10, cutoff_quantile=0.95)

In [None]:
# Class 0 (benign) is passed as the base class.
clf.fit(X_train_df, y_train, base_class=0)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test_df, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)

plt.show()

In [None]:
# Score the all-samples run.
model_score_1 = calculate_model_score(y_true=y_test.values, y_pred=y_pred)

In [None]:
model_score_1

In [None]:
# Row 1 of the metrics table: the run trained on all samples.
df_efc_metrics.loc[1] = model_score_1

In [None]:
df_efc_metrics

In [None]:
# display and HTML are imported from IPython.display, the public module.
from IPython.display import display, HTML

def display_side_by_side(dfs:list, captions:list):
    """Display tables side by side to save vertical space
    Input:
        dfs: list of pandas.DataFrame
        captions: list of table captions, paired with ``dfs`` by position
            (surplus items of the longer list are ignored)
    """
    # Pair by position instead of through a dict keyed by caption, so tables
    # that share a caption are all rendered rather than collapsing into one.
    tables = [
        df.style.set_table_attributes("style='display:inline'")
        .set_caption(caption)
        ._repr_html_()
        + "\xa0\xa0\xa0"  # non-breaking spaces separate neighbouring tables
        for caption, df in zip(captions, dfs)
    ]
    display(HTML("".join(tables)))

In [None]:
# Show the size table and the metrics table of both runs next to each other.
display_side_by_side([df_sizes, df_efc_metrics], ['Sizes of X and Y, Benign and All Samples', 'Metrics'])

## EFC Including Only Labeled Samples

### Only Benign Samples

In [None]:
# Second experiment family: rows with the "unknown" class are filtered out.
only_labeled = True

X_train_df, X_test_df, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# Collect the index *labels* of the abnormal (class 1) training samples.
# DataFrame.drop / Series.drop match against labels, whereas np.where() would
# return positions; the two only coincide while the training rows form an
# unbroken 0..n-1 prefix of the index.
idx_abnormal = y_train.index[y_train == 1]

In [None]:
# NOTE: drop() interprets idx_abnormal as index labels, not row positions.
X_train_df.drop(idx_abnormal, axis=0, inplace=True)  # remove abnormal samples from training (EFC trains with only benign instances)

In [None]:
y_train.drop(idx_abnormal, axis=0, inplace=True)  # remove the corresponding abnormal training targets

In [None]:
# Split sizes after the abnormal rows were removed from the training data,
# plus the class-1 (malicious) and class-0 (benign) label counts.
sizes = {
    "X Size": len(X_train_df) + len(X_test_df),
    "y Size": len(y_train) + len(y_test),
    "X_train Size": len(X_train_df),
    "X_test Size": len(X_test_df),
    "y_train Size": len(y_train),
    "y_test Size": len(y_test),
    "y_train Malicious Size": int(np.count_nonzero(y_train == 1)),
    "y_train Bening Size": int(np.count_nonzero(y_train == 0)),
    "y_test Malicious Size": int(np.count_nonzero(y_test == 1)),
    "y_test Bening Size": int(np.count_nonzero(y_test == 0)),
}

In [None]:
# Size table for the labelled-only experiments; row 0 is the benign-only run.
df_sizes_1 = pd.DataFrame(data=sizes, index=[0])

In [None]:
df_sizes_1

In [None]:
# Fresh classifier for this run.
clf = EnergyBasedFlowClassifier(n_bins=10, cutoff_quantile=0.95)

In [None]:
# Class 0 (benign) is passed as the base class.
clf.fit(X_train_df, y_train, base_class=0)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test_df, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)

plt.show()

In [None]:
# Score the benign-only run on labelled data.
model_score_2 = calculate_model_score(y_true=y_test.values, y_pred=y_pred)

In [None]:
model_score_2

In [None]:
# Metrics table for the labelled-only experiments; row 0 is the benign-only run.
df_efc_metrics_1 = pd.DataFrame(data=model_score_2, index=[0])

In [None]:
df_efc_metrics_1

### All Samples

In [None]:
# Reload the labelled-only data so the training split contains the abnormal
# rows again (they were dropped in place for the previous run).
only_labeled = True

X_train_df, X_test_df, y_train, y_test = run_elliptic_preprocessing_pipeline(
    last_train_time_step=last_train_time_step,
    last_time_step=last_time_step,
    only_labeled=only_labeled,
)

In [None]:
# Output folder for saved figures, relative to the current working directory.
results_efc = 'results/efc'

In [None]:
# Split sizes of the freshly loaded labelled data, plus the class-1
# (malicious) and class-0 (benign) label counts.
sizes = {
    "X Size": len(X_train_df) + len(X_test_df),
    "y Size": len(y_train) + len(y_test),
    "X_train Size": len(X_train_df),
    "X_test Size": len(X_test_df),
    "y_train Size": len(y_train),
    "y_test Size": len(y_test),
    "y_train Malicious Size": int(np.count_nonzero(y_train == 1)),
    "y_train Bening Size": int(np.count_nonzero(y_train == 0)),
    "y_test Malicious Size": int(np.count_nonzero(y_test == 1)),
    "y_test Bening Size": int(np.count_nonzero(y_test == 0)),
}

In [None]:
# Row 1 of the labelled-only size table: the run trained on all samples.
df_sizes_1.loc[1] = sizes

In [None]:
df_sizes_1

In [None]:
# Fresh classifier, fitted on the full labelled training split.
clf = EnergyBasedFlowClassifier(n_bins=10, cutoff_quantile=0.95)

In [None]:
# Class 0 (benign) is passed as the base class.
clf.fit(X_train_df, y_train, base_class=0)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test_df, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff; the figure is written to disk.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)
# savefig() does not create missing folders, so make sure the target exists.
os.makedirs(f'{results_efc}/experiment_2', exist_ok=True)
plt.savefig(f'{results_efc}/experiment_2/1_labeled_samples.png')

In [None]:
# Score the all-samples run on labelled data.
model_score_3 = calculate_model_score(y_true=y_test.values, y_pred=y_pred)

In [None]:
# Row 1 of the labelled-only metrics table: the run trained on all samples.
df_efc_metrics_1.loc[1] = model_score_3

In [None]:
df_efc_metrics_1

In [None]:
# Only row 1 (the all-samples run) of each table is shown; .loc[1] returns a
# Series, hence the DataFrame(...).transpose() round trip to get a one-row table.
display_side_by_side([pd.DataFrame(df_sizes_1.loc[1]).transpose(), pd.DataFrame(df_efc_metrics_1.loc[1]).transpose()], ['Sizes of X and Y, Only Labeled Samples', 'Metrics'])

In [None]:
pd.DataFrame(df_sizes_1.loc[1]).transpose()

In [None]:
pd.DataFrame(df_efc_metrics_1.loc[1]).transpose()

### Network Packages Flow

In [None]:
# NOTE: this rebinds the name train_test_split, shadowing the helper of the
# same name defined earlier in this notebook; run_elliptic_preprocessing_pipeline
# calls that helper, so re-running the Elliptic cells after this import would
# hit the scikit-learn function instead.
from sklearn.model_selection import train_test_split

In [None]:
# Load the second data set from the current working directory.
data = pd.read_csv('./cleaned_file.csv')

In [None]:
data

In [None]:
# Select features and target variable
X = data.drop(['hash','malicious'], axis=1)
y = data['malicious']
#z = data['hash']
#z_list = z.values.tolist()
y_list = y.values.tolist()  # not used by any of the cells visible in this notebook

In [None]:
# Splitting train and test sets: shuffled, stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=139, stratify=y, shuffle=True, test_size=0.3
)

In [None]:
# This run uses 30 bins instead of the 10 used elsewhere in the notebook.
clf = EnergyBasedFlowClassifier(n_bins=30, cutoff_quantile=0.95)

In [None]:
# NOTE(review): base_class=1 here, whereas the Elliptic runs pass 0 and the
# plot below treats class 1 as malicious — confirm which class is meant to be
# the base class.
clf.fit(X_train, y_train, base_class=1)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)

plt.show()

In [None]:
# Score the run trained on all samples of the second data set.
metrics = calculate_model_score(y_true=y_test, y_pred=y_pred)

In [None]:
metrics

In [None]:
# Select features and target variable (rebuilt from `data` for the next run)
X = data.drop(['hash','malicious'], axis=1)
y = data['malicious']
#z = data['hash']
#z_list = z.values.tolist()
y_list = y.values.tolist()  # not used by any of the cells visible in this notebook

In [None]:
# Same shuffled, stratified 70/30 split as before (identical seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=139, stratify=y, shuffle=True, test_size=0.3
)

In [None]:
# Collect the index *labels* of the abnormal (class 1) training samples.
# The split above shuffles the rows, so the positions np.where() returns do
# not correspond to the index labels that DataFrame.drop / Series.drop expect;
# taking the labels from y_train.index removes exactly the intended rows.
idx_abnormal = y_train.index[y_train == 1.0]

In [None]:
idx_abnormal

In [None]:
# NOTE: drop() interprets idx_abnormal as index labels, not row positions.
X_train.drop(idx_abnormal, axis=0, inplace=True)  # remove abnormal samples from training (EFC trains with only benign instances)

In [None]:
y_train.drop(idx_abnormal, axis=0, inplace=True)  # remove the corresponding abnormal training targets

In [None]:
y_train

In [None]:
# Fresh classifier for the benign-only run on the second data set.
clf = EnergyBasedFlowClassifier(n_bins=10, cutoff_quantile=0.95)

In [None]:
# NOTE(review): the cells above remove the y_train == 1.0 rows, so
# base_class=1 names a class that should no longer be present in y_train —
# confirm whether base_class=0 was intended.
clf.fit(X_train, y_train, base_class=1)

In [None]:
# Predict the test split and also keep the per-sample energies for plotting.
y_pred, y_energies = clf.predict(X_test, return_energies=True)

In [None]:
# Plotting energies: histogram of the test-set energies per true label,
# together with the classifier cutoff.
# np.where(...)[0] yields row positions, used here to index y_energies
# (assumes y_energies is a NumPy array — TODO confirm).
benign = np.where(y_test == 0)[0]
malicious = np.where(y_test == 1)[0]

benign_energies = y_energies[benign]
malicious_energies = y_energies[malicious]
cutoff = clf.estimators_[0].cutoff_  # presumably the fitted energy threshold — confirm against the efc docs

# Bin edges are computed once over all test energies so both histograms share them.
bins = np.histogram(y_energies, bins=60)[1]

plt.hist(
    malicious_energies,
    bins,
    facecolor="#006680",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="malicious",
)
plt.hist(
    benign_energies,
    bins,
    facecolor="#b3b3b3",
    alpha=0.7,
    ec="white",
    linewidth=0.3,
    label="benign",
)
plt.axvline(cutoff, color="r", linestyle="dashed", linewidth=1)  # dashed red line marks the cutoff
plt.legend()

plt.xlabel("Energy", fontsize=12)
# NOTE(review): hist() is called without density=True, so the bars are raw
# counts although the axis is labelled "Density".
plt.ylabel("Density", fontsize=12)

plt.show()

In [None]:
# Score the benign-only run on the second data set.
metrics = calculate_model_score(y_true=y_test, y_pred=y_pred)

In [None]:
metrics

### Investigating f1_macro

In [None]:
# NOTE(review): despite its name this uses average="weighted", so it is the
# same computation as `f1` below — confirm what was meant to be compared.
f1_macro_weighted = f1_score(y_true=y_test.values, y_pred=y_pred, average="weighted")

In [None]:
f1_macro_weighted

In [None]:
# Weighted-average F1.
f1 = f1_score(y_true=y_test.values, y_pred=y_pred, average="weighted")

In [None]:
f1

In [None]:
# Micro-average F1.
f1_micro = f1_score(y_true=y_test.values, y_pred=y_pred, average="micro")

In [None]:
f1_micro

In [None]:
# Macro-average F1.
f1_macro = f1_score(y_true=y_test.values, y_pred=y_pred, average="macro")

In [None]:
f1_macro