# Experiment x-5
Dealing with unbalanced dataset

In [9]:
from math import ceil
from pprint import pprint

In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import auc, classification_report, confusion_matrix, f1_score, roc_curve

In [11]:
from efc import EnergyBasedFlowClassifier

In [12]:
from research_aml_elliptic.src.reaml.model_performance import calculate_model_score
from research_aml_elliptic.src.experiments.general_functions.elliptic_data_preprocessing import run_elliptic_preprocessing_pipeline

Root directory:  /Users/kevinaraujo/repos/dissertation/PPCA-UnB-Dissertation/models/notebooks/research_aml_elliptic


In [13]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold

In [14]:
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score

In [15]:
import seaborn as sns

## Common Functions

In [16]:
def calculate_model_score(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
        "f1_micro": f1_score(y_true, y_pred, average="micro"),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "precision": precision_score(y_true, y_pred, average="weighted"),
        "recall": recall_score(y_true, y_pred, average="weighted"),
    }

In [17]:
def train_test_from_splitted(X_train, y_train, X_test, y_test, return_df=False):
    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    df = pd.concat([df_train, df_test])
    X = df.drop(['class'], axis=1)
    y = df['class']

    if return_df:
        df_train = pd.concat([X_train, y_train], axis=1)
        df_test = pd.concat([X_test, y_test], axis=1)
        df_new = pd.concat([df_train, df_test])
        return df_new

    return X, y  

In [18]:
def recreate_original_df():
    last_time_step = 49
    last_train_time_step = 34
    only_labeled = True
    X_train, X_test, y_train, y_test = run_elliptic_preprocessing_pipeline(last_train_time_step=last_train_time_step,
                                                                             last_time_step=last_time_step,
                                                                             only_labeled=only_labeled)
    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    df = pd.concat([df_train, df_test])

    return df

In [19]:
from sklearn.model_selection import train_test_split as sklearn_train_test_split

def train_test_from_x_y(X, y, last_train_time_step, last_time_step):
    X_train_df, X_test_df, y_train, y_test = sklearn_train_test_split(X, y, random_state=139, stratify=y, shuffle=True, test_size=0.3)

    return X_train_df, X_test_df, y_train, y_test

## Dataset

In [20]:
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [46]:
ROOT_DIR = os.getcwd()
sys.path.insert(0, ROOT_DIR)

In [47]:
def combine_dataframes(df_classes, df_features, only_labeled=True):
    df_combined = pd.merge(
        df_features, df_classes, left_on="id", right_on="txId", how="left"
    )
    if only_labeled == True:
        df_combined = df_combined[df_combined["class"] != 2].reset_index(drop=True)
    df_combined.drop(columns=["txId"], inplace=True)

    return df_combined

In [48]:
def rename_classes(df_classes):
    df_classes.replace({"class": {"1": 1, "2": 0, "unknown": 2}}, inplace=True)

    return df_classes

In [49]:
def rename_features(df_features):
    df_features.columns = (
        ["id", "time_step"]
        + [f"trans_feat_{i}" for i in range(93)]
        + [f"agg_feat_{i}" for i in range(72)]
    )

    return df_features

In [50]:
def import_elliptic_data_from_csvs():
    df_classes = pd.read_csv(
        os.path.join(ROOT_DIR, "efc/datasets/elliptic/elliptic_txs_classes.csv")
    )
    df_edges = pd.read_csv(
        os.path.join(ROOT_DIR, "efc/datasets/elliptic/elliptic_txs_edgelist.csv")
    )
    df_features = pd.read_csv(
        os.path.join(ROOT_DIR, "efc/datasets/elliptic/elliptic_txs_features.csv"),
        header=None,
    )

    return df_classes, df_edges, df_features

In [51]:
def setup_train_test_idx(
    X, last_train_time_step, last_time_step, aggregated_timestamp_column="time_step"
):
    """The aggregated_time_step_column needs to be a column with integer values, such as year, month or day"""

    split_timesteps = {}

    split_timesteps["train"] = list(range(last_train_time_step + 1))
    split_timesteps["test"] = list(range(last_train_time_step + 1, last_time_step + 1))

    train_test_idx = {}
    train_test_idx["train"] = X[
        X[aggregated_timestamp_column].isin(split_timesteps["train"])
    ].index
    train_test_idx["test"] = X[
        X[aggregated_timestamp_column].isin(split_timesteps["test"])
    ].index

    return train_test_idx

In [52]:
def train_test_split(X, y, train_test_idx):
    X_train_df = X.loc[train_test_idx["train"]]
    X_test_df = X.loc[train_test_idx["test"]]

    y_train = y.loc[train_test_idx["train"]]
    y_test = y.loc[train_test_idx["test"]]

    return X_train_df, X_test_df, y_train, y_test

In [53]:
def load_elliptic_data(only_labeled=True, drop_node_id=True):
    df_classes, df_edges, df_features = import_elliptic_data_from_csvs()
    df_features = rename_features(df_features)
    df_classes = rename_classes(df_classes)
    df_combined = combine_dataframes(df_classes, df_features, only_labeled)

    if drop_node_id == True:
        X = df_combined.drop(columns=["id", "class"])
    else:
        X = df_combined.drop(columns="class")

    y = df_combined["class"]

    return X, y

In [54]:
def load_elliptic_data(only_labeled=True, drop_node_id=True):
    df_classes, df_edges, df_features = import_elliptic_data_from_csvs()
    df_features = rename_features(df_features)
    df_classes = rename_classes(df_classes)
    df_combined = combine_dataframes(df_classes, df_features, only_labeled)

    if drop_node_id == True:
        X = df_combined.drop(columns=["id", "class"])
    else:
        X = df_combined.drop(columns="class")

    y = df_combined["class"]

    return X, y

In [55]:
def run_elliptic_preprocessing_pipeline(
    last_train_time_step, last_time_step, only_labeled=True, drop_node_id=True,
    only_x_y=False
):
    X, y = load_elliptic_data(only_labeled, drop_node_id)
    if only_x_y:
        return X, y
    train_test_idx = setup_train_test_idx(X, last_train_time_step, last_time_step)
    X_train_df, X_test_df, y_train, y_test = train_test_split(X, y, train_test_idx)

    return X_train_df, X_test_df, y_train, y_test

## Techniques

### Random Under-Sampling

In [56]:
# Import Elliptic data set and set variables
last_time_step = 49
last_train_time_step = 34
only_labeled = True

In [57]:
# '1': 1, -> class1 (illicit)
# '2': 0, -> class2 (licit)
# 'unknown': 2 -> dropped
X_train_1, X_test_1, y_train_1, y_test_1 = run_elliptic_preprocessing_pipeline(last_train_time_step=last_train_time_step,
                                                                               last_time_step=last_time_step,
                                                                               only_labeled=only_labeled)

  df_classes.replace({"class": {"1": 1, "2": 0, "unknown": 2}}, inplace=True)


In [58]:
df_train_1 = pd.concat([X_train_1, y_train_1], axis=1)

In [59]:
df_test_1 = pd.concat([X_test_1, y_test_1], axis=1)

In [60]:
df_train_1

Unnamed: 0,time_step,trans_feat_0,trans_feat_1,trans_feat_2,trans_feat_3,trans_feat_4,trans_feat_5,trans_feat_6,trans_feat_7,trans_feat_8,...,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71,class
0,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792,0
1,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,-0.144554,...,-0.613614,0.241128,0.241406,0.604120,0.008632,-0.131155,0.333211,-0.120613,-0.119792,0
2,1,-0.147852,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.137933,-0.144108,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0
3,1,-0.151357,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.141519,-0.147643,...,-0.582077,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0
4,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.029140,0.242712,-0.163640,-0.169115,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.054450,-1.760926,-1.760984,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29889,34,-0.172968,-0.071395,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163627,-0.169442,...,1.266002,1.461330,1.461369,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,0
29890,34,-0.172924,-0.107411,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163583,-0.169398,...,0.534340,0.241128,0.241406,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,1
29891,34,-0.172897,-0.070152,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.163555,-0.169371,...,-0.146863,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0
29892,34,-0.155367,-0.081852,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,-0.145619,-0.151686,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0


In [61]:
df_test_1

Unnamed: 0,time_step,trans_feat_0,trans_feat_1,trans_feat_2,trans_feat_3,trans_feat_4,trans_feat_5,trans_feat_6,trans_feat_7,trans_feat_8,...,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71,class
29894,35,-0.172982,-0.055242,-1.201369,-0.121970,-0.024025,-0.113002,-0.061584,-0.163642,-0.169456,...,-0.626229,0.241128,0.241406,-0.216057,-0.125939,-0.131155,-0.269818,-0.120613,-0.119792,0
29895,35,-0.166832,-0.115508,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.157351,-0.163254,...,-0.575769,-0.979074,-0.978556,0.018279,-0.049041,-0.038193,-0.011377,-1.760926,-1.760984,0
29896,35,-0.167233,-0.115086,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.157761,-0.163658,...,0.956938,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,0
29897,35,-0.172509,-0.120473,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,-0.163159,-0.168980,...,-0.550539,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,1
29898,35,-0.172805,-0.112290,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163461,-0.169278,...,0.004515,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46559,49,-0.159293,-0.037276,1.018602,-0.121970,0.035526,-0.113002,-0.061584,-0.149635,-0.155646,...,1.408971,0.231244,-0.388216,-0.098889,1.931078,3.168259,3.707301,-1.390548,-1.214035,0
46560,49,-0.172962,-0.126566,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163622,-0.169437,...,0.647874,0.241128,0.241406,10.914916,1.700384,-0.131155,7.914145,-0.120613,-0.119792,0
46561,49,-0.170412,-0.078164,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163631,-0.167106,...,1.606604,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,1
46562,49,-0.093732,-0.116160,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.082559,-0.089510,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0


In [62]:
df_train_1['class'].value_counts()

class
0    26432
1     3462
Name: count, dtype: int64

In [63]:
df_test_1['class'].value_counts()

class
0    15587
1     1083
Name: count, dtype: int64

In [65]:
df_1 = pd.concat([df_train_1, df_test_1])

In [66]:
df_1

Unnamed: 0,time_step,trans_feat_0,trans_feat_1,trans_feat_2,trans_feat_3,trans_feat_4,trans_feat_5,trans_feat_6,trans_feat_7,trans_feat_8,...,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71,class
0,1,0.163054,1.963790,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,-0.613614,0.241128,0.241406,1.072793,0.085530,-0.131155,0.677799,-0.120613,-0.119792,0
1,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,-0.144554,...,-0.613614,0.241128,0.241406,0.604120,0.008632,-0.131155,0.333211,-0.120613,-0.119792,0
2,1,-0.147852,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.137933,-0.144108,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0
3,1,-0.151357,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.141519,-0.147643,...,-0.582077,-0.979074,-0.978556,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0
4,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.029140,0.242712,-0.163640,-0.169115,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.054450,-1.760926,-1.760984,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46559,49,-0.159293,-0.037276,1.018602,-0.121970,0.035526,-0.113002,-0.061584,-0.149635,-0.155646,...,1.408971,0.231244,-0.388216,-0.098889,1.931078,3.168259,3.707301,-1.390548,-1.214035,0
46560,49,-0.172962,-0.126566,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163622,-0.169437,...,0.647874,0.241128,0.241406,10.914916,1.700384,-0.131155,7.914145,-0.120613,-0.119792,0
46561,49,-0.170412,-0.078164,1.018602,0.028105,-0.043875,0.054722,-0.061584,-0.163631,-0.167106,...,1.606604,1.461330,1.461369,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,1
46562,49,-0.093732,-0.116160,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,-0.082559,-0.089510,...,-0.613614,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,0


In [68]:
df_1 = df_1.sample(frac=1)

In [69]:
df_1

Unnamed: 0,time_step,trans_feat_0,trans_feat_1,trans_feat_2,trans_feat_3,trans_feat_4,trans_feat_5,trans_feat_6,trans_feat_7,trans_feat_8,...,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71,class
43336,45,-0.171848,1.039881,1.573595,0.703443,0.015676,0.809483,-0.061584,-0.163635,-0.168727,...,0.183646,-0.363136,0.053658,-0.098889,0.008632,0.040258,-0.063065,0.973431,0.614171,0
29417,34,0.025870,-0.104657,0.463609,-0.121970,-0.043875,-0.113002,-0.061584,0.039812,0.031146,...,-0.367624,1.461330,1.461369,-0.098889,-0.087490,-0.084674,-0.140597,-1.760926,-1.760984,0
15229,15,-0.080569,-0.035653,1.018602,-0.046932,0.055376,-0.029140,-0.061584,-0.130656,-0.108756,...,0.188332,-0.515217,-0.418060,-0.098889,0.008632,0.009390,-0.072910,0.767232,0.260070,0
6290,5,-0.129950,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,-0.119616,-0.126048,...,-0.575769,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,0
38951,42,-0.126286,0.824849,-1.201369,0.328255,-0.043875,-0.113002,1.764193,-0.160979,-0.154667,...,0.023437,-0.979074,-0.978556,-0.098889,0.008632,0.147731,0.074770,-1.760926,-1.760984,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22990,24,-0.171862,-0.158783,-1.201369,-0.046932,-0.043875,-0.029140,-0.061584,-0.163603,-0.168371,...,0.017130,-0.979074,-0.978556,-0.098889,-0.087490,-0.084674,-0.140597,1.519700,1.521399,0
1437,1,-0.172974,-0.184668,-1.201369,-0.046932,-0.024025,-0.029140,-0.061584,-0.163644,-0.169453,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.093204,-0.068808,1.299939,1.301521,0
45533,47,-0.157036,0.231204,1.573595,0.028105,-0.024025,0.054722,-0.061584,-0.163639,-0.156658,...,1.429996,0.171515,-0.368575,-0.098889,-0.068266,-0.065421,-0.097524,0.699543,0.700804,0
26531,30,-0.172776,-0.124014,1.018602,-0.121970,-0.063725,-0.113002,-0.061584,-0.163432,-0.169250,...,1.076779,0.241128,0.241406,0.018279,-0.087490,-0.131155,-0.097524,-0.120613,-0.119792,1


In [70]:
df_1['class'].value_counts()

class
0    42019
1     4545
Name: count, dtype: int64

In [72]:
# amount of ilicit classes 4545 rows.
ilicit_df_1 = df_1.loc[df_1['class'] == 1]
licit_df_1 = df_1.loc[df_1['class'] == 0][:4545]

In [73]:
normal_distributed_df_1 = pd.concat([ilicit_df_1, licit_df_1])

In [74]:
# Shuffle dataframe rows
new_df_1 = normal_distributed_df_1.sample(frac=1, random_state=42)

In [75]:
new_df_1.head()

Unnamed: 0,time_step,trans_feat_0,trans_feat_1,trans_feat_2,trans_feat_3,trans_feat_4,trans_feat_5,trans_feat_6,trans_feat_7,trans_feat_8,...,agg_feat_63,agg_feat_64,agg_feat_65,agg_feat_66,agg_feat_67,agg_feat_68,agg_feat_69,agg_feat_70,agg_feat_71,class
8415,7,-0.172023,-0.158783,-1.201369,-0.12197,-0.063725,-0.113002,-0.061584,-0.162663,-0.168491,...,-0.487465,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,1
14135,13,-0.172795,-0.158783,-1.201369,-0.12197,-0.063725,-0.113002,-0.061584,-0.163452,-0.169269,...,0.092819,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,1
17905,19,-0.170162,-0.154392,1.018602,-0.12197,-0.063725,-0.113002,-0.061584,-0.160758,-0.166613,...,-0.437006,0.241128,0.241406,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,1
39051,42,-0.172832,0.09427,1.018602,-0.12197,-0.063725,-0.113002,-0.061584,-0.163485,-0.169301,...,0.04236,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,1
28548,32,-0.17277,-0.13122,1.018602,-0.12197,-0.063725,-0.113002,-0.061584,-0.163426,-0.169243,...,0.105434,0.241128,0.241406,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,1


### Equally Distributing and Correlating