# Illicit Transaction Detection Using the Elliptic++ Dataset
# Description: This notebook builds machine learning models to classify Bitcoin transactions as illicit or licit.
# Dataset: Elliptic, 203k transactions with 184 features, 822k wallet addresses with 56 features, labels: 1 (illicit), 2 (licit), 3 (unknown)

**Note**: There are two Classifications done, one for transaction ids and one for wallet addresses, this was done due to increase the detection chances, the actors dataset is the wallet addresses dataset

**Import necessary libraries**
**pandas for data manipulation, matplotlib/seaborn for plotting, sklearn for ML models**


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import plotly.graph_objs as go 
import plotly.offline as py 
import math
import eli5
from collections import Counter

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone 
import eli5
from eli5.sklearn import PermutationImportance

import xgboost as xgb

# Transaction IDS Classification

**Load the transactions dataset with 183 features and label information**

In [3]:
print("\nTransaction features: \n")
df_txs_features = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/txs_features - Copy.csv")
df_txs_features

print("\nTransaction classes: \n")
df_txs_classes = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/txs_classes.csv")
df_txs_classes

print("\nTransaction-Transaction edgelist: \n")
df_txs_edgelist = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/txs_edgelist.csv")
df_txs_edgelist


Transaction features: 



Unnamed: 0,txId,Time step,class,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total
0,3321,1,3,-0.169615,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.534072,0.534072,0.534072,0.534072,0.534072,1.668990e-01,0.367074,0.266986,0.266986,0.533972
1,11108,1,3,-0.137586,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,5.611878,5.611878,5.611878,5.611878,5.611878,5.861940e-01,5.025584,2.805889,2.805889,5.611778
2,51816,1,3,-0.170103,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.456608,0.456608,0.456608,0.456608,0.456608,2.279902e-01,0.228518,0.228254,0.228254,0.456508
3,68869,1,2,-0.114267,-0.184668,-1.201369,0.028105,-0.043875,-0.113002,0.547008,...,0.308900,8.000000,3.102967,1.000000,9.308900,1.229000e+00,8.079800,4.654400,4.654400,9.308800
4,89273,1,2,5.202107,-0.210553,-1.756361,-0.121970,260.090707,-0.113002,-0.061584,...,852.164680,852.164680,852.164680,852.164680,852.164680,1.300000e-07,41.264036,0.065016,0.000441,852.164680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203764,158304003,49,3,-0.165622,-0.139563,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,,,,,,,,,,
203765,158303998,49,3,-0.167040,-0.139563,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,,,,,,,,,,
203766,158303966,49,3,-0.167040,-0.139563,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,,,,,,,,,,
203767,161526077,49,3,-0.172212,-0.139573,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,,,,,,,,,,



Transaction classes: 



Unnamed: 0,txId,class
0,3321,3
1,11108,3
2,51816,3
3,68869,2
4,89273,2
...,...,...
203764,158304003,3
203765,158303998,3
203766,158303966,3
203767,161526077,3



Transaction-Transaction edgelist: 



Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206
...,...,...
234350,158365409,157930723
234351,188708874,188708879
234352,157659064,157659046
234353,87414554,106877725


**General shape of the dataset**

In [4]:
print("\ntxs_features.csv for txId = 272145560\n")
df_txs_features[df_txs_features['txId']==272145560]

print("\ntxs_classes.csv for txId = 272145560\n")
df_txs_classes[df_txs_classes['txId']==272145560]

print("\ntxs_edgelist.csv for txId = 272145560\n")
df_txs_edgelist[(df_txs_edgelist['txId1']==272145560) | (df_txs_edgelist['txId2']==272145560)]


txs_features.csv for txId = 272145560



Unnamed: 0,txId,Time step,class,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total
105573,272145560,24,1,-0.155493,-0.107012,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,2.7732,2.7732,2.7732,2.7732,2.7732,0.001917,2.770883,1.3864,1.3864,2.7728



txs_classes.csv for txId = 272145560



Unnamed: 0,txId,class
105573,272145560,1



txs_edgelist.csv for txId = 272145560



Unnamed: 0,txId1,txId2
123072,272145560,296926618
123272,272145560,272145556
125873,299475624,272145560


In [5]:
list(df_txs_features.columns)

['txId',
 'Time step',
 'class',
 'Local_feature_1',
 'Local_feature_2',
 'Local_feature_3',
 'Local_feature_4',
 'Local_feature_5',
 'Local_feature_6',
 'Local_feature_7',
 'Local_feature_8',
 'Local_feature_9',
 'Local_feature_10',
 'Local_feature_11',
 'Local_feature_12',
 'Local_feature_13',
 'Local_feature_14',
 'Local_feature_15',
 'Local_feature_16',
 'Local_feature_17',
 'Local_feature_18',
 'Local_feature_19',
 'Local_feature_20',
 'Local_feature_21',
 'Local_feature_22',
 'Local_feature_23',
 'Local_feature_24',
 'Local_feature_25',
 'Local_feature_26',
 'Local_feature_27',
 'Local_feature_28',
 'Local_feature_29',
 'Local_feature_30',
 'Local_feature_31',
 'Local_feature_32',
 'Local_feature_33',
 'Local_feature_34',
 'Local_feature_35',
 'Local_feature_36',
 'Local_feature_37',
 'Local_feature_38',
 'Local_feature_39',
 'Local_feature_40',
 'Local_feature_41',
 'Local_feature_42',
 'Local_feature_43',
 'Local_feature_44',
 'Local_feature_45',
 'Local_feature_46',
 'Local_fe

# Data Preprocessing

**Check for null values**

In [6]:
df_txs_features = df_txs_features.dropna()
df_txs_features

Unnamed: 0,txId,Time step,class,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total
0,3321,1,3,-0.169615,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.534072,0.534072,0.534072,0.534072,0.534072,1.668990e-01,0.367074,0.266986,0.266986,0.533972
1,11108,1,3,-0.137586,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,5.611878,5.611878,5.611878,5.611878,5.611878,5.861940e-01,5.025584,2.805889,2.805889,5.611778
2,51816,1,3,-0.170103,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,0.456608,0.456608,0.456608,0.456608,0.456608,2.279902e-01,0.228518,0.228254,0.228254,0.456508
3,68869,1,2,-0.114267,-0.184668,-1.201369,0.028105,-0.043875,-0.113002,0.547008,...,0.308900,8.000000,3.102967,1.000000,9.308900,1.229000e+00,8.079800,4.654400,4.654400,9.308800
4,89273,1,2,5.202107,-0.210553,-1.756361,-0.121970,260.090707,-0.113002,-0.061584,...,852.164680,852.164680,852.164680,852.164680,852.164680,1.300000e-07,41.264036,0.065016,0.000441,852.164680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202799,194747812,49,3,0.558398,-0.198956,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,...,115.952889,115.952889,115.952889,115.952889,115.952889,1.653300e+00,114.299544,57.976422,57.976422,115.952844
202800,194747925,49,3,0.547658,-0.198956,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,...,114.250098,114.250098,114.250098,114.250098,114.250098,2.035300e-02,114.229700,57.125027,57.125027,114.250053
202801,194748063,49,3,0.543600,-0.198853,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,...,113.606771,113.606771,113.606771,113.606771,113.606771,9.257490e-01,112.680977,56.803363,56.803363,113.606726
202802,194748070,49,3,0.537760,-0.198853,-0.091383,-0.121970,-0.043875,-0.113002,-0.061584,...,112.680977,112.680977,112.680977,112.680977,112.680977,3.026970e-01,112.378235,56.340466,56.340466,112.680932


**Filter out rows with unknown class (label = 3) as they can't be used for supervised learning**

In [7]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Only scale feature columns, not the label
for column in df_txs_features.columns[168:]:
    if column == 'Class':
        continue  # Don't scale the class label

    feature = np.array(df_txs_features[column]).reshape(-1, 1)
    scaler = MinMaxScaler()
    scaler.fit(feature)
    feature_scaled = scaler.transform(feature)
    df_txs_features[column] = feature_scaled.reshape(1, -1)[0]


**Removing 'unknown' transactions**

In [8]:
data = df_txs_features.loc[(df_txs_features['class'] != 3), 'txId']
df_txs_features_selected = df_txs_features.loc[df_txs_features['txId'].isin(data)]
df_txs_features_selected

Unnamed: 0,txId,Time step,class,Local_feature_1,Local_feature_2,Local_feature_3,Local_feature_4,Local_feature_5,Local_feature_6,Local_feature_7,...,in_BTC_min,in_BTC_max,in_BTC_mean,in_BTC_median,in_BTC_total,out_BTC_min,out_BTC_max,out_BTC_mean,out_BTC_median,out_BTC_total
3,68869,1,2,-0.114267,-0.184668,-1.201369,0.028105,-0.043875,-0.113002,0.547008,...,2.711586e-05,7.022548e-04,2.723834e-04,8.778200e-05,8.171503e-04,6.113009e-04,7.142783e-04,0.001552,1.552291e-03,8.171446e-04
4,89273,1,2,5.202107,-0.210553,-1.756361,-0.121970,260.090707,-0.113002,-0.061584,...,7.480472e-02,7.480472e-02,7.480472e-02,7.480472e-02,7.480472e-02,6.466160e-11,3.647866e-03,0.000022,1.451405e-07,7.480473e-02
11,293323,1,2,-0.172726,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,3.579195e-06,3.577994e-06,3.578002e-06,3.579195e-06,3.575571e-06,4.715323e-07,3.511341e-06,0.000007,6.780735e-06,3.569892e-06
22,1494462,1,2,-0.172921,-0.158783,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,8.778192e-07,8.766174e-07,8.766262e-07,8.778192e-07,8.741947e-07,1.442451e-06,6.094506e-07,0.000002,1.632382e-06,8.597370e-07
25,1582950,1,2,-0.169967,-0.184668,-1.201369,-0.121970,-0.043875,-0.113002,-0.061584,...,4.198409e-05,4.198289e-05,4.198290e-05,4.198409e-05,4.198047e-05,2.302948e-05,3.817869e-05,0.000080,7.973672e-05,4.197479e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202762,194334585,49,2,-0.039416,-0.118083,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,1.858865e-03,1.858864e-03,1.858864e-03,1.858865e-03,1.858862e-03,1.992449e-05,1.868443e-03,0.003531,3.531138e-03,1.858833e-03
202763,194334621,49,2,-0.050308,-0.112834,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,1.707288e-03,1.707287e-03,1.707287e-03,1.707288e-03,1.707285e-03,4.973969e-06,1.718449e-03,0.003243,3.243191e-03,1.707255e-03
202764,194335206,49,2,-0.154605,-0.116753,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,2.557984e-04,2.557972e-04,2.557972e-04,2.557984e-04,2.557948e-04,1.933202e-04,2.232165e-04,0.000486,4.858661e-04,2.557661e-04
202765,194335216,49,2,0.708000,-0.118083,1.018602,-0.121970,-0.043875,-0.113002,-0.061584,...,1.226060e-02,1.226060e-02,1.226060e-02,1.226060e-02,1.226060e-02,5.065905e-05,1.233830e-02,0.023291,2.329083e-02,1.226057e-02


**Goal: binary classification of 0,1**

**0: licit, 1: illicit**

In [9]:
X_data = df_txs_features_selected.loc[(df_txs_features_selected['Time step'] < 35) & (df_txs_features_selected['class'] != 3), 'txId']
X_training_timesteps = df_txs_features_selected.loc[df_txs_features_selected['txId'].isin(X_data)]
X_train = X_training_timesteps.drop(columns=['txId', 'class', 'Time step'])

X_data_test = df_txs_features_selected.loc[(df_txs_features_selected['Time step'] >= 35) & (df_txs_features_selected['class'] != 3), 'txId']
X_testing_timesteps = df_txs_features_selected.loc[df_txs_features_selected['txId'].isin(X_data_test)]
X_test = X_testing_timesteps.drop(columns=['txId', 'class', 'Time step'])

y_training_timesteps = X_training_timesteps[['class']]
y_training_timesteps = y_training_timesteps['class'].apply(lambda x: 0 if x == 2 else 1 ) # change illicit (class-2) to '0' for classification
y_train = y_training_timesteps

y_testing_timesteps = X_testing_timesteps[['class']]
y_testing_timesteps = y_testing_timesteps['class'].apply(lambda x: 0 if x == 2 else 1 ) # change illicit (class-2) to '0' for classification
y_test = y_testing_timesteps

# Machine Learning Models

**Initialize and train a Logistic Regression model**

**This is a simple baseline model**

In [10]:
# LOGISTIC REGRESSION (LR)
cLR = LogisticRegression(max_iter=1000).fit(X_train.values,y_train.values)
y_preds_LR = cLR.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_LR)

print("Logistic Regression")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_LR, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Logistic Regression
Precision: 0.324 
Recall: 0.704 
F1 Score: 0.443
Micro-Average F1 Score: 0.883


**Train a Random Forest Classifier for improved performance**

In [11]:
# RANDOM FOREST (RF)
cRF = RandomForestClassifier(n_estimators=50).fit(X_train.values,y_train.values)
y_preds_RF = cRF.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_RF)

print("Random Forest")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_RF, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Random Forest
Precision: 0.967 
Recall: 0.719 
F1 Score: 0.825
Micro-Average F1 Score: 0.980


**Train a Multi-Layer Perceptron model (neural network)**

In [12]:
# MULTILAYER PERCEPTRON (MLP)
cMLP = MLPClassifier(solver='adam', learning_rate_init=0.001, max_iter=200).fit(X_train.values,y_train.values)
y_preds_MLP = cMLP.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_MLP)

print("Multilayer Perceptron (MLP)")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_MLP, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Multilayer Perceptron (MLP)
Precision: 0.597 
Recall: 0.657 
F1 Score: 0.626
Micro-Average F1 Score: 0.948


**Train XGBoost for gradient boosting performance**

In [13]:
# XGBOOST (XGB)
cXGB = xgb.XGBClassifier(objective="multi:softmax", num_class=2, random_state=42)
cXGB.fit(X_train.values, y_train.values)
y_preds_XGB = cXGB.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_XGB)

print("XGBOOST")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_XGB, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))
#print(confusion_matrix(y, y_pred))

XGBOOST
Precision: 0.922 
Recall: 0.730 
F1 Score: 0.815
Micro-Average F1 Score: 0.978


**LightGBM Model**

In [14]:
# LIGHTGBM (LGBM)
import lightgbm as lgb
from sklearn.metrics import precision_recall_fscore_support, f1_score

# Initialize classifier with class_weight BEFORE fitting
cLGBM = lgb.LGBMClassifier(n_estimators=50, class_weight='balanced')
cLGBM.fit(X_train.values, y_train.values)

# Predict
y_preds_LGBM = cLGBM.predict(X_test.values)

# Compute metrics
prec, rec, f1, num = precision_recall_fscore_support(y_test.values, y_preds_LGBM)

print("LightGBM")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f" % (prec[1], rec[1], f1[1]))

micro_f1 = f1_score(y_test, y_preds_LGBM, average='micro')
print("Micro-Average F1 Score: %.3f" % (micro_f1))


[LightGBM] [Info] Number of positive: 3462, number of negative: 26237
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41951
[LightGBM] [Info] Number of data points in the train set: 29699, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


LightGBM
Precision: 0.608 
Recall: 0.740 
F1 Score: 0.667
Micro-Average F1 Score: 0.951



X does not have valid feature names, but LGBMClassifier was fitted with feature names



# Wallet Addresses Classification

**Load the wallet addresses dataset with 56 features and label information**

In [15]:
print("\nActors features: \n")
df_wallets_features = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/wallets_features.csv")
df_wallets_features

print("\nActors Classes: \n")
df_wallets_classes = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/wallets_classes.csv")
df_wallets_classes

print("\nAddress-Address edgelist: \n") #Actor Interaction graph edgelist
df_AddrAddr_edgelist = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/AddrAddr_edgelist.csv")
df_AddrAddr_edgelist

print("\nAddress-Transaction edgelist: \n") #Address-Transaction graph edgelist
df_AddrTx_edgelist = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/AddrTX_edgelist.csv")
df_AddrTx_edgelist

print("\nTransaction-Address edgelist: \n") #Address-Transaction graph edgelist
df_TxAddr_edgelist = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/TxAddr_edgelist.csv")
df_TxAddr_edgelist

# combined features and classes csv files for convenience
df_wallets_features_classes_combined = pd.read_csv("E:/Financial Data Analytics/Final Project/Dataset/wallets_features_classes_combined.csv")


Actors features: 



Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,25,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,0.0,0.0,0.000000,0.0,0.0,24.0,1.0,1.0,1.0,1.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,25,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
2,1111DAYXhoxZx2tsRnzimfozo783x1yC2,29,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
3,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
4,1111DAYXhoxZx2tsRnzimfozo783x1yC2,39,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268255,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,31,1.0,1.0,451684.0,451685.0,1.0,2.0,451685.0,451684.0,...,0.0,0.0,0.000000,0.0,0.0,3.0,1.0,1.0,1.0,1.0
1268256,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,44,0.0,1.0,477895.0,477895.0,0.0,1.0,0.0,477895.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1268257,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,9,0.0,1.0,407342.0,407342.0,0.0,1.0,0.0,407342.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1268258,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,3,0.0,1.0,395235.0,395235.0,0.0,1.0,0.0,395235.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0



Actors Classes: 



Unnamed: 0,address,class
0,111112TykSw72ztDN2WJger4cynzWYC5w,2
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3
2,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2
3,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3
4,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2
...,...,...
822937,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,3
822938,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,3
822939,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,3
822940,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,3



Address-Address edgelist: 



Unnamed: 0,input_address,output_address
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA
...,...,...
2868959,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh
2868960,3DzbpEogZ1mn9FgCHcmzYPLDbV9GuxYHpi,38jMiiZs2C5n5MPkyc5pSA7wwW6H4p6hPa
2868961,34yD1sQg6C16aANCtibYXRj5NsX6tt4v5R,3G9b7hWZccuft1V4eGUcZqTZaxsqx699bM
2868962,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2



Address-Transaction edgelist: 



Unnamed: 0,input_address,txId
0,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a,230325127
1,13Lhad3SAmu2vqYg2dxbNcxH7LE77kJu2w,230325139
2,1MAQQZn7EHP6J3erXByCciFiVcgS8ZhWqz,86875675
3,16zs5SVSyADh5WrLNbZbpRLsBsN5uEzgeK,230325147
4,1QJpwtUorBKPGUJkSyrRcBKTAHq4CXrdYh,230325154
...,...,...
477112,1HdnGvuc21Y4QfBEHUc3NFRJhGywdSFUb,157659046
477113,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh,157659306
477114,3DzbpEogZ1mn9FgCHcmzYPLDbV9GuxYHpi,157668825
477115,34yD1sQg6C16aANCtibYXRj5NsX6tt4v5R,125788182



Transaction-Address edgelist: 



Unnamed: 0,txId,output_address
0,230325127,1GASxu5nMntiRKdVtTVRvEbP965G51bhHH
1,230325127,14YRXHHof4BY1TVxN5FqYPcEdpmXiYT78a
2,230325139,1GFdrdgtG34GChM8SMpMwcXFc4nYbH1A5G
3,86875675,19q57SeCEzTnWrWVXA43nZzhSiXkYggh7c
4,86875675,1Kk1NVYnCE8ALXDhgMM6HqTt1jDSvi6QBA
...,...,...
837119,157659306,3MfN5to5K5be2RupWE8rjJHQ6V9L8ypWeh
837120,157668825,38jMiiZs2C5n5MPkyc5pSA7wwW6H4p6hPa
837121,125788182,3G9b7hWZccuft1V4eGUcZqTZaxsqx699bM
837122,157670868,1JERHCgwHG2Z7T3KjNpEwj3fJNX8vSfCX2


**General shape of the dataset**

In [16]:
print("\nwallets_features.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K\n")
df_wallets_features[df_wallets_features['address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K']

print("\nwallets_classes.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K\n")
df_wallets_classes[df_wallets_classes['address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K']

print("\nAddrAddr_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0\n")
df_AddrAddr_edgelist[(df_AddrAddr_edgelist['input_address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K') | (df_AddrAddr_edgelist['output_address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K')]

print("\nAddrTx_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0\n")
df_AddrTx_edgelist[df_AddrTx_edgelist['input_address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K']

print("\nTxAddr_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0\n")
df_TxAddr_edgelist[df_TxAddr_edgelist['output_address']=='39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K']


wallets_features.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K



Unnamed: 0,address,Time step,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
1149250,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,23,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149251,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,23,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149252,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,23,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149253,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,23,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149254,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,24,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1149666,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,32,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149667,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,32,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149668,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,32,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0
1149669,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,32,420.0,1.0,435559.0,453704.0,18145.0,421.0,435559.0,451682.0,...,0.0,0.0,0.0,0.0,17.0,495.0,1.0,3.0,1.046512,1.0



wallets_classes.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K



Unnamed: 0,address,class
745591,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,1



AddrAddr_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0



Unnamed: 0,input_address,output_address
1317522,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,131fHSzsK1eQJp7bC94JVWZqJ2JcdR5UfS
1317523,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
1317634,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,1FCw4XNAP6xeWMF4o3RaNENguE9qFumgpn
1317635,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
1317644,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
...,...,...
1592119,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
1592120,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
1592121,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,32ZkfqSGoxdx5UGo8YQp1tFhSv8R982Usk
1592122,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K



AddrTx_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0



Unnamed: 0,input_address,txId
235194,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,291338524
235250,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,88250674
235255,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,331553508
235256,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,331553509
239085,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,289228146
...,...,...
298647,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,355110135
298648,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,355110139
298649,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,343635736
298650,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K,355110144



TxAddr_edgelist.csv for address = 39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K0



Unnamed: 0,txId,output_address
431088,291338524,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
431200,88250674,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
431209,331553508,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
431212,331553509,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
436269,289228146,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
...,...,...
539069,355110135,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
539071,355110139,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
539073,343635736,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K
539074,355110144,39sfuA8pY4UfybgEZi7uvA13jkGzZpsg5K


In [17]:
df_wallets_features.columns

Index(['address', 'Time step', 'num_txs_as_sender', 'num_txs_as receiver',
       'first_block_appeared_in', 'last_block_appeared_in',
       'lifetime_in_blocks', 'total_txs', 'first_sent_block',
       'first_received_block', 'num_timesteps_appeared_in',
       'btc_transacted_total', 'btc_transacted_min', 'btc_transacted_max',
       'btc_transacted_mean', 'btc_transacted_median', 'btc_sent_total',
       'btc_sent_min', 'btc_sent_max', 'btc_sent_mean', 'btc_sent_median',
       'btc_received_total', 'btc_received_min', 'btc_received_max',
       'btc_received_mean', 'btc_received_median', 'fees_total', 'fees_min',
       'fees_max', 'fees_mean', 'fees_median', 'fees_as_share_total',
       'fees_as_share_min', 'fees_as_share_max', 'fees_as_share_mean',
       'fees_as_share_median', 'blocks_btwn_txs_total', 'blocks_btwn_txs_min',
       'blocks_btwn_txs_max', 'blocks_btwn_txs_mean', 'blocks_btwn_txs_median',
       'blocks_btwn_input_txs_total', 'blocks_btwn_input_txs_min',
       

# Data Preprocessing

**Combining the wallet features and wallet classes dataset**

**Dropping the null values**

In [18]:
df_wallets_classification = df_wallets_features_classes_combined
df_wallets_classification = df_wallets_classification.drop(columns=['Time step']).drop_duplicates()
df_wallets_classification
     

Unnamed: 0,address,class,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,2,0.0,1.0,439586.0,439586.0,0.0,1.0,0.0,439586.0,...,0.0,0.0,0.000000,0.0,0.0,24.0,1.0,1.0,1.0,1.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3,0.0,8.0,439589.0,485959.0,46370.0,8.0,0.0,439589.0,...,0.0,20164.0,6624.285714,8060.0,0.0,8.0,1.0,1.0,1.0,1.0
9,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2,0.0,1.0,431522.0,431522.0,0.0,1.0,0.0,431522.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0
10,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3,1.0,1.0,423456.0,423456.0,0.0,2.0,423456.0,423456.0,...,0.0,0.0,0.000000,0.0,0.0,4.0,1.0,1.0,1.0,1.0
12,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2,0.0,1.0,429513.0,429513.0,0.0,1.0,0.0,429513.0,...,0.0,0.0,0.000000,0.0,0.0,5.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268254,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,3,1.0,1.0,451684.0,451685.0,1.0,2.0,451685.0,451684.0,...,0.0,0.0,0.000000,0.0,0.0,3.0,1.0,1.0,1.0,1.0
1268256,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,3,0.0,1.0,477895.0,477895.0,0.0,1.0,0.0,477895.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1268257,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,3,0.0,1.0,407342.0,407342.0,0.0,1.0,0.0,407342.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1268258,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,3,0.0,1.0,395235.0,395235.0,0.0,1.0,0.0,395235.0,...,0.0,0.0,0.000000,0.0,0.0,1.0,1.0,1.0,1.0,1.0


**Scaling the dataset**

In [19]:
for column in df_wallets_classification.columns[2:]:
    feature = np.array(df_wallets_classification[column]).reshape(-1,1)
    scaler = MinMaxScaler()
    scaler.fit(feature)
    feature_scaled = scaler.transform(feature)
    df_wallets_classification[column] = feature_scaled.reshape(1,-1)[0]

df_wallets_classification

Unnamed: 0,address,class,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,2,0.000000,0.001825,0.499985,0.499985,0.000000,0.000000,0.000000,0.900837,...,0.0,0.000000,0.000000,0.000000,0.0,0.000608,0.0,0.0,0.0,0.0
1,1111DAYXhoxZx2tsRnzimfozo783x1yC2,3,0.000000,0.014599,0.500015,0.979168,0.479153,0.004762,0.000000,0.900843,...,0.0,0.208377,0.068456,0.083293,0.0,0.000185,0.0,0.0,0.0,0.0
9,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2,0.000000,0.001825,0.416657,0.416657,0.000000,0.000000,0.000000,0.884312,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
10,111218KKkh1JJFRHbwM16AwCiVCc4m7he1,3,0.000688,0.001825,0.333309,0.333309,0.000000,0.000680,0.867782,0.867782,...,0.0,0.000000,0.000000,0.000000,0.0,0.000079,0.0,0.0,0.0,0.0
12,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2,0.000000,0.001825,0.395898,0.395898,0.000000,0.000000,0.000000,0.880195,...,0.0,0.000000,0.000000,0.000000,0.0,0.000106,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268254,3R2Uw5MRdSSigp8AjfT7K5es6Hupm4qLSq,3,0.000688,0.001825,0.624996,0.625006,0.000010,0.000680,0.925631,0.925629,...,0.0,0.000000,0.000000,0.000000,0.0,0.000053,0.0,0.0,0.0,0.0
1268256,3R2VBFbqHGC4bQ7b4ixN4jZTdv7RMbEYtf,3,0.000000,0.001825,0.895841,0.895841,0.000000,0.000000,0.000000,0.979343,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1268257,3R2WFmRwbDeo3rMVVu5J3jjMxAuQYYWAid,3,0.000000,0.001825,0.166799,0.166799,0.000000,0.000000,0.000000,0.834760,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1268258,3R2WTZGYLmbJQyoDSBftJsPRvF1mSEtkh6,3,0.000000,0.001825,0.041695,0.041695,0.000000,0.000000,0.000000,0.809949,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


**Removing 'unknown' actors**

In [20]:
data = df_wallets_classification.loc[(df_wallets_classification['class'] != 3), 'address']
df_wallets_feature_selected = df_wallets_classification.loc[df_wallets_classification['address'].isin(data)]
df_wallets_feature_selected

Unnamed: 0,address,class,num_txs_as_sender,num_txs_as receiver,first_block_appeared_in,last_block_appeared_in,lifetime_in_blocks,total_txs,first_sent_block,first_received_block,...,blocks_btwn_output_txs_min,blocks_btwn_output_txs_max,blocks_btwn_output_txs_mean,blocks_btwn_output_txs_median,num_addr_transacted_multiple,transacted_w_address_total,transacted_w_address_min,transacted_w_address_max,transacted_w_address_mean,transacted_w_address_median
0,111112TykSw72ztDN2WJger4cynzWYC5w,2,0.000000,0.001825,0.499985,0.499985,0.000000,0.00000,0.00000,0.900837,...,0.0,0.0,0.0,0.0,0.0,0.000608,0.0,0.0,0.0,0.0
9,1111VHuXEzHaRCgXbVwojtaP7Co3QABb,2,0.000000,0.001825,0.416657,0.416657,0.000000,0.00000,0.00000,0.884312,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
12,1115LWW3xsD9jT9VRY7viCN9S34RVAAuA,2,0.000000,0.001825,0.395898,0.395898,0.000000,0.00000,0.00000,0.880195,...,0.0,0.0,0.0,0.0,0.0,0.000106,0.0,0.0,0.0,0.0
13,1117wASFaYgJJP6MiY8cPD5DMdQda8gDZ,2,0.000688,0.001825,0.083379,0.083400,0.000021,0.00068,0.81822,0.818216,...,0.0,0.0,0.0,0.0,0.0,0.000053,0.0,0.0,0.0,0.0
15,111ECNFEjJsQJ6LkAsAESBGTHBvZE66oX,2,0.000000,0.001825,0.333371,0.333371,0.000000,0.00000,0.00000,0.867794,...,0.0,0.0,0.0,0.0,0.0,0.000264,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268243,3R2KQxz2Uk7xF9bfDGffTn52NrfnBjdaLy,2,0.000000,0.001825,0.208360,0.208360,0.000000,0.00000,0.00000,0.843002,...,0.0,0.0,0.0,0.0,0.0,0.000079,0.0,0.0,0.0,0.0
1268244,3R2LAgPtnjeU2XdyR1GUXaTzKmh81VETYm,2,0.000000,0.001825,0.458331,0.458331,0.000000,0.00000,0.00000,0.892576,...,0.0,0.0,0.0,0.0,0.0,0.000106,0.0,0.0,0.0,0.0
1268251,3R2SZEK2UggNm1ntkXTe8eoea11sxg9vjw,2,0.000000,0.001825,0.312508,0.312508,0.000000,0.00000,0.00000,0.863657,...,0.0,0.0,0.0,0.0,0.0,0.000211,0.0,0.0,0.0,0.0
1268252,3R2TfX1PDN93SHbDXZqYPCn1cgmCeLMBra,2,0.000000,0.001825,0.479173,0.479173,0.000000,0.00000,0.00000,0.896710,...,0.0,0.0,0.0,0.0,0.0,0.000264,0.0,0.0,0.0,0.0


**Goal: binary classification of 0**

**0: licit, 1: illicit**

**Changing illicit (class-2) to '0' for classification**

In [21]:
y = df_wallets_feature_selected[['class']]
y = y['class'].apply(lambda x: 0 if x == 2 else 1 )

X_train, X_test, y_train, y_test = train_test_split(df_wallets_feature_selected,y,test_size=0.30,random_state=15, shuffle=False)
X_train = X_train.drop(columns=['address', 'class'])
X_test = X_test.drop(columns=['address', 'class'])
     

# Machine Learning Models

**LOGISTIC REGRESSION (LR)**

In [22]:
# LOGISTIC REGRESSION (LR)
cLR = LogisticRegression(max_iter=1000).fit(X_train.values,y_train.values)
y_preds_LR = cLR.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_LR)

print("Logistic Regression")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_LR, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Logistic Regression
Precision: 0.491 
Recall: 0.057 
F1 Score: 0.102
Micro-Average F1 Score: 0.964


**RANDOM FOREST (RF)**

In [23]:
# RANDOM FOREST (RF)
cRF = RandomForestClassifier(n_estimators=50).fit(X_train.values,y_train.values)
y_preds_RF = cRF.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_RF)

print("Random Forest 50 trees")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_RF, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Random Forest 50 trees
Precision: 0.917 
Recall: 0.784 
F1 Score: 0.845
Micro-Average F1 Score: 0.990


**MULTILAYER PERCEPTRON (MLP)**

In [24]:
# MULTILAYER PERCEPTRON (MLP)
cMLP = MLPClassifier(solver='adam', learning_rate_init=0.001, max_iter=500).fit(X_train.values,y_train.values)
y_preds_MLP = cMLP.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_MLP)

print("Multilayer Perceptron (MLP)")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_MLP, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

Multilayer Perceptron (MLP)
Precision: 0.780 
Recall: 0.453 
F1 Score: 0.573
Micro-Average F1 Score: 0.976


**XGBOOST (XGB)**

In [25]:
# XGBOOST (XGB)
cXGB = xgb.XGBClassifier(objective="multi:softmax", num_class=2, random_state=42)
cXGB.fit(X_train.values, y_train.values)
y_preds_XGB = cXGB.predict(X_test.values)
prec,rec,f1,num = precision_recall_fscore_support(y_test.values, y_preds_XGB)

print("XGBOOST")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f"%(prec[1],rec[1],f1[1]))
micro_f1 = f1_score(y_test, y_preds_XGB, average='micro')
print("Micro-Average F1 Score: %.3f"%(micro_f1))

XGBOOST
Precision: 0.893 
Recall: 0.808 
F1 Score: 0.848
Micro-Average F1 Score: 0.989


**LIGHTGBM (LGBM)**

In [26]:
# LIGHTGBM (LGBM)
import lightgbm as lgb
from sklearn.metrics import precision_recall_fscore_support, f1_score

# Initialize classifier with class_weight BEFORE fitting
cLGBM = lgb.LGBMClassifier(n_estimators=50, class_weight='balanced')
cLGBM.fit(X_train.values, y_train.values)

# Predict
y_preds_LGBM = cLGBM.predict(X_test.values)

# Compute metrics
prec, rec, f1, num = precision_recall_fscore_support(y_test.values, y_preds_LGBM)

print("LightGBM")
print("Precision: %.3f \nRecall: %.3f \nF1 Score: %.3f" % (prec[1], rec[1], f1[1]))

micro_f1 = f1_score(y_test, y_preds_LGBM, average='micro')
print("Micro-Average F1 Score: %.3f" % (micro_f1))


[LightGBM] [Info] Number of positive: 11377, number of negative: 174370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11922
[LightGBM] [Info] Number of data points in the train set: 185747, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000



X does not have valid feature names, but LGBMClassifier was fitted with feature names



LightGBM
Precision: 0.384 
Recall: 0.919 
F1 Score: 0.542
Micro-Average F1 Score: 0.944
