# **Ethereum wallet fraud detector**

---

Juli Sahun Montejano

Victor Pla Sanchis

### Problem


---
This notebook contains all the data processing and model training needed to predict whether an Ethereum wallet is fraudulent or normal.

### Dataset description


---
The dataset we are using contains 12000 rows and the following columns:

* address : string
* flag : boolean
* min/max/avgTimeBetweeSentTnx : float
* min/max/avgTimeBetweeRecTnx : float
* lifetime : float
* sentTransactions : float
* receivedTransactions : float
* createdContracts : float
* numUniqSentAddress : float
* min/max/avgValReceived : float
* totalTransactions : float
* totalEtherSent : float
* totalEtherReceived : float
* totalEtherSentContracts : float
* totalEtherBalance : float
* activityDays : float
* dailyMax : float
* ratioRecSent : float
* ratioSentTotal : float
* ratioRecTotal : float
* giniSent : float
* giniRec : float
* txFreq : float
* stdBalanceEth : float



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import sklearn
import yellowbrick
import skopt

In [None]:
# Report the installed version of each library next to the version the
# notebook was developed with.
version_lines = [
    f'Pandas version: {pd.__version__}  \nRecomended verion: 1.5.1 \n',
    f'Numpy version: {np.__version__} \nRecomended verion: 1.23.4 \n',
    f'Scikit-learn version: {sklearn.__version__} \nRecomended verion: 1.2.0 \n',
    f'Yellowbrick version: {yellowbrick.__version__} \nRecomended verion: 1.5 \n',
    f'Scikit-optimize version: {skopt.__version__} \nRecomended verion: 0.9.0 \n',
    f'Matplotlib version: {mpl.__version__} \nRecomended verion: 3.6.0 \n',
]
for version_line in version_lines:
    print(version_line)

In [None]:
# When True, every model is trained in this run; when False, load_state()
# restores previously pickled variables from the store/ directory instead.
train_models = False 

# When True, the final cell pickles the data splits, result tables and fitted
# models into store/ so that later runs can reuse them.
save_models = False 

# Number of cross-validation folds passed as cv= to GridSearchCV and BayesSearchCV.
cv = 5

In [None]:
import lime.lime_tabular
from IPython.display import display, HTML
from sklearn.metrics import  ConfusionMatrixDisplay,\
                  classification_report,  RocCurveDisplay, \
                    accuracy_score, f1_score, precision_score, recall_score

def show_html(html):
    """Render the given HTML string as rich output in the notebook."""
    return display(HTML(html))

def save_results(clf, X_test, y_test, nclf, df):
    """Store the test-set metrics of one classifier as a row of ``df``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : feature matrix the predictions are computed on.
    y_test : true labels matching ``X_test``.
    nclf : row label under which the scores are stored.
    df : pandas DataFrame collecting the scores; it is modified in place.

    Returns
    -------
    The same ``df`` with the row ``nclf`` filled in.
    """
    # Predict once and reuse the result for all four metrics instead of
    # running the model four times over the test set.
    y_pred = clf.predict(X_test)
    df.loc[nclf, 'test acc'] = accuracy_score(y_test, y_pred)
    # average='binary' scores the positive class only; the column names
    # label that class as "Fraudulent".
    df.loc[nclf, 'precision score (Fraudulent)'] = precision_score(y_test, y_pred, average='binary')
    df.loc[nclf, 'recall score (Fraudulent)'] = recall_score(y_test, y_pred, average='binary')
    df.loc[nclf, 'f1 score (Fraudulent)'] = f1_score(y_test, y_pred, average='binary')
    return df

def displayConfusionMatrixAndRocCurve(estimator, X, y):
    """Draw the confusion matrix (left) and ROC curve (right) of ``estimator`` on ``X``/``y``."""
    _, (matrix_ax, roc_ax) = plt.subplots(1, 2, figsize=(15, 5))
    ConfusionMatrixDisplay.from_estimator(
        estimator, X, y, display_labels=target_names, ax=matrix_ax
    )
    RocCurveDisplay.from_estimator(estimator, X, y, ax=roc_ax, pos_label=1)
    plt.tight_layout()

def weights(estimator):
    """Plot a heatmap of the rounded absolute coefficients of the best estimator.

    ``estimator`` is a fitted search object exposing ``best_estimator_.coef_``;
    the heatmap columns are labelled with the training feature names.
    """
    coef_frame = pd.DataFrame(estimator.best_estimator_.coef_)
    coef_frame.columns = X_train.columns
    magnitudes = coef_frame.abs().round()
    plt.figure(figsize=(25, 2))
    sns.heatmap(
        magnitudes,
        annot=True,
        cbar=True,
        cmap='Blues',
        linewidths=.5,
        annot_kws={"size": 12},
        fmt='g',
    )

def explainer(estimator, X, y, i = 0):
    """Show a LIME explanation for row ``i`` of ``X`` in the notebook.

    Parameters
    ----------
    estimator : fitted model exposing ``predict_proba``.
    X : feature matrix, either a pandas DataFrame or a NumPy array. It is used
        both to build the LIME explainer and to pick the explained instance.
    y : unused; kept so existing calls keep working.
    i : positional index of the row of ``X`` to explain.
    """
    if isinstance(X, pd.DataFrame):
        feature_names = list(X.columns)
        X = X.to_numpy()
    else:
        # Features are every column of ``data`` after the first one, the same
        # slice that builds X (``data.iloc[:, 1:]``). The previous
        # ``data.columns[:-1]`` included 'flag' and dropped the last feature,
        # shifting every label by one.
        feature_names = list(data.columns[1:])
    model_explainer = lime.lime_tabular.LimeTabularExplainer(
        X,
        feature_names=feature_names,
        class_names=target_names,
        verbose=True,
        mode='classification',
    )
    exp = model_explainer.explain_instance(X[i], estimator.predict_proba, num_features=6)
    exp.show_in_notebook(show_table=True)

# Score tables for the two model families; save_results fills them row by row.
results_df_lineal, results_df_no_lineal = pd.DataFrame(), pd.DataFrame()
# Display names of the two classes, used by the reports, plots and LIME explainer.
target_names = ['Normal', 'Fraudulent']

In [None]:
import pickle
import glob
import os

def save_state(variables):
    """Pickle each entry of ``variables`` to ``store/<name>.pickle``.

    Parameters
    ----------
    variables : dict mapping a variable name to the object to persist.
    """
    # Create the target directory on first use; without this the first
    # open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs('store', exist_ok=True)
    for name, value in variables.items():
        with open(f'store/{name}.pickle', 'wb') as f:
            pickle.dump(value, f)
def load_state():
    """Load every ``store/*.pickle`` file into this module's global namespace.

    Each file ``store/<name>.pickle`` becomes a global variable called
    ``<name>``. Files in ``store/`` without the ``.pickle`` suffix are ignored.
    """
    variables = {}
    for filename in glob.iglob('store/*.pickle'):
        # Derive the variable name from the file name itself rather than from
        # a fixed-width slice of the path.
        name = os.path.splitext(os.path.basename(filename))[0]
        with open(filename, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load store/
            # contents that were produced by save_state on a trusted machine.
            variables[name] = pickle.load(f)
    globals().update(variables)

def clear_state():
    """Delete every entry matched by the pattern ``store/*``."""
    stale_paths = glob.glob('store/*')
    for stale_path in stale_paths:
        os.remove(stale_path)
    

In [None]:
# When training is disabled, restore the previously pickled variables
# (load_state reads them from store/ and injects them as globals).
if not train_models:
    load_state()

### Dataset load and analysis

---

In [None]:
#https://www.kaggle.com/datasets/gescobero/ethereum-fraud-dataset
data = pd.read_csv("./data/eth_illicit_features.csv")
print("Filas:",data.shape[0])
print("Columnas:",data.shape[1])

In [None]:
data.head()

In [None]:
data.drop(columns=['address'], inplace=True)
data.describe(include='all').T

In [None]:
X= data.iloc[:,1:]
y= data.loc[:,'flag'].copy()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train_n = scaler.fit_transform(X_train)
X_test_n = scaler.transform(X_test)

In [None]:
# One histogram per scaled training feature, drawn on an 11x3 grid of axes.
fig, axes = plt.subplots(11,3,figsize=(15,50))
X_train_nr = X_train_n.round(3)
X_test_nr = X_test_n.round(3)
X_train_frame = pd.DataFrame(X_train_nr, columns=X.columns)
flat_axes = axes.reshape(-1)
# Every column comes from the MinMaxScaler output and is therefore numeric,
# so the former object-dtype countplot branch could never run and was removed.
for i, c in enumerate(X.columns):
    sns.histplot(x=c, data=X_train_frame, ax=flat_axes[i])
plt.tight_layout()

In [None]:
corr = X_train.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=mask, cmap='seismic',  center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
from yellowbrick.target.feature_correlation import feature_correlation
plt.figure(figsize=(10,8))
visualizer = feature_correlation(X_train, y_train, labels=list(X_train.columns),method='mutual_info-classification');

In [None]:
from sklearn.decomposition import PCA
pca = PCA().fit(X_train_n)

In [None]:
fig = plt.figure(figsize=(8,6));
plt.plot(range(1,len(pca.explained_variance_ratio_ )+1),pca.explained_variance_ratio_ ,alpha=0.8,marker='.',label="Variancia Explicada");
y_label = plt.ylabel('Variancia explicada');
x_label = plt.xlabel('Componentes');
plt.plot(range(1,len(pca.explained_variance_ratio_ )+1),
         np.cumsum(pca.explained_variance_ratio_),
         c='red',marker='.',
         label="Variancia explicada acumulativa");
plt.legend();
plt.title('Porcentaje de variancia explicada por componente');

In [None]:
# Project the training data onto the first two principal components.
# The PCA was fitted on the scaled matrix X_train_n, so the same scaled
# matrix must be transformed here (the unscaled X_train was used before).
X_trans = pca.transform(X_train_n)
plt.figure(figsize=(8,8));
sns.scatterplot(x=X_trans[:,0], y=X_trans[:,1], hue=y_train)

### Linear Models

#### LDA


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
if train_models:
    lda = LinearDiscriminantAnalysis()
print(np.mean(cross_val_score(lda,X_train_n,y_train,cv=10)))
lda.fit(X_train, y_train)

In [None]:
print(classification_report(lda.predict(X_test_n), y_test,target_names=target_names))
results_df_lineal = save_results(lda, X_test_n, y_test, 'LDA', results_df_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(lda, X_test, y_test)

In [None]:
coefs = pd.DataFrame(lda.coef_, columns=X.columns)
coefs.columns = X.columns
plt.figure(figsize=(20,2))
sns.heatmap(coefs.abs().round(), annot=True, cbar=True, cmap='Blues', linewidths=.5, annot_kws={"size": 12})

In [None]:
explainer(lda, X_test, y_test, 5)

#### Linear SVC

In [None]:
from sklearn.svm import LinearSVC

# Baseline: mean 10-fold CV accuracy of a LinearSVC with default hyperparameters.
lsvc = LinearSVC(max_iter=25000)
lsvc_cv_scores = cross_val_score(lsvc, X_train_n, y_train, cv=10)
print(np.mean(lsvc_cv_scores))

In [None]:
if train_models:
    c_values = 10**np.linspace(-3,3,101)
    # LinearSVC does not support penalty='l1' with loss='hinge', and
    # penalty='l1' with loss='squared_hinge' requires dual=False. The grid is
    # split so every candidate is a valid configuration instead of producing
    # failed fits for all l1 candidates.
    param = [
        {'C': c_values, 'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge']},
        {'C': c_values, 'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False]},
    ]

    lsvc_gs = GridSearchCV(lsvc, param, cv=cv, n_jobs=-1, refit=True)
    lsvc_gs.fit(X_train_n, y_train);
# Five best-ranked hyperparameter combinations.
show_html(pd.DataFrame(lsvc_gs.cv_results_).loc[:,['params','mean_test_score','rank_test_score']].sort_values(by='rank_test_score').head().to_html())

In [None]:
print(classification_report(lsvc_gs.predict(X_test_n),y_test,target_names=target_names))
results_df_lineal = save_results(lsvc_gs, X_test_n, y_test, 'LSVC', results_df_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(lsvc_gs, X_test_n, y_test)

In [None]:
weights(lsvc_gs)

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# explainer() calls predict_proba, so the best LinearSVC is wrapped in a
# calibrated classifier before being explained.
lsvc_gs_c = CalibratedClassifierCV(lsvc_gs.best_estimator_)
lsvc_gs_c.fit(X_train_n, y_train)
explainer(lsvc_gs_c, X_test_n, y_test, 0)

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: mean 10-fold CV accuracy of a KNN classifier with default hyperparameters.
knn = KNeighborsClassifier()
knn_cv_scores = cross_val_score(knn, X_train_n, y_train, cv=10)
print(np.mean(knn_cv_scores))

In [None]:
if train_models:
    # Exhaustive search over neighbourhood size, vote weighting, leaf size and metric.
    param = dict(
        n_neighbors=[5, 10, 15, 20, 25, 30],
        weights=['uniform', 'distance'],
        leaf_size=[20, 30, 40, 50],
        metric=['l2', 'l1', 'cosine'],
    )
    knn_gs = GridSearchCV(knn, param, cv=cv, n_jobs=-1)
    knn_gs.fit(X_train_n, y_train)
# Five best-ranked hyperparameter combinations.
knn_ranking = pd.DataFrame(knn_gs.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
show_html(knn_ranking.sort_values(by='rank_test_score').head().to_html())

In [None]:
print(classification_report(knn_gs.predict(X_test_n), y_test,target_names=target_names))
results_df_lineal = save_results(knn_gs, X_test_n, y_test, 'KNN', results_df_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(knn_gs, X_test_n, y_test)

In [None]:
explainer(knn_gs, X_test_n, y_test, 5)

#### Results

In [None]:
# Models of the first family ordered from highest to lowest test accuracy.
results_df_lineal.sort_values('test acc', ascending=False)

### Non Linear models

#### SVC

In [None]:
from sklearn.svm import SVC

# Baseline: mean 10-fold CV accuracy of an SVC with default hyperparameters.
svc = SVC()
svc_cv_scores = cross_val_score(svc, X_train_n, y_train, cv=10)
print(np.mean(svc_cv_scores))

In [None]:
if train_models:
    # 101 values of C spaced evenly in log10 between 1e-3 and 1e3.
    param = {'C': 10 ** np.linspace(-3, 3, 101)}

    # probability=True so the fitted model exposes predict_proba for explainer().
    svc = SVC(max_iter=250000, random_state=0, probability=True)
    svc_gs = GridSearchCV(svc, param, cv=cv, n_jobs=-1, refit=True)
    svc_gs.fit(X_train_n, y_train)
# Five best-ranked hyperparameter combinations.
svc_ranking = pd.DataFrame(svc_gs.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
show_html(svc_ranking.sort_values(by='rank_test_score').head().to_html())

In [None]:
print(classification_report(svc_gs.predict(X_test_n), y_test,target_names=target_names))
results_df_no_lineal = save_results(svc_gs, X_test_n, y_test, 'SVC', results_df_no_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(svc_gs, X_test_n, y_test)

In [None]:
explainer(svc_gs, X_test, y_test, 5)

#### MLP

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline: mean 10-fold CV accuracy of an MLP with early stopping enabled.
mlp = MLPClassifier(max_iter=1000, early_stopping=True, random_state=10)
mlp_cv_scores = cross_val_score(mlp, X_train_n, y_train, cv=10)
print(np.mean(mlp_cv_scores))

In [None]:
if train_models:
    # Exhaustive search over hidden-layer width, activation and initial learning rate.
    param = dict(
        hidden_layer_sizes=[1, 25, 50, 75, 100, 125, 150],
        activation=['logistic', 'relu', 'tanh', 'identity'],
        learning_rate_init=[0.0001, 0.001, 0.01, 0.1],
    )
    mlp_gs = GridSearchCV(mlp, param, cv=cv, n_jobs=-1, refit=True)
    mlp_gs.fit(X_train_n, y_train)
# Five best-ranked hyperparameter combinations.
mlp_ranking = pd.DataFrame(mlp_gs.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]
show_html(mlp_ranking.sort_values(by='rank_test_score').head().to_html())

In [None]:
print(classification_report(mlp_gs.predict(X_test_n), y_test,target_names=target_names))
results_df_no_lineal = save_results(mlp_gs, X_test_n, y_test, 'MPL', results_df_no_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(mlp_gs, X_test_n, y_test)

In [None]:
explainer(mlp_gs, X_test_n, y_test, 5)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: mean 10-fold CV accuracy of a random forest with default hyperparameters.
rf = RandomForestClassifier()
rf_cv_scores = cross_val_score(rf, X_train_n, y_train, cv=10)
print(np.mean(rf_cv_scores))

In [None]:
if train_models:
    # min_samples_split must be an integer >= 2 (or a fraction in (0, 1]).
    # The former candidate 1 is outside that range: depending on the
    # scikit-learn version it is rejected or silently behaves like 2, so it
    # has been removed from the search space.
    # NOTE(review): some max_features candidates may exceed the number of
    # features in the dataset - confirm they are intended.
    param = {'n_estimators':[10,100, 150, 200],
            'min_samples_split': [2,4,32, 64],
            'min_samples_leaf': [1, 2,4,32],
            'max_features': [1,2,32, 64, 128]
            }

    rf_bs =  BayesSearchCV(rf,param,n_iter=15, cv=cv, n_jobs=-1, refit=True, random_state=0)
    rf_bs.fit(X_train_n, y_train);
# Five best-ranked hyperparameter combinations.
show_html(pd.DataFrame(rf_bs.cv_results_).loc[:,['params', 'mean_test_score','rank_test_score']].sort_values(by='rank_test_score').head().to_html())

In [None]:
print(classification_report(rf_bs.predict(X_test_n), y_test,target_names=target_names))
results_df_no_lineal = save_results(rf_bs, X_test_n, y_test, 'RF', results_df_no_lineal)

In [None]:
displayConfusionMatrixAndRocCurve(rf_bs, X_test_n, y_test)

In [None]:
explainer(rf_bs, X_test_n, y_test, 5)

#### Results

In [None]:
# Models of the second family ordered from highest to lowest test accuracy.
results_df_no_lineal.sort_values('test acc', ascending=False)

### Results

In [None]:
# Both result tables stacked row-wise and ordered from highest to lowest test accuracy.
pd.concat([results_df_lineal, results_df_no_lineal]).sort_values('test acc', ascending=False)

In [None]:
if save_models:
    # Everything listed here is pickled to store/<name>.pickle by save_state
    # and restored as a global of the same name by load_state.
    # The dictionary is no longer called ``vars`` so that it does not shadow
    # the Python builtin of that name.
    state = {
        'results_df_lineal': results_df_lineal,
        'results_df_no_lineal': results_df_no_lineal,
        'data': data,
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'X_train_n': X_train_n,
        'X_test_n': X_test_n,
        'lda': lda,
        'lsvc_gs': lsvc_gs,
        'knn_gs': knn_gs,
        'svc_gs': svc_gs,
        'mlp_gs': mlp_gs,
        'rf_bs': rf_bs,
        # add more data here
    }
    save_state(state)