# Bitcoin Ransomware Dataset
See https://archive.ics.uci.edu/ml/datasets/BitcoinHeistRansomwareAddressDataset for dataset information and feature descriptions.

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import csv
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
from scipy import stats
import pickle
import operator
import glob
from scipy.io.arff import loadarff 
from scipy.stats import ttest_rel

import seaborn as sns; sns.set_style('white')

from sklearn.utils import resample
from sklearn.metrics import accuracy_score, plot_confusion_matrix, f1_score, plot_roc_curve, roc_auc_score, make_scorer
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('BitCoinHeistData.csv')
df = df.rename(columns={'count':'cnt'})
df

Unnamed: 0,address,year,day,length,weight,cnt,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.500000,1,0,1,1.780427e+08,white


In [3]:
# 1 = ransomware; 0 = whitelist
df['new_label'] = np.where(df['label']=='white', 0, 1)
df = df.drop(columns=['year', 'day', 'address', 'label'])
df

Unnamed: 0,length,weight,cnt,looped,neighbors,income,new_label
0,18,0.008333,1,0,2,1.000500e+08,1
1,44,0.000244,1,0,1,1.000000e+08,1
2,0,1.000000,1,0,2,2.000000e+08,1
3,72,0.003906,1,0,2,7.120000e+07,1
4,144,0.072848,456,0,1,2.000000e+08,1
...,...,...,...,...,...,...,...
2916692,0,0.111111,1,0,1,1.255809e+09,0
2916693,0,1.000000,1,0,1,4.409699e+07,0
2916694,2,12.000000,6,6,35,2.398267e+09,0
2916695,0,0.500000,1,0,1,1.780427e+08,0


In [4]:
df['new_label'].value_counts()

0    2875284
1      41413
Name: new_label, dtype: int64

In [5]:
# Separate majority and minority classes
df_majority = df[df.new_label==0]
df_minority = df[df.new_label==1]

# Downsample majority and minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=6000,     # to match minority class
                                 random_state=123) # reproducible results

df_minority_downsampled = resample(df_minority, 
                                 replace=False,    # sample without replacement
                                 n_samples=6000,     # to match majority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority_downsampled])
 
# Display new class counts
df_downsampled.new_label.value_counts()

1    6000
0    6000
Name: new_label, dtype: int64

In [6]:
df_downsampled

Unnamed: 0,length,weight,cnt,looped,neighbors,income,new_label
2488416,2,1.000000,1,0,2,6.089024e+08,0
2873770,2,0.500000,1,0,2,3.969468e+08,0
2537321,50,0.022457,6,0,2,1.228130e+08,0
2868318,144,0.077579,4825,0,2,2.000000e+08,0
636908,0,1.000000,1,0,2,5.605605e+09,0
...,...,...,...,...,...,...,...
23778,2,0.500000,1,0,1,3.000000e+08,1
6969,0,0.500000,1,0,2,1.220000e+08,1
36643,0,1.000000,1,0,1,5.000000e+07,1
31367,4,1.588816,4,0,8,8.068000e+08,1


In [7]:
df = df_downsampled.copy()
scaler = StandardScaler()
X, y = df.iloc[:,0:6].to_numpy(), df.iloc[:,6:].to_numpy()

## Hyperparameter Search & Experimentation

In [11]:
def experiment():
    pipeline1 = Pipeline((
    ('clf', RandomForestClassifier()),
    ))

    pipeline2 = Pipeline((
    ('clf', KNeighborsClassifier()),
    ))

    pipeline3 = Pipeline((
    ('clf', AdaBoostClassifier()),
    ))
    
    pipeline4 = Pipeline((
    ('clf', LogisticRegression()),
    ))
    
    pipeline5 = Pipeline((
    ('clf', MLPClassifier()),
    ))
    
    # Random Forest
    parameters1 = {
    'clf__n_estimators': [1024],
    'clf__max_features': [1, 2, 4, 6, 8, 12, 16, 20]
    }

    # KNN
    parameters2 = {
    'clf__n_neighbors': [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,65,69,73,77,81,85,89,93,97,101,105],
    'clf__weights': ['uniform', 'distance']
    }
    
    # AdaBoost (Boosted Decision Tree)
    parameters3 = {
        'clf__algorithm': ['SAMME.R'],
        'clf__n_estimators': [2,4,8,16,32,64,128,256,512,1024,2048],
        'clf__learning_rate': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 2e1, 5e1]
    }
    
    # Logistic
    parameters4 = {
    'clf__penalty':['l1', 'l2', None],
    'clf__C':[1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1e0, 1e1, 1e2, 1e3, 1e4],
    'clf__max_iter':[5000]
    }
    
    # Multi-layer Perceptron
    parameters5 = {
        'clf__hidden_layer_sizes':[(1,), (2,), (4,), (8,), (32,), (128,)],
        'clf__solver':['sgd'],
        'clf__activation':['relu'], 
        'clf__learning_rate':['constant', 'invscaling'], 
        'clf__learning_rate_init': [1e-3, 1e-2, 1e-1, 1e0],
        'clf__max_iter': [2, 4, 8, 16, 32, 64, 128, 256, 512]
    }
    
    pars = [parameters1, parameters2, parameters3, parameters4, parameters5]
    pips = [pipeline1, pipeline2, pipeline3, pipeline4, pipeline5]
    
    # List of dictionaries to hold the scores of the various metrics for each type of classifier
    best_clf_list = []
    trial_storage = {}
    training_storage = {}
    
    print("starting Gridsearch")
    for i in range(len(pars)):
        trial_averages = []
        train_performance = []
        for t in range(5):
            # split and scale data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/6, random_state=t)
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=t)
            X_train = scaler.fit_transform(X_train)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
                
            clf = GridSearchCV(pips[i], pars[i], refit=False, n_jobs=8, cv=5, verbose=3, scoring=('accuracy', 'roc_auc', 'f1'))
            clf = clf.fit(X_val, y_val)
            
            print("finished Gridsearch trial " + str(t + 1) + " classifier " + str(i + 1))
            print("")
            print("")
            
            # find the best params for each metric in a given trial 
            best_index_acc = np.argmin(clf.cv_results_['rank_test_accuracy'])
            best_params_acc = clf.cv_results_['params'][best_index_acc]
            best_index_roc = np.argmin(clf.cv_results_['rank_test_roc_auc'])
            best_params_roc = clf.cv_results_['params'][best_index_roc]
            best_index_f1 = np.argmin(clf.cv_results_['rank_test_f1'])
            best_params_f1 = clf.cv_results_['params'][best_index_f1]
    
            # train and test models for given metric with their corresponding best parameter settings
            pipe = pips[i]
            clf_acc = pipe.set_params(**best_params_acc)
            clf_acc = clf_acc.fit(X_train, y_train)
            clf_roc = pipe.set_params(**best_params_roc)
            clf_roc = clf_roc.fit(X_train, y_train)
            clf_f1 = pipe.set_params(**best_params_f1)
            clf_f1 = clf_f1.fit(X_train, y_train)
            
            # get training set performance
            train_acc = accuracy_score(y_train, clf_acc.predict(X_train))
            train_roc = roc_auc_score(y_train, clf_roc.predict_proba(X_train)[:, 1])
            train_f1 = f1_score(y_train, clf_f1.predict(X_train))
            
            train_performance.append({
                'Model #': i + 1,
                'average': (train_f1 + train_acc + train_roc)/3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1 score': train_f1
            })
            
            # get test set performances 
            trial_acc = clf_acc.score(X_test, y_test)
            trial_roc = roc_auc_score(y_test, clf_roc.predict_proba(X_test)[:, 1])
            trial_f1 = f1_score(y_test, clf_f1.predict(X_test))
            
            # store scores and their averages in list containing averages for each trial
            trial_averages.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(trial_acc + trial_roc + trial_f1) / 3,
                'accuracy': trial_acc,
                'roc_auc_score': trial_roc,
                'f1_score': trial_f1
            })
            
            train_performance.append({
                'Model #': i + 1, # model number corresponds to the numbers used in pipeline above (i.e. 1 = Random Forest)
                'average':(train_acc + train_roc + train_f1) / 3,
                'accuracy': train_acc,
                'roc_auc_score': train_roc,
                'f1_score': train_f1
            })
            
        # find the trial with the best average metric scores and append those scores as a dict to best clf list
        max_average = 0
        for trial in trial_averages:
            if trial['average'] > max_average:
                max_average = trial['average']
                best_trial = trial

        best_clf_list.append(best_trial)
        training_storage[str(i + 1)]=train_performance
        trial_storage[str(i + 1)]=trial_averages
    
    return best_clf_list, trial_storage, training_storage

In [12]:
%%capture --no-stdout --no-display
best_clf_list, trial_storage, training_perf = experiment()

starting Gridsearch
Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 1 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 2 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 3 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 4 classifier 1


Fitting 5 folds for each of 8 candidates, totalling 40 fits
finished Gridsearch trial 5 classifier 1


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 1 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 2 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 3 classifier 2


Fitting 5 folds for each of 54 candidates, totalling 270 fits
finished Gridsearch trial 4 classifier 2


Fitting 5 folds for each of 54 candidates, to

## Calculating and Organizing Results

In [14]:
print('Best Models On Average For Test Set:')
for element in best_clf_list:
    print(element)

print()

print('Train Set Data')
for i in range(len(training_perf)):
    print(training_perf[str(i + 1)])

print()

alg_avg = {}
alg_acc = {}
alg_roc = {}
alg_f1 = {}

for i in range(len(trial_storage)):
    alg_avg[str(i + 1)]=[]
    alg_acc[str(i + 1)]=[]
    alg_roc[str(i + 1)]=[]
    alg_f1[str(i + 1)]=[]
    for entry in trial_storage[str(i + 1)]:
        alg_avg[str(i + 1)].append(entry['average'])
        alg_acc[str(i + 1)].append(entry['accuracy'])
        alg_roc[str(i + 1)].append(entry['roc_auc_score'])
        alg_f1[str(i + 1)].append(entry['f1_score'])

print('set of averages of algorithms over 5 trials:')
print(alg_avg)
print()
print('set of acc values of algorithms over 5 trials')
print(alg_acc)
print()
print('set of roc values of algorithms over 5 trials')
print(alg_roc)
print()
print('set of f1 values of algorithms over 5 trials')
print(alg_f1)

Best Models On Average For Test Set:
{'Model #': 1, 'average': 0.7397841712938319, 'accuracy': 0.7155, 'roc_auc_score': 0.779130694820054, 'f1_score': 0.7247218190614417}
{'Model #': 2, 'average': 0.6740052911361613, 'accuracy': 0.654, 'roc_auc_score': 0.7072315596829936, 'f1_score': 0.6607843137254902}
{'Model #': 3, 'average': 0.730361630281657, 'accuracy': 0.711, 'roc_auc_score': 0.7731883391208333, 'f1_score': 0.7068965517241379}
{'Model #': 4, 'average': 0.6369932174616385, 'accuracy': 0.597, 'roc_auc_score': 0.6237413741374138, 'f1_score': 0.690238278247502}
{'Model #': 5, 'average': 0.6760133373701054, 'accuracy': 0.65, 'roc_auc_score': 0.712968241775388, 'f1_score': 0.6650717703349281}

Train Set Data
[{'Model #': 1, 'average': 0.9896706077254435, 'accuracy': 0.986, 'roc_auc_score': 0.9968458152711918, 'f1 score': 0.9861660079051385}, {'Model #': 1, 'average': 0.9896706077254435, 'accuracy': 0.986, 'roc_auc_score': 0.9968458152711918, 'f1_score': 0.9861660079051385}, {'Model #'

In [15]:
# calculate average acc metric scores per algorithm over 5 trials
alg_acc_averages = {}
for i in range(len(alg_acc)):
    alg_acc_averages[str(i + 1)] = sum(alg_acc[str(i + 1)])/5

print(alg_acc_averages)

{'1': 0.7061, '2': 0.6407999999999999, '3': 0.6930000000000001, '4': 0.5826, '5': 0.6154}


In [16]:
# calculate average roc metric scores per algorithm over 5 trials
alg_roc_averages = {}
for i in range(len(alg_roc)):
    alg_roc_averages[str(i + 1)] = sum(alg_roc[str(i + 1)])/5

print(alg_roc_averages)

{'1': 0.7792694678872223, '2': 0.695448361523707, '3': 0.7595458295633938, '4': 0.6011590839600045, '5': 0.6709114484834557}


In [17]:
# calculate average f1 metric scores per algorithm over 5 trials
alg_f1_averages = {}
for i in range(len(alg_f1)):
    alg_f1_averages[str(i + 1)] = sum(alg_f1[str(i + 1)])/5

print(alg_f1_averages)

{'1': 0.7090258973514149, '2': 0.6609152023258265, '3': 0.686832230029941, '4': 0.6701586780927427, '5': 0.6310440889349851}


In [18]:
averages = {}
for i in range(len(alg_acc_averages)):
    averages[str(i + 1)] = (alg_acc_averages[str(i + 1)] + alg_roc_averages[str(i + 1)] + alg_f1_averages[str(i + 1)])/3

print(averages)

{'1': 0.7314651217462124, '2': 0.6657211879498445, '3': 0.7131260198644448, '4': 0.6179725873509158, '5': 0.6391185124728136}


In [19]:
# t-test best against rest mean of metrics (RF against rest)
combined_metrics_1 = []
combined_metrics_2 = []
combined_metrics_3 = []
combined_metrics_4 = []
combined_metrics_5 = []

for item in alg_acc['1']:
    combined_metrics_1.append(item)
for item in alg_roc['1']:
    combined_metrics_1.append(item)
for item in alg_f1['1']:
    combined_metrics_1.append(item)
    
for item in alg_acc['2']:
    combined_metrics_2.append(item)
for item in alg_roc['2']:
    combined_metrics_2.append(item)
for item in alg_f1['2']:
    combined_metrics_2.append(item)

for item in alg_acc['3']:
    combined_metrics_3.append(item)
for item in alg_roc['3']:
    combined_metrics_3.append(item)
for item in alg_f1['3']:
    combined_metrics_3.append(item)
    
for item in alg_acc['4']:
    combined_metrics_4.append(item)
for item in alg_roc['4']:
    combined_metrics_4.append(item)
for item in alg_f1['4']:
    combined_metrics_4.append(item)

for item in alg_acc['5']:
    combined_metrics_5.append(item)
for item in alg_roc['5']:
    combined_metrics_5.append(item)
for item in alg_f1['5']:
    combined_metrics_5.append(item)
    
print(ttest_rel(combined_metrics_1, combined_metrics_2))
print(ttest_rel(combined_metrics_1, combined_metrics_3))
print(ttest_rel(combined_metrics_1, combined_metrics_4))
print(ttest_rel(combined_metrics_1, combined_metrics_5))

Ttest_relResult(statistic=13.536554913411994, pvalue=1.9634473848890334e-09)
Ttest_relResult(statistic=3.626745938180636, pvalue=0.002748710807162354)
Ttest_relResult(statistic=7.091129109155958, pvalue=5.411836315428835e-06)
Ttest_relResult(statistic=8.681118334491316, pvalue=5.224431932512865e-07)
