In [24]:
import pandas as pd
import dask.dataframe as dd
import requests
import json

In [25]:
gcs_path = 'gs://safu-hk/eth_features/wallets_*'
etherscamdb_uri = 'https://etherscamdb.info/api/addresses/'
myetherwallet_uri = 'https://raw.githubusercontent.com/MyEtherWallet/ethereum-lists/master/src/addresses/addresses-darklist.json'

In [63]:
etherscamdb_data = json.loads(requests.get(etherscamdb_uri).text)
myetherwallet_data = json.loads(requests.get(myetherwallet_uri).text)

etherscamdb_addresses = [x.lower() for x in scams['result'].keys() if len(x)==42]
myetherwallet_addresses = [x['address'].lower() for x in myetherwallet_data if x.get('address')]

scam_addresses = list(set(etherscamdb_addresses).union(set(myetherwallet_addresses)))
pd.Series(scam_addresses).to_csv('known_scams.csv')
len(scam_addresses)

2107


In [15]:
tx_dd = dd.read_csv(gcs_path, compression='gzip', assume_missing=True)

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  "Setting ``blocksize=None``" % compression)


In [5]:
tx = tx_dd.compute().set_index('wallet').fillna(0)

In [55]:
tx.index.name = 'wallet'

In [47]:
scam_address_dummy = pd.DataFrame(scam_addresses).set_index(0)
scam_address_dummy.shape

(2107, 0)

In [54]:
scams_df = tx.join(scam_address_dummy, how='inner')
scams_df.shape

(1598, 33)

In [49]:
n_non_scams = scams_df.shape[0]
sampled_non_scams = tx.sample(n_non_scams)

In [50]:
# this set should be empty!
set(sampled_non_scams.index).intersection(set(scams_df.index))

set()

In [51]:
scams_df['is_scam'] = 1.0
sampled_non_scams['is_scam'] = 0.0

In [53]:
scams_df.shape, sampled_non_scams.shape

((1598, 34), (1598, 34))

In [52]:
df = pd.concat([scams_df, sampled_non_scams])
df.shape

(3196, 34)

In [44]:
df.loc[df.index.dropna()].to_csv('labeled_dataset.csv')

In [13]:
print(pd.io.sql.get_schema(df.reset_index(), 'eth_features'))

CREATE TABLE "eth_features" (
"wallet" TEXT,
  "num_tx_erc20" REAL,
  "num_in_tx_erc20" REAL,
  "num_distinct_tokens_in" REAL,
  "num_distinct_wallets_in_erc20" REAL,
  "num_distinct_blocks_in_erc20" REAL,
  "num_out_tx_erc20" REAL,
  "num_distinct_tokens_out" REAL,
  "num_distinct_wallets_out_erc20" REAL,
  "num_distinct_blocks_out_erc20" REAL,
  "min_block_number_in_erc20" REAL,
  "max_block_number_in_erc20" REAL,
  "min_block_number_out_erc20" REAL,
  "max_block_number_out_erc20" REAL,
  "num_tx_eth" REAL,
  "balance_eth" REAL,
  "num_in_tx" REAL,
  "num_distinct_wallets_in" REAL,
  "vol_in" REAL,
  "mean_in" REAL,
  "min_in" REAL,
  "max_in" REAL,
  "num_distinct_blocks_in" REAL,
  "num_out_tx" REAL,
  "num_distinct_wallets_out" REAL,
  "vol_out" REAL,
  "mean_out" REAL,
  "min_out" REAL,
  "max_out" REAL,
  "num_distinct_blocks_out" REAL,
  "min_block_number_in_eth" REAL,
  "max_block_number_in_eth" REAL,
  "min_block_number_out_eth" REAL,
  "max_block_number_out_eth" REAL,
  "is_

In [19]:
diff_vectors = df.groupby('is_scam').mean().T

In [22]:
(diff_vectors[1] / diff_vectors[0]).sort_values(ascending=False)

num_in_tx                         22.305838
num_distinct_blocks_in            17.941418
num_tx_eth                        12.700806
num_distinct_wallets_in           11.537239
num_distinct_tokens_out            3.384306
min_out                            3.262335
num_distinct_wallets_out           2.384679
num_in_tx_erc20                    2.144213
num_tx_erc20                       2.139798
num_distinct_blocks_out_erc20      2.136811
num_out_tx_erc20                   2.126955
num_distinct_blocks_in_erc20       2.124339
num_distinct_blocks_out            1.946425
num_out_tx                         1.933924
num_distinct_wallets_in_erc20      1.866410
num_distinct_tokens_in             1.698324
min_block_number_in_eth            1.418978
max_block_number_in_eth            1.406579
min_block_number_out_eth           1.377159
max_block_number_out_eth           1.360278
max_block_number_in_erc20          0.828697
min_block_number_in_erc20          0.809769
num_distinct_wallets_out_erc20  

In [56]:
tx.to_csv('all_wallets.csv')

In [61]:
df.groupby('is_scam').median().T

is_scam,0.0,1.0
num_tx_erc20,1.0,0.0
num_in_tx_erc20,1.0,0.0
num_distinct_tokens_in,1.0,0.0
num_distinct_wallets_in_erc20,1.0,0.0
num_distinct_blocks_in_erc20,1.0,0.0
num_out_tx_erc20,0.0,0.0
num_distinct_tokens_out,0.0,0.0
num_distinct_wallets_out_erc20,0.0,0.0
num_distinct_blocks_out_erc20,0.0,0.0
min_block_number_in_erc20,4236436.0,0.0
