# Address Acquisition: Real-Time Mempool Deployment Datasets

## Import Required Libraries

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
from etherscan import Etherscan

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Keys, ActionChains
import time
import random 

In [None]:
#Etherscan API Key
#'API_KEY' is a placeholder string; substitute a real key before running the cells that call the API
ETHERSCAN_API_KEY = 'API_KEY'
#Client object used by the later cells for every Etherscan API request
eth = Etherscan(ETHERSCAN_API_KEY)

## Illicit Addresses

### Import Prior Datasets with Illicit Ethereum Addresses

In [None]:
#Import Datasets/Files

#Each local CSV is a saved copy of a public dataset:
#  'Github Farrugia.csv' <- https://github.com/sfarrugia15/Ethereum_Fraud_Detection/blob/master/Account_Stats/Complete.csv
#  'Kaggle Aliyev.csv'   <- https://www.kaggle.com/datasets/vagifa/ethereum-frauddetection-dataset
#  'Kaggle Escobero.csv' <- https://www.kaggle.com/datasets/gescobero/ethereum-fraud-dataset
#The files are read in the order listed and unpacked into one DataFrame per source
farrugia, aliyev, escobero = [
    pd.read_csv(csv_name)
    for csv_name in ('Github Farrugia.csv', 'Kaggle Aliyev.csv', 'Kaggle Escobero.csv')
]

In [None]:
#Drop Legal Addresses from Datasets
#Keep only the rows whose flag column equals 1 (the column is 'FLAG' in two files and 'flag' in the third)
farrugia_illicit = farrugia.loc[farrugia['FLAG'] == 1]
aliyev_illict = aliyev.loc[aliyev['FLAG'] == 1]
escobero_illicit = escobero.loc[escobero['flag'] == 1]

#Isolate Addresses
#Pull the address column out of each filtered frame as a plain Python list
farrugia_addresses = farrugia_illicit['Address'].tolist()
aliyev_addresses = aliyev_illict['Address'].tolist()
escobero_addresses = escobero_illicit['address'].tolist()

### Scrape OFAC SDN List Text File

In [None]:
#https://www.treasury.gov/ofac/downloads/sdnlist.txt
#Load every line of the saved SDN list; the context manager closes the file handle afterwards
with open('ofac_sdnlist.txt','r') as ofac:
    lines = ofac.readlines()

#Extract Ethereum Addresses
#An entry is the 'ETH ' tag (4 characters) followed by a 0x address (42 characters): 46 characters in total
eth_tag = 'ETH 0x'
address_lines = [line for line in lines if 'ETH' in line]

#Anchor on the full 'ETH 0x' tag rather than on the first 'ETH' substring, so a line that
#also contains 'ETH' inside another word still yields its address
clean_addresses = []
for line in address_lines:
    index = line.find(eth_tag)
    if index != -1:
        clean_addresses.append(line[index:index+46])

#Strip the leading 'ETH ' tag
clean_addresses_no_eth = [entry[4:] for entry in clean_addresses]

#Lower-case before de-duplicating so the same address in different letter case collapses to one row
clean_df = pd.DataFrame(clean_addresses_no_eth, columns=['Address'])
clean_df['Address'] = clean_df['Address'].str.lower()
clean_df = clean_df.drop_duplicates()
ofac_addresses = clean_df['Address'].tolist()

#Identify & Remove Addresses with Only Internal Transactions
start_block = 1
end_block = 18000000
zero_ex_tx = []
for address in ofac_addresses:
    try:
        eth.get_normal_txs_by_address(address, start_block, end_block, 'asc')
    except Exception:
        #NOTE(review): any failure of the call marks the address as having no normal
        #transactions - presumably this includes the API's "no transactions" response;
        #confirm that rate-limit/network errors should be treated the same way
        zero_ex_tx.append(address)

#Set lookup keeps the filter linear in the number of addresses
zero_ex_lookup = set(zero_ex_tx)
ofac_ex_accounts = [address for address in ofac_addresses if address not in zero_ex_lookup]

#ofac_ex_accounts

### Scrape CryptoScamDB Text File

In [None]:
#https://github.com/CryptoScamDB/EtherScamDB/blob/master/_data/scams.yaml
#Load every line of the saved YAML file as raw text; the context manager closes the file handle afterwards
with open('scams.yaml','r') as file:
    lines = file.readlines()

#Extract Ethereum Addresses
address_lines = [line for line in lines if '0x' in line]

#Fixed window of 42 characters (columns 11-52), the length of a 0x-prefixed address
#NOTE(review): assumes the address always starts at column 11 of the line - confirm against the file layout
trimmed = [line[11:53] for line in address_lines]

#Keep only the windows that really start with the 0x prefix
cryptoscam_only_0x = [candidate for candidate in trimmed if candidate[0:2]=='0x']

#Identify & Remove Addresses with Only Internal Transactions
start_block = 1
end_block = 18000000
zero_ex_tx = []
for address in cryptoscam_only_0x:
    try:
        eth.get_normal_txs_by_address(address, start_block, end_block, 'asc')
    except Exception:
        #NOTE(review): any failure of the call marks the address as having no normal
        #transactions - presumably this includes the API's "no transactions" response;
        #confirm that rate-limit/network errors should be treated the same way
        zero_ex_tx.append(address)

#Set lookup keeps the filter linear in the number of addresses
zero_ex_lookup = set(zero_ex_tx)
#The original loop appended to an undefined name (etherscam_ex_accounts) and raised NameError
cryptoscam_ex_accounts = [address for address in cryptoscam_only_0x if address not in zero_ex_lookup]

#cryptoscam_ex_accounts

### Webscrape Etherscan Website for Fake_Phishing Tagged Addresses

In [None]:
def random_sleep_time():
    """Sleep for a random duration drawn uniformly from 2-4 seconds, rounded to 4 decimals."""
    r_num = np.random.uniform(2,4)
    r_num = round(r_num, 4)
    time.sleep(r_num)

#XPath of the search input on the Etherscan landing page
SEARCH_BOX_XPATH = '/html/body/main/section[1]/div/div/div[1]/form/div/div[2]/input[1]'
#XPath of the element whose text is collected from the page opened by the search
ADDRESS_TEXT_XPATH = '/html/body/main/section[1]/div/div[1]/div/div[2]/span/span'

#Iteratively Scrapes Fake Phishing Tags
driver = webdriver.Chrome()

driver.get('https://etherscan.io/')
time.sleep(3)
etherscan_scam_addresses = []
driver.execute_script('return navigator.userAgent')

for i in range(2,5):
    #Type the tag 'Fake_Phishing<i>' into the search box and submit it
    driver.find_element(By.XPATH, SEARCH_BOX_XPATH).click()
    random_sleep_time()
    driver.find_element(By.XPATH, SEARCH_BOX_XPATH).send_keys('Fake_Phishing'+str(i))
    random_sleep_time()
    #send_keys presses and releases ENTER; key_down alone would leave the key held down
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    try:
        text = driver.find_element(By.XPATH, ADDRESS_TEXT_XPATH).text
    except Exception:
        #Best effort: a tag whose page does not expose the expected element is skipped
        pass
    else:
        etherscan_scam_addresses.append(text)
    #Whether or not an address was found, pause and return to the landing page
    random_sleep_time()
    driver.back()
    random_sleep_time()
    #Empty the search box so the next tag starts from a clean input
    driver.find_element(By.XPATH, SEARCH_BOX_XPATH).click()
    random_sleep_time()
    driver.find_element(By.XPATH, SEARCH_BOX_XPATH).clear()
    random_sleep_time()

#etherscan_scam_addresses

### Combine Illicit Addresses Together 

In [None]:
#Gather the six illicit sources, lower-case every address and de-duplicate through a set
illicit_sources = (
    farrugia_addresses,
    aliyev_addresses,
    escobero_addresses,
    ofac_ex_accounts,
    cryptoscam_ex_accounts,
    etherscan_scam_addresses,
)
all_illicit_addresses = list({address.lower() for source in illicit_sources for address in source})
#NOTE(review): a list has no to_csv; the later cell reads column '0' of this file, which
#presumably was written from a single-column DataFrame - confirm before enabling
#pd.DataFrame(all_illicit_addresses).to_csv('all_illicit_addresses.csv')

## Legal Addresses: Randomly Sampled 

In [None]:
%%time
#Get Randomly Sampled Legal Addresses

block_range = range(17999900, 18000000, 1)
random_sample_accounts = []

for i in block_range:
    try:
        block = eth.get_proxy_block_by_number(tag = hex(i))
        block_txs = block['transactions']
        for i in block_txs:
            random_sample_accounts.append(i['from'])
            random_sample_accounts.append(i['to'])
    except:
        continue

#Take random sample of acquired accounts (number of addresses acquired would make dataset imbalanced, must trim to match illicit accounts)        
random_sample_accounts = random.sample(random_sample_accounts, 15000)
random_sample_accounts = [i.lower() for i in random_sample_accounts]

In [None]:
#Check to make sure there is no overlap with illicit addresses
all_illicit = pd.read_csv('all_illicit_addresses.csv')
all_illicit_list = all_illicit['0'].tolist()

#Build the lookup set once; testing membership against the list would rescan it for every sampled account
all_illicit_lookup = set(all_illicit_list)
accounts_no_overlap = [account for account in random_sample_accounts if account not in all_illicit_lookup]

#Convert to Dataframe and Drop Duplicates
random_sample_addresses = accounts_no_overlap
random_sample_addresses_df = pd.DataFrame(random_sample_addresses, columns = ['Address'])
random_sample_addresses_df = random_sample_addresses_df.drop_duplicates()
#random_sample_addresses_df.to_csv('random_sample_addresses.csv')

## Legal Addresses: Addresses with Coinbase Interaction

In [None]:
#Accounts with Coinbase Interaction: https://etherscan.io/accounts/label/coinbase?subcatid=undefined&size=50&start=0&col=1&order=asc
#Mixed-case addresses are lower-cased in a single pass so they can be compared with lower-case addresses
#NOTE(review): the first two entries look like the Beacon Deposit Contract and WETH rather than Coinbase wallets - confirm
coinbase_address_list = list(map(str.lower, (
    '0x00000000219ab540356cBB839Cbe05303d7705Fa',
    '0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2',
    '0x71660c4005BA85c37ccec55d0C4493E66Fe775d3',
    '0x503828976D22510aad0201ac7EC88293211D23Da',
    '0xddfAbCdc4D8FfC6d5beaf154f18B778f892A0740',
    '0x3cD751E6b0078Be393132286c442345e5DC49699',
    '0xb5d85CBf7cB3EE0D56b3bB207D5Fc4B82f43F511',
    '0xeB2629a2734e272Bcc07BDA959863f316F4bD4Cf',
    '0xD688AEA8f7d450909AdE10C47FaA95707b0682d9',
    '0x02466E547BFDAb679fC49e96bBfc62B9747D997C',
    '0x6b76F8B1e9E59913BfE758821887311bA1805cAB',
    '0xA9D1e08C7793af67e9d92fe308d5697FB81d3E43',
    '0x77696bb39917C91A0c3908D577d5e322095425cA',
    '0x7c195D981AbFdC3DDecd2ca0Fed0958430488e34',
    '0x95A9bd206aE52C4BA8EecFc93d18EACDd41C88CC',
    '0xb739D0895772DBB71A89A3754A160269068f0D45',
    '0xf6874c88757721a02f47592140905c4336DfBc61',
    '0x881D4032abe4188e2237eFCD27aB435E81FC6bb1',
    '0x6c8dd0e9cC58c07429e065178d88444B60e60b80',
    '0xBc8Ec259E3026aE0D87bc442D034d6882ce4a35C',
    '0x02d24cAB4f2c3Bf6e6EB07ea07e45F96baccFfE7',
    '0xCe352e98934499be70F641353f16A47D9E1E3aBd',
    '0x90E18a6920985DBACc3d76Cf27a3F2131923C720',
    '0x4B23d52eFf7C67F5992C2aB6D3f69b13a6a33561',
    '0xd2276aF80582CAc230EDC4c42e9a9C096F3C09AA',
    '0xA090e606E30bD747d4E6245a1517EbE430F0057e',
)))

In [None]:
%%time
#Utilize Accounts Interacting with Coinbase As Legal
#Get Legal ETH Accounts: Etherscan API

block_range = range(17995000,18000000,1)
coinbase_interaction_addresses = []
for i in block_range:
    try:
        block = eth.get_proxy_block_by_number(tag = hex(i))
        block_txs = block['transactions']
        for i in block_txs:
            if i['from'] in coinbase_address_list:
                coinbase_interaction_addresses.append(i['to'])
            if i['to'] in coinbase_address_list:
                coinbase_interaction_addresses.append(i['from'])
    except:
        continue
        
#Convert to Dataframe and Drop Duplicates
coinbase_df = pd.DataFrame(coinbase_interaction_addresses, columns = ['Address'])
coinbase_df = coinbase_df.drop_duplicates()
#coinbase_df.to_csv('legal_coinbase_addresses.csv')